diff --git a/.github/ISSUE_TEMPLATE/release.md b/.github/ISSUE_TEMPLATE/release.md
index c8eccd90f8..70f9760cf5 100644
--- a/.github/ISSUE_TEMPLATE/release.md
+++ b/.github/ISSUE_TEMPLATE/release.md
@@ -13,7 +13,7 @@ assignees: ''
     - At this point, changes must be cherry-picked into the release branch in
       order for them to be included in the release.
 - [ ] MM/DD/YYYY - Release Candidate 1 (begin Ask Mode[^1] for release branch).
-    - At this point, changes must be approved by @microsoft/hlsl-release
+    - At this point, cherry-picked changes must be approved by @microsoft/hlsl-release
 - [ ] MM/DD/YYYY - Final Release Candidate
 - [ ] MM/DD/YYYY - Target Release Date
 
@@ -26,7 +26,7 @@ This part of the release process is to 'prime the pump' - that is to make sure
 that all the various parts of the engineering system are set into place so that
 we are confident we can generate builds for the new branch
 
-- [ ] Update version number
+- [ ] Update version numbers in utils/version/latest-release.json and utils/version/version.inc
 - [ ] Create the release branch from `main`
     - The release branch is kept into sync with main via regular fast-forward
       merges.
@@ -39,7 +39,7 @@ we are confident we can generate builds for the new branch
 
 ## After Fork
 
-- [ ] Update README.md
+- [ ] Update README.md if necessary
 - [ ] Create draft of Release post on GitHub
 
 ## Quality Sign Off
diff --git a/README.md b/README.md
index a06ee5ab97..35c0132068 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,8 @@ At the moment, the DirectX HLSL Compiler provides the following components:
 
 - dxilconv.dll, a DLL providing a converter from DXBC (older shader bytecode format)
 
+- dxv.exe, a command-line tool that validates DXIL IR (compiled HLSL programs). 
+
 - various other tools based on the above components
 
 The Microsoft Windows SDK releases include a supported version of the compiler and validator.
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 027616765c..33c5349f9e 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -1,6 +1,10 @@
 trigger:
   - main
   - release*
+
+pr: 
+  - main
+  - release*
   
 resources:
 - repo: self
@@ -9,7 +13,7 @@ stages:
 - stage: Build
   jobs:
   - job: Windows
-    timeoutInMinutes: 90
+    timeoutInMinutes: 120
 
     pool:
       vmImage: windows-2022
@@ -38,9 +42,13 @@ stages:
         call utils\hct\hctstart.cmd %HLSL_SRC_DIR% %HLSL_BLD_DIR%
         call utils\hct\hcttest.cmd -$(configuration) noexec
       displayName: 'DXIL Tests'
+    - script: |
+        call utils\hct\hctstart.cmd %HLSL_SRC_DIR% %HLSL_BLD_DIR%
+        call utils\hct\hcttest.cmd -$(configuration) exec-warp
+      displayName: 'DXIL Execution Tests (Nuget WARP)'
 
   - job: Nix
-    timeoutInMinutes: 90
+    timeoutInMinutes: 120
 
     variables:
       macOS: macOS-latest
diff --git a/cmake/modules/Nuget.cmake b/cmake/modules/Nuget.cmake
new file mode 100644
index 0000000000..5c8aaea337
--- /dev/null
+++ b/cmake/modules/Nuget.cmake
@@ -0,0 +1,233 @@
+include_guard(GLOBAL)
+
+# Required inputs: the BINARY_DIR and BUILD_TYPE variables and the USE_WARP_FROM_NUGET environment variable.
+# FATAL_ERROR stops processing at once so the code below never runs with an undefined path or release type.
+if(NOT DEFINED BINARY_DIR)
+    message(FATAL_ERROR "Callers must provide BINARY_DIR")
+endif()
+if(NOT DEFINED BUILD_TYPE)
+    message(FATAL_ERROR "Callers must provide BUILD_TYPE")
+endif()
+if(NOT DEFINED ENV{USE_WARP_FROM_NUGET})
+    message(FATAL_ERROR "Callers must set a string value for the environment variable USE_WARP_FROM_NUGET."
+            " Either 'LATEST_RELEASE' or 'LATEST_PREVIEW'")
+endif()
+
+set(USE_WARP_FROM_NUGET "$ENV{USE_WARP_FROM_NUGET}")
+
+# Downloads the latest nuget.exe to target_path unless a file already exists there; aborts if the download fails.
+function(EnsureNugetExists target_path)
+    if(NOT EXISTS "${target_path}")
+        message(STATUS "Installing nuget.exe to ${target_path}...")
+        file(DOWNLOAD https://dist.nuget.org/win-x86-commandline/latest/nuget.exe "${target_path}" STATUS dl_status)
+        if(NOT "${dl_status}" MATCHES "^0;")
+            file(REMOVE "${target_path}")  # a partial file would make later runs skip the download
+            message(FATAL_ERROR "Failed to download nuget.exe: ${dl_status}")
+        endif()
+    endif()
+endfunction()
+
+# Looks up the newest version of NuGet package ID (cached after the first lookup) and stores it in the variable
+# named by OUTPUT_VARIABLE. Optional: SOURCE (feed, defaults to the public nuget.org feed), OUTPUT_DIR (where
+# nuget.exe is kept, defaults to ${BINARY_DIR}/temp) and PREVIEW (ON adds '-Prerelease' to the query).
+function(GetNuGetPackageLatestVersion)
+    set(params NAME ID SOURCE OUTPUT_DIR OUTPUT_VARIABLE PREVIEW)
+    cmake_parse_arguments(PARSE_ARGV 0 ARG "" "${params}" "")
+
+    if(NOT ARG_OUTPUT_DIR)
+        set(ARG_OUTPUT_DIR "${BINARY_DIR}/temp")
+    endif()
+
+    # Same location FetchNuGetPackage uses, so nuget.exe is downloaded once and shared.
+    set(nuget_exe_path "${ARG_OUTPUT_DIR}/nuget.exe")
+    EnsureNugetExists("${nuget_exe_path}")
+
+    if(${ARG_ID}_LATEST_VERSION)
+        set(${ARG_OUTPUT_VARIABLE} ${${ARG_ID}_LATEST_VERSION} PARENT_SCOPE)
+    else()
+        if(NOT ARG_SOURCE)
+            set(ARG_SOURCE https://api.nuget.org/v3/index.json)
+        endif()
+
+        # Start from an empty flag so a 'prerelease' variable in the caller's scope cannot leak into the command.
+        set(prerelease "")
+        if(ARG_PREVIEW)
+            # Note that '-Prerelease' options will only return a prerelease package if that is also the latest.
+            # If you want a prerelease package with an older version number than the latest release package then you
+            # need to pass a specific version number.
+            message("Will add '-Prerelease' to nuget list command")
+            set(prerelease "-Prerelease")
+        endif()
+
+        execute_process(
+            COMMAND "${nuget_exe_path}"
+            list ${ARG_ID}
+            -Source ${ARG_SOURCE}
+            ${prerelease}
+            RESULT_VARIABLE result
+            OUTPUT_VARIABLE nuget_list_output
+            OUTPUT_STRIP_TRAILING_WHITESPACE
+        )
+
+        if(NOT "${result}" STREQUAL "0")
+            message(FATAL_ERROR "NuGet failed to find latest version of package ${ARG_ID} with exit code ${result}.")
+        endif()
+
+        # Get last line of running nuget.exe list <ID>. Quoted so that empty output cannot break the call.
+        string(REPLACE "\n" ";" nuget_list_output "${nuget_list_output}")
+        list(POP_BACK nuget_list_output nuget_list_last_line)
+        if(nuget_list_last_line STREQUAL "No packages found.")
+            message(FATAL_ERROR "NuGet failed to find latest version of package ${ARG_ID}.")
+        endif()
+
+        # The last line should have the format <ID> <VERSION>
+        string(REPLACE " " ";" nuget_list_last_line "${nuget_list_last_line}")
+        list(POP_BACK nuget_list_last_line nuget_version)
+
+        if(NOT nuget_version)
+            message(FATAL_ERROR "NuGet failed to find latest version of package ${ARG_ID}.")
+        endif()
+
+        message("Nuget found version:${nuget_version} for ${ARG_ID}")
+
+        # Return the version and cache it so later lookups of the same package skip the nuget query.
+        set(${ARG_OUTPUT_VARIABLE} ${nuget_version} PARENT_SCOPE)
+        set(${ARG_ID}_LATEST_VERSION ${nuget_version} CACHE INTERNAL "")
+    endif()
+endfunction()
+
+# Installs a NuGet package under OUTPUT_DIR (default: ${BINARY_DIR}/temp), retrying failed downloads.
+#
+# FetchNuGetPackage(
+#     ID Microsoft.Direct3D.WARP
+#     VERSION 1.0.13
+#     SOURCE https://api.nuget.org/v3/index.json
+# )
+#
+# VERSION defaults to the latest version on the feed; RELEASE_TYPE LATEST_PREVIEW makes that lookup include
+# prerelease versions. On return <name>_SOURCE_DIR (Microsoft.Direct3D.WARP_SOURCE_DIR in the example above)
+# is set in the caller's scope to the directory holding the extracted NuGet package contents.
+function(FetchNuGetPackage)
+    set(params NAME ID VERSION SOURCE OUTPUT_DIR RELEASE_TYPE)
+    cmake_parse_arguments(PARSE_ARGV 0 ARG "" "${params}" "")
+
+    # The NAME parameter is optional: if it's not set then the package ID is used as the name. The
+    # reason for having a separate NAME is to allow a consistent identifier for packages whose ID
+    # changes with each release (e.g. GDK).
+    if(NOT ARG_NAME)
+        set(ARG_NAME ${ARG_ID})
+    endif()
+
+    if(NOT ARG_OUTPUT_DIR)
+        set(ARG_OUTPUT_DIR ${BINARY_DIR}/temp)
+    endif()
+
+    set(nuget_exe_path ${ARG_OUTPUT_DIR}/nuget.exe)
+
+    if(NOT ARG_SOURCE)
+        set(ARG_SOURCE https://api.nuget.org/v3/index.json)
+    endif()
+
+    if(NOT ARG_RELEASE_TYPE)
+        set(ARG_RELEASE_TYPE "LATEST_RELEASE")
+    endif()
+
+    # Only LATEST_PREVIEW opts in to prerelease packages; any other value means release packages only.
+    set(PREVIEW OFF)
+    if("${ARG_RELEASE_TYPE}" STREQUAL "LATEST_PREVIEW")
+        set(PREVIEW ON)
+    endif()
+
+    # Default to latest version
+    if(NOT ARG_VERSION)
+        GetNuGetPackageLatestVersion(
+            ID ${ARG_ID}
+            SOURCE ${ARG_SOURCE}
+            PREVIEW ${PREVIEW}
+            OUTPUT_DIR ${ARG_OUTPUT_DIR}
+            OUTPUT_VARIABLE ARG_VERSION
+        )
+    endif()
+
+    set(nupkg_path ${ARG_OUTPUT_DIR}/${ARG_ID}.${ARG_VERSION}/${ARG_ID}.${ARG_VERSION}.nupkg)
+
+    # The .nupkg file doubles as the "already installed" marker, so repeated runs skip the download.
+    if(NOT EXISTS ${nupkg_path})
+        message(STATUS "NuGet: adding package ${ARG_ID}.${ARG_VERSION}")
+
+        EnsureNugetExists(${nuget_exe_path})
+
+        set(retry_count 0)
+        set(max_retries 10)
+        set(retry_delay 10)
+        set(result 1)
+
+        # Run NuGet CLI to download the package, making at most max_retries attempts.
+        while(NOT "${result}" STREQUAL "0" AND ${retry_count} LESS ${max_retries})
+            message(STATUS "'${nuget_exe_path}' install '${ARG_ID}' -Version '${ARG_VERSION}' -Source '${ARG_SOURCE}' -OutputDirectory '${ARG_OUTPUT_DIR}' -DependencyVersion Ignore -Verbosity quiet")
+            execute_process(
+                COMMAND
+                ${nuget_exe_path}
+                install ${ARG_ID}
+                -Version ${ARG_VERSION}
+                -Source ${ARG_SOURCE}
+                -OutputDirectory ${ARG_OUTPUT_DIR}
+                -DependencyVersion Ignore
+                -Verbosity quiet
+                RESULT_VARIABLE result
+            )
+            if(NOT "${result}" STREQUAL "0")
+                math(EXPR retry_count "${retry_count} + 1")
+                # Wait before the next attempt, but not after the final one: nothing is left to retry.
+                if(retry_count LESS max_retries)
+                    message(STATUS "Nuget failed: '${result}'. Retrying in ${retry_delay} seconds...")
+                    execute_process(COMMAND ${CMAKE_COMMAND} -E sleep ${retry_delay})
+                endif()
+            endif()
+        endwhile()
+
+        if(NOT "${result}" STREQUAL "0")
+            message(FATAL_ERROR "NuGet failed: '${result}' Package  '${ARG_NAME}' (${ARG_ID}.${ARG_VERSION})")
+        endif()
+    endif()
+
+    # Set the output variable under NAME (the package ID when NAME is not given) so that callers of packages
+    # whose IDs change (e.g. GDK) can use a fixed name in dependents. Example, targets can reference
+    # gdk_SOURCE_DIR with the snippet below instead of having to reference Microsoft.GDK.PC.220300_SOURCE_DIR.
+    #
+    # FetchNuGetPackage(
+    #     NAME gdk
+    #     ID Microsoft.GDK.PC.220300
+    #     VERSION 10.0.22621.3049
+    # )
+    set(${ARG_NAME}_SOURCE_DIR ${ARG_OUTPUT_DIR}/${ARG_ID}.${ARG_VERSION} PARENT_SCOPE)
+endfunction()
+
+# Begin the 'main' logic of this file. Previous code is all definitions.
+message("USE_WARP_FROM_NUGET: ${USE_WARP_FROM_NUGET}")
+if("${USE_WARP_FROM_NUGET}" STREQUAL "LATEST_RELEASE" OR "${USE_WARP_FROM_NUGET}" STREQUAL "LATEST_PREVIEW")
+
+  message("Fetching warp from nuget")
+
+  FetchNuGetPackage(ID Microsoft.Direct3D.WARP OUTPUT_DIR ${BINARY_DIR}/temp RELEASE_TYPE ${USE_WARP_FROM_NUGET})
+
+  # Map the processor name to the architecture directory used in the package path below.
+  if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "AMD64")
+    set(ARCH "x64")
+  elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "X86")
+    set(ARCH "win32")
+  elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "ARM64")
+    set(ARCH "arm64")
+  else()
+    message(FATAL_ERROR "Unsupported CMAKE_SYSTEM_PROCESSOR '${CMAKE_SYSTEM_PROCESSOR}' for the WARP package")
+  endif()
+
+  set(WARP_SOURCE_PATH "${Microsoft.Direct3D.WARP_SOURCE_DIR}/build/native/bin/${ARCH}")
+  set(WARP_DEST_PATH "${BINARY_DIR}/${BUILD_TYPE}/bin/")
+  message("Copying d3d10warp.dll and d3d10warp.pdb \n"
+           "  from:  ${WARP_SOURCE_PATH}\n"
+           "  to: ${WARP_DEST_PATH}")
+  # Copy the WARP DLL and its PDB from the extracted package into the build's bin directory.
+  file(COPY "${WARP_SOURCE_PATH}/d3d10warp.dll" "${WARP_SOURCE_PATH}/d3d10warp.pdb"
+       DESTINATION "${WARP_DEST_PATH}")
+endif()
diff --git a/docs/ReleaseNotes.md b/docs/ReleaseNotes.md
index cd7c7b874a..bd0b6d17e2 100644
--- a/docs/ReleaseNotes.md
+++ b/docs/ReleaseNotes.md
@@ -21,9 +21,12 @@ The included licenses apply to the following files:
 
 Place release notes for the upcoming release below this line and remove this line upon naming this release.
 
-- The incomplete WaveMatrix implementation has been removed.
-- DXIL Validator Hash is open sourced.
-- DXIL container validation for PSV0 part allows any content ordering inside string and semantic index tables.
+### Version 1.8.2502
+
+- The incomplete WaveMatrix implementation has been removed. [#6807](https://github.com/microsoft/DirectXShaderCompiler/pull/6807)
+- DXIL Validator Hash is open sourced. [#6846](https://github.com/microsoft/DirectXShaderCompiler/pull/6846)
+- DXIL container validation for PSV0 part allows any content ordering inside string and semantic index tables. [#6859](https://github.com/microsoft/DirectXShaderCompiler/pull/6859)
+- The and() and or() intrinsics will now accept non-integer parameters by casting them to bools. [#7060](https://github.com/microsoft/DirectXShaderCompiler/pull/7060)
 
 ### Version 1.8.2407
 
diff --git a/docs/SPIR-V.rst b/docs/SPIR-V.rst
index 5093dd0870..c30286e4e6 100644
--- a/docs/SPIR-V.rst
+++ b/docs/SPIR-V.rst
@@ -4145,7 +4145,7 @@ GL_EXT_spirv_intrinsics is an extension of GLSL that allows users to embed
 arbitrary SPIR-V instructions in the GLSL code similar to the concept of
 inline assembly in the C code. We support the HLSL version of
 GL_EXT_spirv_intrinsics. See
-`wiki <https://github.com/microsoft/DirectXShaderCompiler/wiki/GL_EXT_spirv_intrinsics-for-SPIR-V-code-gen>`_
+`wiki <https://github.com/microsoft/DirectXShaderCompiler/wiki/Inline-SPIR%E2%80%90V>`_
 for the details.
 
 Supported Command-line Options
diff --git a/include/dxc/DXIL/DxilConstants.h b/include/dxc/DXIL/DxilConstants.h
index 2449915754..aeb214f48d 100644
--- a/include/dxc/DXIL/DxilConstants.h
+++ b/include/dxc/DXIL/DxilConstants.h
@@ -29,7 +29,7 @@ namespace DXIL {
 const unsigned kDxilMajor = 1;
 /* <py::lines('VALRULE-TEXT')>hctdb_instrhelp.get_dxil_version_minor()</py>*/
 // VALRULE-TEXT:BEGIN
-const unsigned kDxilMinor = 8;
+const unsigned kDxilMinor = 9;
 // VALRULE-TEXT:END
 
 inline unsigned MakeDxilVersion(unsigned DxilMajor, unsigned DxilMinor) {
diff --git a/include/dxc/DXIL/DxilShaderModel.h b/include/dxc/DXIL/DxilShaderModel.h
index 293797ba12..c5c5d66884 100644
--- a/include/dxc/DXIL/DxilShaderModel.h
+++ b/include/dxc/DXIL/DxilShaderModel.h
@@ -26,15 +26,24 @@ class ShaderModel {
 public:
   using Kind = DXIL::ShaderKind;
 
-  // Major/Minor version of highest shader model
+  // Major/Minor version of highest recognized shader model
   // clang-format off
   // Python lines need to be not formatted.
   /* <py::lines('VALRULE-TEXT')>hctdb_instrhelp.get_highest_shader_model()</py>*/
   // clang-format on
   // VALRULE-TEXT:BEGIN
   static const unsigned kHighestMajor = 6;
-  static const unsigned kHighestMinor = 8;
+  static const unsigned kHighestMinor = 9;
   // VALRULE-TEXT:END
+
+  // Major/Minor version of highest released shader model
+  /* <py::lines('VALRULE-TEXT')>hctdb_instrhelp.get_highest_released_shader_model()</py>*/
+  // clang-format on
+  // VALRULE-TEXT:BEGIN
+  static const unsigned kHighestReleasedMajor = 6;
+  static const unsigned kHighestReleasedMinor = 8;
+  // VALRULE-TEXT:END
+
   static const unsigned kOfflineMinor = 0xF;
 
   bool IsPS() const { return m_Kind == Kind::Pixel; }
@@ -74,6 +83,7 @@ class ShaderModel {
   bool IsSM66Plus() const { return IsSMAtLeast(6, 6); }
   bool IsSM67Plus() const { return IsSMAtLeast(6, 7); }
   bool IsSM68Plus() const { return IsSMAtLeast(6, 8); }
+  bool IsSM69Plus() const { return IsSMAtLeast(6, 9); }
   // VALRULE-TEXT:END
   const char *GetName() const { return m_pszName; }
   const char *GetKindName() const;
@@ -85,6 +95,8 @@ class ShaderModel {
   static const ShaderModel *Get(Kind Kind, unsigned Major, unsigned Minor);
   static const ShaderModel *GetByName(llvm::StringRef Name);
   static const char *GetKindName(Kind kind);
+  static bool IsPreReleaseShaderModel(int Major, int Minor);
+  static Kind GetKindFromName(llvm::StringRef Name);
   static DXIL::ShaderKind KindFromFullName(llvm::StringRef Name);
   static const llvm::StringRef FullNameFromKind(DXIL::ShaderKind sk);
   static const char *GetNodeLaunchTypeName(DXIL::NodeLaunchType launchTy);
@@ -121,7 +133,7 @@ class ShaderModel {
               bool m_bTypedUavs, unsigned m_UAVRegsLim);
   /* <py::lines('VALRULE-TEXT')>hctdb_instrhelp.get_num_shader_models()</py>*/
   // VALRULE-TEXT:BEGIN
-  static const unsigned kNumShaderModels = 92;
+  static const unsigned kNumShaderModels = 101;
   // VALRULE-TEXT:END
   static const ShaderModel ms_ShaderModels[kNumShaderModels];
 
diff --git a/include/dxc/DxilContainer/DxilContainer.h b/include/dxc/DxilContainer/DxilContainer.h
index 80e6308430..8b7d85954b 100644
--- a/include/dxc/DxilContainer/DxilContainer.h
+++ b/include/dxc/DxilContainer/DxilContainer.h
@@ -36,6 +36,9 @@ struct DxilContainerHash {
   uint8_t Digest[DxilContainerHashSize];
 };
 
+static const DxilContainerHash PreviewByPassHash = {2, 2, 2, 2, 2, 2, 2, 2,
+                                                    2, 2, 2, 2, 2, 2, 2, 2};
+
 enum class DxilShaderHashFlags : uint32_t {
   None = 0,           // No flags defined.
   IncludesSource = 1, // This flag indicates that the shader hash was computed
diff --git a/include/dxc/HlslIntrinsicOp.h b/include/dxc/HlslIntrinsicOp.h
index c8ed2cbd2a..fcc9bb11b1 100644
--- a/include/dxc/HlslIntrinsicOp.h
+++ b/include/dxc/HlslIntrinsicOp.h
@@ -367,6 +367,7 @@ enum class IntrinsicOp {
   IOP_WavePrefixUSum,
   IOP_uabs,
   IOP_uclamp,
+  IOP_udot,
   IOP_ufirstbithigh,
   IOP_umad,
   IOP_umax,
@@ -391,6 +392,7 @@ inline bool HasUnsignedIntrinsicOpcode(IntrinsicOp opcode) {
   case IntrinsicOp::IOP_WavePrefixSum:
   case IntrinsicOp::IOP_abs:
   case IntrinsicOp::IOP_clamp:
+  case IntrinsicOp::IOP_dot:
   case IntrinsicOp::IOP_firstbithigh:
   case IntrinsicOp::IOP_mad:
   case IntrinsicOp::IOP_max:
@@ -432,6 +434,8 @@ inline unsigned GetUnsignedIntrinsicOpcode(IntrinsicOp opcode) {
     return static_cast<unsigned>(IntrinsicOp::IOP_uabs);
   case IntrinsicOp::IOP_clamp:
     return static_cast<unsigned>(IntrinsicOp::IOP_uclamp);
+  case IntrinsicOp::IOP_dot:
+    return static_cast<unsigned>(IntrinsicOp::IOP_udot);
   case IntrinsicOp::IOP_firstbithigh:
     return static_cast<unsigned>(IntrinsicOp::IOP_ufirstbithigh);
   case IntrinsicOp::IOP_mad:
diff --git a/include/dxc/Support/ErrorCodes.h b/include/dxc/Support/ErrorCodes.h
index 7a5830fe8f..5239c8118c 100644
--- a/include/dxc/Support/ErrorCodes.h
+++ b/include/dxc/Support/ErrorCodes.h
@@ -153,3 +153,8 @@
 // 0X80AA001E - External validator (DXIL.dll) required, and missing.
 #define DXC_E_VALIDATOR_MISSING                                                \
   DXC_MAKE_HRESULT(DXC_SEVERITY_ERROR, FACILITY_DXC, (0x001E))
+
+// 0X80AA001F - DXIL container Program Version mismatches Dxil module shader
+// model
+#define DXC_E_INCORRECT_PROGRAM_VERSION                                        \
+  DXC_MAKE_HRESULT(DXC_SEVERITY_ERROR, FACILITY_DXC, (0x001F))
diff --git a/include/dxc/Support/HLSLOptions.td b/include/dxc/Support/HLSLOptions.td
index b7bf232070..130e19a525 100644
--- a/include/dxc/Support/HLSLOptions.td
+++ b/include/dxc/Support/HLSLOptions.td
@@ -430,7 +430,7 @@ def fvk_bind_counter_heap : MultiArg<["-"], "fvk-bind-counter-heap", 2>, MetaVar
 def target_profile : JoinedOrSeparate<["-", "/"], "T">, Flags<[CoreOption]>, Group<hlslcomp_Group>, MetaVarName<"<profile>">,
   /* <py::lines('VALRULE-TEXT')>hctdb_instrhelp.get_target_profiles()</py>*/
   // VALRULE-TEXT:BEGIN
-  HelpText<"Set target profile. \n\t<profile>: ps_6_0, ps_6_1, ps_6_2, ps_6_3, ps_6_4, ps_6_5, ps_6_6, ps_6_7, ps_6_8, \n\t\t vs_6_0, vs_6_1, vs_6_2, vs_6_3, vs_6_4, vs_6_5, vs_6_6, vs_6_7, vs_6_8, \n\t\t gs_6_0, gs_6_1, gs_6_2, gs_6_3, gs_6_4, gs_6_5, gs_6_6, gs_6_7, gs_6_8, \n\t\t hs_6_0, hs_6_1, hs_6_2, hs_6_3, hs_6_4, hs_6_5, hs_6_6, hs_6_7, hs_6_8, \n\t\t ds_6_0, ds_6_1, ds_6_2, ds_6_3, ds_6_4, ds_6_5, ds_6_6, ds_6_7, ds_6_8, \n\t\t cs_6_0, cs_6_1, cs_6_2, cs_6_3, cs_6_4, cs_6_5, cs_6_6, cs_6_7, cs_6_8, \n\t\t lib_6_1, lib_6_2, lib_6_3, lib_6_4, lib_6_5, lib_6_6, lib_6_7, lib_6_8, \n\t\t ms_6_5, ms_6_6, ms_6_7, ms_6_8, \n\t\t as_6_5, as_6_6, as_6_7, as_6_8, \n\t\t ">;
+  HelpText<"Set target profile. \n\t<profile>: ps_6_0, ps_6_1, ps_6_2, ps_6_3, ps_6_4, ps_6_5, ps_6_6, ps_6_7, ps_6_8, ps_6_9, \n\t\t vs_6_0, vs_6_1, vs_6_2, vs_6_3, vs_6_4, vs_6_5, vs_6_6, vs_6_7, vs_6_8, vs_6_9, \n\t\t gs_6_0, gs_6_1, gs_6_2, gs_6_3, gs_6_4, gs_6_5, gs_6_6, gs_6_7, gs_6_8, gs_6_9, \n\t\t hs_6_0, hs_6_1, hs_6_2, hs_6_3, hs_6_4, hs_6_5, hs_6_6, hs_6_7, hs_6_8, hs_6_9, \n\t\t ds_6_0, ds_6_1, ds_6_2, ds_6_3, ds_6_4, ds_6_5, ds_6_6, ds_6_7, ds_6_8, ds_6_9, \n\t\t cs_6_0, cs_6_1, cs_6_2, cs_6_3, cs_6_4, cs_6_5, cs_6_6, cs_6_7, cs_6_8, cs_6_9, \n\t\t lib_6_1, lib_6_2, lib_6_3, lib_6_4, lib_6_5, lib_6_6, lib_6_7, lib_6_8, lib_6_9, \n\t\t ms_6_5, ms_6_6, ms_6_7, ms_6_8, ms_6_9, \n\t\t as_6_5, as_6_6, as_6_7, as_6_8, as_6_9, \n\t\t ">;
   // VALRULE-TEXT:END
 def entrypoint :  JoinedOrSeparate<["-", "/"], "E">, Flags<[CoreOption, RewriteOption]>, Group<hlslcomp_Group>,
   HelpText<"Entry point name">;
diff --git a/lib/Analysis/VectorUtils2.cpp b/lib/Analysis/VectorUtils2.cpp
index b8cac7a9a6..97bae15abe 100644
--- a/lib/Analysis/VectorUtils2.cpp
+++ b/lib/Analysis/VectorUtils2.cpp
@@ -42,6 +42,10 @@ llvm::Value *llvm::findScalarElement(llvm::Value *V, unsigned EltNo) {
     if (EltNo == IIElt)
       return III->getOperand(1);
 
+    // Guard against infinite loop on malformed, unreachable IR.
+    if (III == III->getOperand(0))
+      return nullptr;
+
     // Otherwise, the insertelement doesn't modify the value, recurse on its
     // vector input.
     return findScalarElement(III->getOperand(0), EltNo);
diff --git a/lib/DXIL/DxilShaderModel.cpp b/lib/DXIL/DxilShaderModel.cpp
index 22bf5f545b..861e8586e0 100644
--- a/lib/DXIL/DxilShaderModel.cpp
+++ b/lib/DXIL/DxilShaderModel.cpp
@@ -64,6 +64,7 @@ bool ShaderModel::IsValidForDxil() const {
     case 6:
     case 7:
     case 8:
+    case 9:
       // VALRULE-TEXT:END
       return true;
     case kOfflineMinor:
@@ -91,85 +92,94 @@ const ShaderModel *ShaderModel::Get(Kind Kind, unsigned Major, unsigned Minor) {
       {1542, 10},   // ps_6_6
       {1543, 11},   // ps_6_7
       {1544, 12},   // ps_6_8
-      {66560, 13},  // vs_4_0
-      {66561, 14},  // vs_4_1
-      {66816, 15},  // vs_5_0
-      {66817, 16},  // vs_5_1
-      {67072, 17},  // vs_6_0
-      {67073, 18},  // vs_6_1
-      {67074, 19},  // vs_6_2
-      {67075, 20},  // vs_6_3
-      {67076, 21},  // vs_6_4
-      {67077, 22},  // vs_6_5
-      {67078, 23},  // vs_6_6
-      {67079, 24},  // vs_6_7
-      {67080, 25},  // vs_6_8
-      {132096, 26}, // gs_4_0
-      {132097, 27}, // gs_4_1
-      {132352, 28}, // gs_5_0
-      {132353, 29}, // gs_5_1
-      {132608, 30}, // gs_6_0
-      {132609, 31}, // gs_6_1
-      {132610, 32}, // gs_6_2
-      {132611, 33}, // gs_6_3
-      {132612, 34}, // gs_6_4
-      {132613, 35}, // gs_6_5
-      {132614, 36}, // gs_6_6
-      {132615, 37}, // gs_6_7
-      {132616, 38}, // gs_6_8
-      {197888, 39}, // hs_5_0
-      {197889, 40}, // hs_5_1
-      {198144, 41}, // hs_6_0
-      {198145, 42}, // hs_6_1
-      {198146, 43}, // hs_6_2
-      {198147, 44}, // hs_6_3
-      {198148, 45}, // hs_6_4
-      {198149, 46}, // hs_6_5
-      {198150, 47}, // hs_6_6
-      {198151, 48}, // hs_6_7
-      {198152, 49}, // hs_6_8
-      {263424, 50}, // ds_5_0
-      {263425, 51}, // ds_5_1
-      {263680, 52}, // ds_6_0
-      {263681, 53}, // ds_6_1
-      {263682, 54}, // ds_6_2
-      {263683, 55}, // ds_6_3
-      {263684, 56}, // ds_6_4
-      {263685, 57}, // ds_6_5
-      {263686, 58}, // ds_6_6
-      {263687, 59}, // ds_6_7
-      {263688, 60}, // ds_6_8
-      {328704, 61}, // cs_4_0
-      {328705, 62}, // cs_4_1
-      {328960, 63}, // cs_5_0
-      {328961, 64}, // cs_5_1
-      {329216, 65}, // cs_6_0
-      {329217, 66}, // cs_6_1
-      {329218, 67}, // cs_6_2
-      {329219, 68}, // cs_6_3
-      {329220, 69}, // cs_6_4
-      {329221, 70}, // cs_6_5
-      {329222, 71}, // cs_6_6
-      {329223, 72}, // cs_6_7
-      {329224, 73}, // cs_6_8
-      {394753, 74}, // lib_6_1
-      {394754, 75}, // lib_6_2
-      {394755, 76}, // lib_6_3
-      {394756, 77}, // lib_6_4
-      {394757, 78}, // lib_6_5
-      {394758, 79}, // lib_6_6
-      {394759, 80}, // lib_6_7
-      {394760, 81}, // lib_6_8
+      {1545, 13},   // ps_6_9
+      {66560, 14},  // vs_4_0
+      {66561, 15},  // vs_4_1
+      {66816, 16},  // vs_5_0
+      {66817, 17},  // vs_5_1
+      {67072, 18},  // vs_6_0
+      {67073, 19},  // vs_6_1
+      {67074, 20},  // vs_6_2
+      {67075, 21},  // vs_6_3
+      {67076, 22},  // vs_6_4
+      {67077, 23},  // vs_6_5
+      {67078, 24},  // vs_6_6
+      {67079, 25},  // vs_6_7
+      {67080, 26},  // vs_6_8
+      {67081, 27},  // vs_6_9
+      {132096, 28}, // gs_4_0
+      {132097, 29}, // gs_4_1
+      {132352, 30}, // gs_5_0
+      {132353, 31}, // gs_5_1
+      {132608, 32}, // gs_6_0
+      {132609, 33}, // gs_6_1
+      {132610, 34}, // gs_6_2
+      {132611, 35}, // gs_6_3
+      {132612, 36}, // gs_6_4
+      {132613, 37}, // gs_6_5
+      {132614, 38}, // gs_6_6
+      {132615, 39}, // gs_6_7
+      {132616, 40}, // gs_6_8
+      {132617, 41}, // gs_6_9
+      {197888, 42}, // hs_5_0
+      {197889, 43}, // hs_5_1
+      {198144, 44}, // hs_6_0
+      {198145, 45}, // hs_6_1
+      {198146, 46}, // hs_6_2
+      {198147, 47}, // hs_6_3
+      {198148, 48}, // hs_6_4
+      {198149, 49}, // hs_6_5
+      {198150, 50}, // hs_6_6
+      {198151, 51}, // hs_6_7
+      {198152, 52}, // hs_6_8
+      {198153, 53}, // hs_6_9
+      {263424, 54}, // ds_5_0
+      {263425, 55}, // ds_5_1
+      {263680, 56}, // ds_6_0
+      {263681, 57}, // ds_6_1
+      {263682, 58}, // ds_6_2
+      {263683, 59}, // ds_6_3
+      {263684, 60}, // ds_6_4
+      {263685, 61}, // ds_6_5
+      {263686, 62}, // ds_6_6
+      {263687, 63}, // ds_6_7
+      {263688, 64}, // ds_6_8
+      {263689, 65}, // ds_6_9
+      {328704, 66}, // cs_4_0
+      {328705, 67}, // cs_4_1
+      {328960, 68}, // cs_5_0
+      {328961, 69}, // cs_5_1
+      {329216, 70}, // cs_6_0
+      {329217, 71}, // cs_6_1
+      {329218, 72}, // cs_6_2
+      {329219, 73}, // cs_6_3
+      {329220, 74}, // cs_6_4
+      {329221, 75}, // cs_6_5
+      {329222, 76}, // cs_6_6
+      {329223, 77}, // cs_6_7
+      {329224, 78}, // cs_6_8
+      {329225, 79}, // cs_6_9
+      {394753, 80}, // lib_6_1
+      {394754, 81}, // lib_6_2
+      {394755, 82}, // lib_6_3
+      {394756, 83}, // lib_6_4
+      {394757, 84}, // lib_6_5
+      {394758, 85}, // lib_6_6
+      {394759, 86}, // lib_6_7
+      {394760, 87}, // lib_6_8
+      {394761, 88}, // lib_6_9
       // lib_6_x is for offline linking only, and relaxes restrictions
-      {394767, 82}, // lib_6_x
-      {853509, 83}, // ms_6_5
-      {853510, 84}, // ms_6_6
-      {853511, 85}, // ms_6_7
-      {853512, 86}, // ms_6_8
-      {919045, 87}, // as_6_5
-      {919046, 88}, // as_6_6
-      {919047, 89}, // as_6_7
-      {919048, 90}, // as_6_8
+      {394767, 89}, // lib_6_x
+      {853509, 90}, // ms_6_5
+      {853510, 91}, // ms_6_6
+      {853511, 92}, // ms_6_7
+      {853512, 93}, // ms_6_8
+      {853513, 94}, // ms_6_9
+      {919045, 95}, // as_6_5
+      {919046, 96}, // as_6_6
+      {919047, 97}, // as_6_7
+      {919048, 98}, // as_6_8
+      {919049, 99}, // as_6_9
   };
   unsigned hash = (unsigned)Kind << 16 | Major << 8 | Minor;
   auto pred = [](const std::pair<unsigned, unsigned> &elem, unsigned val) {
@@ -183,11 +193,21 @@ const ShaderModel *ShaderModel::Get(Kind Kind, unsigned Major, unsigned Minor) {
   // VALRULE-TEXT:END
 }
 
-const ShaderModel *ShaderModel::GetByName(llvm::StringRef Name) {
-  // [ps|vs|gs|hs|ds|cs|ms|as]_[major]_[minor]
+// Reports whether shader model major.minor is a pre-release model: newer than
+// the highest released version (kHighestReleasedMajor/Minor) but not newer than
+// the highest recognized version (kHighestMajor/Minor).
+bool ShaderModel::IsPreReleaseShaderModel(int major, int minor) {
+  const bool NewerThanReleased =
+      DXIL::CompareVersions(major, minor, kHighestReleasedMajor,
+                            kHighestReleasedMinor) > 0;
+  return NewerThanReleased &&
+         DXIL::CompareVersions(major, minor, kHighestMajor, kHighestMinor) <= 0;
+}
+
+ShaderModel::Kind ShaderModel::GetKindFromName(llvm::StringRef Name) {
   Kind kind;
   if (Name.empty()) {
-    return GetInvalid();
+    return Kind::Invalid;
   }
 
   switch (Name[0]) {
@@ -219,8 +239,17 @@ const ShaderModel *ShaderModel::GetByName(llvm::StringRef Name) {
     kind = Kind::Amplification;
     break;
   default:
-    return GetInvalid();
+    return Kind::Invalid;
   }
+  return kind;
+}
+
+const ShaderModel *ShaderModel::GetByName(llvm::StringRef Name) {
+  // [ps|vs|gs|hs|ds|cs|ms|as]_[major]_[minor]
+  Kind kind = GetKindFromName(Name);
+  if (kind == Kind::Invalid)
+    return GetInvalid();
+
   unsigned Idx = 3;
   if (kind != Kind::Library) {
     if (Name[1] != 's' || Name[2] != '_')
@@ -303,6 +332,12 @@ const ShaderModel *ShaderModel::GetByName(llvm::StringRef Name) {
       break;
     } else
       return GetInvalid();
+  case '9':
+    if (Major == 6) {
+      Minor = 9;
+      break;
+    } else
+      return GetInvalid();
     // VALRULE-TEXT:END
   case 'x':
     if (kind == Kind::Library && Major == 6) {
@@ -354,8 +389,11 @@ void ShaderModel::GetDxilVersion(unsigned &DxilMajor,
   case 8:
     DxilMinor = 8;
     break;
+  case 9:
+    DxilMinor = 9;
+    break;
   case kOfflineMinor: // Always update this to highest dxil version
-    DxilMinor = 8;
+    DxilMinor = 9;
     break;
   // VALRULE-TEXT:END
   default:
@@ -401,6 +439,9 @@ void ShaderModel::GetMinValidatorVersion(unsigned &ValMajor,
   case 8:
     ValMinor = 8;
     break;
+  case 9:
+    ValMinor = 9;
+    break;
   // VALRULE-TEXT:END
   case kOfflineMinor:
     ValMajor = 0;
@@ -541,6 +582,7 @@ const ShaderModel ShaderModel::ms_ShaderModels[kNumShaderModels] = {
     SM(Kind::Pixel, 6, 6, "ps_6_6", 32, 8, true, true, UINT_MAX),
     SM(Kind::Pixel, 6, 7, "ps_6_7", 32, 8, true, true, UINT_MAX),
     SM(Kind::Pixel, 6, 8, "ps_6_8", 32, 8, true, true, UINT_MAX),
+    SM(Kind::Pixel, 6, 9, "ps_6_9", 32, 8, true, true, UINT_MAX),
     SM(Kind::Vertex, 4, 0, "vs_4_0", 16, 16, false, false, 0),
     SM(Kind::Vertex, 4, 1, "vs_4_1", 32, 32, false, false, 0),
     SM(Kind::Vertex, 5, 0, "vs_5_0", 32, 32, true, true, 64),
@@ -554,6 +596,7 @@ const ShaderModel ShaderModel::ms_ShaderModels[kNumShaderModels] = {
     SM(Kind::Vertex, 6, 6, "vs_6_6", 32, 32, true, true, UINT_MAX),
     SM(Kind::Vertex, 6, 7, "vs_6_7", 32, 32, true, true, UINT_MAX),
     SM(Kind::Vertex, 6, 8, "vs_6_8", 32, 32, true, true, UINT_MAX),
+    SM(Kind::Vertex, 6, 9, "vs_6_9", 32, 32, true, true, UINT_MAX),
     SM(Kind::Geometry, 4, 0, "gs_4_0", 16, 32, false, false, 0),
     SM(Kind::Geometry, 4, 1, "gs_4_1", 32, 32, false, false, 0),
     SM(Kind::Geometry, 5, 0, "gs_5_0", 32, 32, true, true, 64),
@@ -567,6 +610,7 @@ const ShaderModel ShaderModel::ms_ShaderModels[kNumShaderModels] = {
     SM(Kind::Geometry, 6, 6, "gs_6_6", 32, 32, true, true, UINT_MAX),
     SM(Kind::Geometry, 6, 7, "gs_6_7", 32, 32, true, true, UINT_MAX),
     SM(Kind::Geometry, 6, 8, "gs_6_8", 32, 32, true, true, UINT_MAX),
+    SM(Kind::Geometry, 6, 9, "gs_6_9", 32, 32, true, true, UINT_MAX),
     SM(Kind::Hull, 5, 0, "hs_5_0", 32, 32, true, true, 64),
     SM(Kind::Hull, 5, 1, "hs_5_1", 32, 32, true, true, 64),
     SM(Kind::Hull, 6, 0, "hs_6_0", 32, 32, true, true, UINT_MAX),
@@ -578,6 +622,7 @@ const ShaderModel ShaderModel::ms_ShaderModels[kNumShaderModels] = {
     SM(Kind::Hull, 6, 6, "hs_6_6", 32, 32, true, true, UINT_MAX),
     SM(Kind::Hull, 6, 7, "hs_6_7", 32, 32, true, true, UINT_MAX),
     SM(Kind::Hull, 6, 8, "hs_6_8", 32, 32, true, true, UINT_MAX),
+    SM(Kind::Hull, 6, 9, "hs_6_9", 32, 32, true, true, UINT_MAX),
     SM(Kind::Domain, 5, 0, "ds_5_0", 32, 32, true, true, 64),
     SM(Kind::Domain, 5, 1, "ds_5_1", 32, 32, true, true, 64),
     SM(Kind::Domain, 6, 0, "ds_6_0", 32, 32, true, true, UINT_MAX),
@@ -589,6 +634,7 @@ const ShaderModel ShaderModel::ms_ShaderModels[kNumShaderModels] = {
     SM(Kind::Domain, 6, 6, "ds_6_6", 32, 32, true, true, UINT_MAX),
     SM(Kind::Domain, 6, 7, "ds_6_7", 32, 32, true, true, UINT_MAX),
     SM(Kind::Domain, 6, 8, "ds_6_8", 32, 32, true, true, UINT_MAX),
+    SM(Kind::Domain, 6, 9, "ds_6_9", 32, 32, true, true, UINT_MAX),
     SM(Kind::Compute, 4, 0, "cs_4_0", 0, 0, false, false, 0),
     SM(Kind::Compute, 4, 1, "cs_4_1", 0, 0, false, false, 0),
     SM(Kind::Compute, 5, 0, "cs_5_0", 0, 0, true, true, 64),
@@ -602,6 +648,7 @@ const ShaderModel ShaderModel::ms_ShaderModels[kNumShaderModels] = {
     SM(Kind::Compute, 6, 6, "cs_6_6", 0, 0, true, true, UINT_MAX),
     SM(Kind::Compute, 6, 7, "cs_6_7", 0, 0, true, true, UINT_MAX),
     SM(Kind::Compute, 6, 8, "cs_6_8", 0, 0, true, true, UINT_MAX),
+    SM(Kind::Compute, 6, 9, "cs_6_9", 0, 0, true, true, UINT_MAX),
     SM(Kind::Library, 6, 1, "lib_6_1", 32, 32, true, true, UINT_MAX),
     SM(Kind::Library, 6, 2, "lib_6_2", 32, 32, true, true, UINT_MAX),
     SM(Kind::Library, 6, 3, "lib_6_3", 32, 32, true, true, UINT_MAX),
@@ -610,6 +657,7 @@ const ShaderModel ShaderModel::ms_ShaderModels[kNumShaderModels] = {
     SM(Kind::Library, 6, 6, "lib_6_6", 32, 32, true, true, UINT_MAX),
     SM(Kind::Library, 6, 7, "lib_6_7", 32, 32, true, true, UINT_MAX),
     SM(Kind::Library, 6, 8, "lib_6_8", 32, 32, true, true, UINT_MAX),
+    SM(Kind::Library, 6, 9, "lib_6_9", 32, 32, true, true, UINT_MAX),
     // lib_6_x is for offline linking only, and relaxes restrictions
     SM(Kind::Library, 6, kOfflineMinor, "lib_6_x", 32, 32, true, true,
        UINT_MAX),
@@ -617,10 +665,12 @@ const ShaderModel ShaderModel::ms_ShaderModels[kNumShaderModels] = {
     SM(Kind::Mesh, 6, 6, "ms_6_6", 0, 0, true, true, UINT_MAX),
     SM(Kind::Mesh, 6, 7, "ms_6_7", 0, 0, true, true, UINT_MAX),
     SM(Kind::Mesh, 6, 8, "ms_6_8", 0, 0, true, true, UINT_MAX),
+    SM(Kind::Mesh, 6, 9, "ms_6_9", 0, 0, true, true, UINT_MAX),
     SM(Kind::Amplification, 6, 5, "as_6_5", 0, 0, true, true, UINT_MAX),
     SM(Kind::Amplification, 6, 6, "as_6_6", 0, 0, true, true, UINT_MAX),
     SM(Kind::Amplification, 6, 7, "as_6_7", 0, 0, true, true, UINT_MAX),
     SM(Kind::Amplification, 6, 8, "as_6_8", 0, 0, true, true, UINT_MAX),
+    SM(Kind::Amplification, 6, 9, "as_6_9", 0, 0, true, true, UINT_MAX),
     // Values before Invalid must remain sorted by Kind, then Major, then Minor.
     SM(Kind::Invalid, 0, 0, "invalid", 0, 0, false, false, 0),
     // VALRULE-TEXT:END
diff --git a/lib/DxilContainer/DxcContainerBuilder.cpp b/lib/DxilContainer/DxcContainerBuilder.cpp
index 3c10b0e70a..770aa910a4 100644
--- a/lib/DxilContainer/DxcContainerBuilder.cpp
+++ b/lib/DxilContainer/DxcContainerBuilder.cpp
@@ -104,7 +104,11 @@ HRESULT STDMETHODCALLTYPE DxcContainerBuilder::RemovePart(UINT32 fourCC) {
 
 HRESULT STDMETHODCALLTYPE
 DxcContainerBuilder::SerializeContainer(IDxcOperationResult **ppResult) {
+  if (ppResult == nullptr)
+    return E_INVALIDARG;
+
   DxcThreadMalloc TM(m_pMalloc);
+
   try {
     // Allocate memory for new dxil container.
     uint32_t ContainerSize = ComputeContainerSize();
@@ -161,6 +165,11 @@ DxcContainerBuilder::SerializeContainer(IDxcOperationResult **ppResult) {
       errorHeap.Detach();
     }
 
+    // Add Hash.
+    if (SUCCEEDED(valHR))
+      HashAndUpdate(IsDxilContainerLike(pResult->GetBufferPointer(),
+                                        pResult->GetBufferSize()));
+
     IFT(DxcResult::Create(
         valHR, DXC_OUT_OBJECT,
         {DxcOutputObject::DataOutput(DXC_OUT_OBJECT, pResult, DxcOutNoName),
@@ -169,21 +178,6 @@ DxcContainerBuilder::SerializeContainer(IDxcOperationResult **ppResult) {
   }
   CATCH_CPP_RETURN_HRESULT();
 
-  if (ppResult == nullptr || *ppResult == nullptr)
-    return S_OK;
-
-  HRESULT HR;
-  (*ppResult)->GetStatus(&HR);
-  if (FAILED(HR))
-    return HR;
-
-  CComPtr<IDxcBlob> pObject;
-  IFR((*ppResult)->GetResult(&pObject));
-
-  // Add Hash.
-  LPVOID PTR = pObject->GetBufferPointer();
-  if (IsDxilContainerLike(PTR, pObject->GetBufferSize()))
-    HashAndUpdate((DxilContainerHeader *)PTR);
   return S_OK;
 }
 
diff --git a/lib/DxilValidation/DxilContainerValidation.cpp b/lib/DxilValidation/DxilContainerValidation.cpp
index 2276b0d3de..890e90e354 100644
--- a/lib/DxilValidation/DxilContainerValidation.cpp
+++ b/lib/DxilValidation/DxilContainerValidation.cpp
@@ -1033,8 +1033,29 @@ HRESULT ValidateDxilContainerParts(llvm::Module *pModule,
     case DFCC_ResourceDef:
     case DFCC_ShaderStatistics:
     case DFCC_PrivateData:
+      break;
     case DFCC_DXIL:
-    case DFCC_ShaderDebugInfoDXIL:
+    case DFCC_ShaderDebugInfoDXIL: {
+      const DxilProgramHeader *pProgramHeader =
+          reinterpret_cast<const DxilProgramHeader *>(GetDxilPartData(pPart));
+      if (!pProgramHeader)
+        continue;
+
+      int PV = pProgramHeader->ProgramVersion;
+      int major = (PV >> 4) & 0xF; // Extract the major version (next 4 bits)
+      int minor = PV & 0xF;        // Extract the minor version (lowest 4 bits)
+
+      int moduleMajor = pDxilModule->GetShaderModel()->GetMajor();
+      int moduleMinor = pDxilModule->GetShaderModel()->GetMinor();
+      if (moduleMajor != major || moduleMinor != minor) {
+        ValCtx.EmitFormatError(ValidationRule::SmProgramVersion,
+                               {std::to_string(major), std::to_string(minor),
+                                std::to_string(moduleMajor),
+                                std::to_string(moduleMinor)});
+        return DXC_E_INCORRECT_PROGRAM_VERSION;
+      }
+      continue;
+    }
     case DFCC_ShaderDebugName:
       continue;
 
diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index b07577374f..6377bba8c5 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -2480,7 +2480,8 @@ Value *TranslateDot(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
   if (Ty->getScalarType()->isFloatingPointTy()) {
     return TranslateFDot(arg0, arg1, vecSize, hlslOP, Builder);
   } else {
-    return TranslateIDot(arg0, arg1, vecSize, hlslOP, Builder);
+    return TranslateIDot(arg0, arg1, vecSize, hlslOP, Builder,
+                         IOP == IntrinsicOp::IOP_udot);
   }
 }
 
@@ -6789,6 +6790,7 @@ IntrinsicLower gLowerTable[] = {
      DXIL::OpCode::WavePrefixOp},
     {IntrinsicOp::IOP_uabs, TranslateUAbs, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_uclamp, TranslateClamp, DXIL::OpCode::NumOpCodes},
+    {IntrinsicOp::IOP_udot, TranslateDot, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_ufirstbithigh, TranslateFirstbitHi,
      DXIL::OpCode::FirstbitHi},
     {IntrinsicOp::IOP_umad, TranslateFUITrinary, DXIL::OpCode::UMad},
diff --git a/lib/Support/assert.cpp b/lib/Support/assert.cpp
index 991ae01857..75111ea405 100644
--- a/lib/Support/assert.cpp
+++ b/lib/Support/assert.cpp
@@ -10,6 +10,8 @@
 #include "assert.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
+
+#if defined(LLVM_ASSERTIONS_TRAP) || !defined(_WIN32) // matches #ifdef below
 namespace {
 void llvm_assert_trap(const char *_Message, const char *_File, unsigned _Line,
                       const char *_Function) {
@@ -18,6 +20,7 @@ void llvm_assert_trap(const char *_Message, const char *_File, unsigned _Line,
   LLVM_BUILTIN_TRAP;
 }
 } // namespace
+#endif
 
 #ifdef _WIN32
 #include "dxc/Support/Global.h"
diff --git a/tools/clang/lib/SPIRV/DeclResultIdMapper.cpp b/tools/clang/lib/SPIRV/DeclResultIdMapper.cpp
index f84a0e34ed..fd0fa8a3d0 100644
--- a/tools/clang/lib/SPIRV/DeclResultIdMapper.cpp
+++ b/tools/clang/lib/SPIRV/DeclResultIdMapper.cpp
@@ -392,10 +392,16 @@ bool isBooleanStageIOVar(const NamedDecl *decl, QualType type,
   // [[vk::builtin(...)]] makes the decl a built-in stage variable.
   // IsFrontFace (if used as PSIn) is the only known boolean built-in stage
   // variable.
-  const bool isBooleanBuiltin =
-      (decl->getAttr<VKBuiltInAttr>() != nullptr) ||
-      (semanticKind == hlsl::Semantic::Kind::IsFrontFace &&
-       sigPointKind == hlsl::SigPoint::Kind::PSIn);
+  bool isBooleanBuiltin = false;
+
+  if ((decl->getAttr<VKBuiltInAttr>() != nullptr))
+    isBooleanBuiltin = true;
+  else if (semanticKind == hlsl::Semantic::Kind::IsFrontFace &&
+           sigPointKind == hlsl::SigPoint::Kind::PSIn) {
+    isBooleanBuiltin = true;
+  } else if (semanticKind == hlsl::Semantic::Kind::CullPrimitive) {
+    isBooleanBuiltin = true;
+  }
 
   // TODO: support boolean matrix stage I/O variable if needed.
   QualType elemType = {};
@@ -1816,24 +1822,24 @@ void DeclResultIdMapper::createCounterVar(
   }
 
   const SpirvType *counterType = spvContext.getACSBufferCounterType();
+  llvm::Optional<uint32_t> noArrayStride;
   QualType declType = decl->getType();
   if (declType->isArrayType()) {
     // Vulkan does not support multi-dimentional arrays of resource, so we
     // assume the array is a single dimensional array.
     assert(!declType->getArrayElementTypeNoTypeQual()->isArrayType());
-    uint32_t arrayStride = 4;
+
     if (const auto *constArrayType =
             astContext.getAsConstantArrayType(declType)) {
       counterType = spvContext.getArrayType(
-          counterType, constArrayType->getSize().getZExtValue(), arrayStride);
+          counterType, constArrayType->getSize().getZExtValue(), noArrayStride);
     } else {
       assert(declType->isIncompleteArrayType());
-      counterType = spvContext.getRuntimeArrayType(counterType, arrayStride);
+      counterType = spvContext.getRuntimeArrayType(counterType, noArrayStride);
     }
   } else if (isResourceDescriptorHeap(decl->getType()) ||
              isSamplerDescriptorHeap(decl->getType())) {
-    counterType =
-        spvContext.getRuntimeArrayType(counterType, /* arrayStride= */ 4);
+    counterType = spvContext.getRuntimeArrayType(counterType, noArrayStride);
   }
 
   // {RW|Append|Consume}StructuredBuffer are all in Uniform storage class.
diff --git a/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp b/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp
index 0eb2243216..9999621d31 100644
--- a/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp
+++ b/tools/clang/lib/SPIRV/LowerTypeVisitor.cpp
@@ -214,6 +214,17 @@ bool LowerTypeVisitor::visitInstruction(SpirvInstruction *instr) {
                                                  arrayType->getStride());
             instr->setResultType(resultType);
           }
+        } else if (const auto *runtimeArrayType =
+                       dyn_cast<RuntimeArrayType>(resultType)) {
+          if (const auto *imageType =
+                  dyn_cast<ImageType>(runtimeArrayType->getElementType())) {
+            auto newImgType = spvContext.getImageType(
+                imageType,
+                vkImgFeatures.format.value_or(spv::ImageFormat::Unknown));
+            resultType = spvContext.getRuntimeArrayType(
+                newImgType, runtimeArrayType->getStride());
+            instr->setResultType(resultType);
+          }
         }
       }
     }
diff --git a/tools/clang/lib/SPIRV/RawBufferMethods.cpp b/tools/clang/lib/SPIRV/RawBufferMethods.cpp
index 537c012ace..87409e7ccc 100644
--- a/tools/clang/lib/SPIRV/RawBufferMethods.cpp
+++ b/tools/clang/lib/SPIRV/RawBufferMethods.cpp
@@ -117,48 +117,32 @@ SpirvInstruction *RawBufferHandler::load64Bits(SpirvInstruction *buffer,
   SpirvInstruction *ptr = nullptr;
   auto *constUint0 =
       spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 0));
-  auto *constUint32 =
-      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 32));
 
+  // Load the first word and increment index.
   auto *index = address.getWordIndex(loc, range);
-
-  // Need to perform two 32-bit uint loads and construct a 64-bit value.
-
-  // Load the first 32-bit uint (word0).
   ptr = spvBuilder.createAccessChain(astContext.UnsignedIntTy, buffer,
                                      {constUint0, index}, loc, range);
   SpirvInstruction *word0 =
       spvBuilder.createLoad(astContext.UnsignedIntTy, ptr, loc, range);
-  // Increment the base index
   address.incrementWordIndex(loc, range);
+
+  // Load the second word and increment index.
   index = address.getWordIndex(loc, range);
-  // Load the second 32-bit uint (word1).
   ptr = spvBuilder.createAccessChain(astContext.UnsignedIntTy, buffer,
                                      {constUint0, index}, loc, range);
   SpirvInstruction *word1 =
       spvBuilder.createLoad(astContext.UnsignedIntTy, ptr, loc, range);
-
-  // Convert both word0 and word1 to 64-bit uints.
-  word0 = spvBuilder.createUnaryOp(
-      spv::Op::OpUConvert, astContext.UnsignedLongLongTy, word0, loc, range);
-  word1 = spvBuilder.createUnaryOp(
-      spv::Op::OpUConvert, astContext.UnsignedLongLongTy, word1, loc, range);
-
-  // Shift word1 to the left by 32 bits.
-  word1 = spvBuilder.createBinaryOp(spv::Op::OpShiftLeftLogical,
-                                    astContext.UnsignedLongLongTy, word1,
-                                    constUint32, loc, range);
-
-  // BitwiseOr word0 and word1.
-  result = spvBuilder.createBinaryOp(spv::Op::OpBitwiseOr,
-                                     astContext.UnsignedLongLongTy, word0,
-                                     word1, loc, range);
-  result = bitCastToNumericalOrBool(result, astContext.UnsignedLongLongTy,
-                                    target64BitType, loc, range);
-  result->setRValue();
-
   address.incrementWordIndex(loc, range);
 
+  // Combine the 2 words into a composite, and bitcast into the destination
+  // type.
+  const auto uintVec2Type =
+      astContext.getExtVectorType(astContext.UnsignedIntTy, 2);
+  auto *operand = spvBuilder.createCompositeConstruct(
+      uintVec2Type, {word0, word1}, loc, range);
+  result = spvBuilder.createUnaryOp(spv::Op::OpBitcast, target64BitType,
+                                    operand, loc, range);
+  result->setRValue();
   return result;
 }
 
@@ -441,39 +425,31 @@ void RawBufferHandler::store64Bits(SpirvInstruction *value,
   const auto loc = buffer->getSourceLocation();
   auto *constUint0 =
       spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 0));
-  auto *constUint32 =
-      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 32));
 
-  auto *index = address.getWordIndex(loc, range);
+  // Bitcast the 64-bit source value into a vector of two 32-bit words.
+  const auto uintVec2Type =
+      astContext.getExtVectorType(astContext.UnsignedIntTy, 2);
+  auto *tmp = spvBuilder.createUnaryOp(spv::Op::OpBitcast, uintVec2Type, value,
+                                       loc, range);
 
-  // The underlying element type of the ByteAddressBuffer is uint. So we
-  // need to store two 32-bit values.
+  // Extract the two words: component 0 is the low word, 1 the high word.
+  auto *A = spvBuilder.createCompositeExtract(astContext.UnsignedIntTy, tmp,
+                                              {0}, loc, range);
+  auto *B = spvBuilder.createCompositeExtract(astContext.UnsignedIntTy, tmp,
+                                              {1}, loc, range);
+
+  // Store the first word, and increment counter.
+  auto *index = address.getWordIndex(loc, range);
   auto *ptr = spvBuilder.createAccessChain(astContext.UnsignedIntTy, buffer,
                                            {constUint0, index}, loc, range);
-  // First convert the 64-bit value to uint64_t. Then extract two 32-bit words
-  // from it.
-  value = bitCastToNumericalOrBool(value, valueType,
-                                   astContext.UnsignedLongLongTy, loc, range);
-
-  // Use OpUConvert to perform truncation (produces the least significant bits).
-  SpirvInstruction *lsb = spvBuilder.createUnaryOp(
-      spv::Op::OpUConvert, astContext.UnsignedIntTy, value, loc, range);
-
-  // Shift uint64_t to the right by 32 bits and truncate to get the most
-  // significant bits.
-  SpirvInstruction *msb = spvBuilder.createUnaryOp(
-      spv::Op::OpUConvert, astContext.UnsignedIntTy,
-      spvBuilder.createBinaryOp(spv::Op::OpShiftRightLogical,
-                                astContext.UnsignedLongLongTy, value,
-                                constUint32, loc, range),
-      loc, range);
-
-  spvBuilder.createStore(ptr, lsb, loc, range);
+  spvBuilder.createStore(ptr, A, loc, range);
   address.incrementWordIndex(loc, range);
+
+  // Store the second word, and increment counter.
   index = address.getWordIndex(loc, range);
   ptr = spvBuilder.createAccessChain(astContext.UnsignedIntTy, buffer,
                                      {constUint0, index}, loc, range);
-  spvBuilder.createStore(ptr, msb, loc, range);
+  spvBuilder.createStore(ptr, B, loc, range);
   address.incrementWordIndex(loc, range);
 }
 
diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.cpp b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
index 0d47e1fa32..f9c54fea45 100644
--- a/tools/clang/lib/SPIRV/SpirvEmitter.cpp
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
@@ -8095,7 +8095,8 @@ void SpirvEmitter::assignToMSOutAttribute(
   // All other attribute writes are handled below.
   auto *varInstr = declIdMapper.getStageVarInstruction(decl);
   QualType valueType = value->getAstResultType();
-  if (valueType->isBooleanType()) {
+  if (valueType->isBooleanType() &&
+      semanticInfo.getKind() != hlsl::Semantic::Kind::CullPrimitive) {
     // Externally visible variables are changed to uint, so we need to cast the
     // value to uint.
     value = castToInt(value, valueType, astContext.UnsignedIntTy, loc);
@@ -8957,6 +8958,7 @@ SpirvEmitter::processIntrinsicCallExpr(const CallExpr *callExpr) {
     return nullptr;
   }
   case hlsl::IntrinsicOp::IOP_dot:
+  case hlsl::IntrinsicOp::IOP_udot:
     retVal = processIntrinsicDot(callExpr);
     break;
   case hlsl::IntrinsicOp::IOP_GroupMemoryBarrier:
@@ -15217,7 +15219,6 @@ bool SpirvEmitter::spirvToolsLegalize(std::vector<uint32_t> *mod,
     optimizer.RegisterPass(
         spvtools::CreateAggressiveDCEPass(spirvOptions.preserveInterface));
   }
-  optimizer.RegisterPass(spvtools::CreateReplaceInvalidOpcodePass());
   optimizer.RegisterPass(spvtools::CreateCompactIdsPass());
   optimizer.RegisterPass(spvtools::CreateSpreadVolatileSemanticsPass());
   if (spirvOptions.fixFuncCallArguments) {
diff --git a/tools/clang/test/CMakeLists.txt b/tools/clang/test/CMakeLists.txt
index 0e4733ebb7..6b0aa32fbc 100644
--- a/tools/clang/test/CMakeLists.txt
+++ b/tools/clang/test/CMakeLists.txt
@@ -147,6 +147,15 @@ if (WIN32)
           )
   set(TAEF_EXEC_ADAPTER "" CACHE STRING "adapter for taef exec test")
 
+  # Custom target that re-runs cmake/modules/nuget.cmake in script mode (-P) at build time, so
+  # anything depending on it re-triggers the logic that downloads warp from nuget if requested.
+  add_custom_target(WarpFromNuget
+            COMMAND "${CMAKE_COMMAND}"
+            "-DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}"
+            "-DBUILD_TYPE=${CMAKE_BUILD_TYPE}"
+            "-DBINARY_DIR=${CMAKE_BINARY_DIR}"
+            -P "${CMAKE_SOURCE_DIR}/cmake/modules/nuget.cmake" VERBATIM)
+
   add_lit_target("check-clang-taef-exec" "Running lit suite hlsl execution test"
             ${CMAKE_CURRENT_SOURCE_DIR}/taef_exec
             PARAMS ${CLANG_TEST_PARAMS}
@@ -154,5 +163,13 @@ if (WIN32)
             DEPENDS ExecHLSLTests dxexp
             ARGS ${CLANG_TEST_EXTRA_ARGS}
           )
+
+  add_lit_target("check-clang-taef-exec-warp" "Running lit suite hlsl execution test with D3D WARP from nuget"
+            ${CMAKE_CURRENT_SOURCE_DIR}/taef_exec
+            PARAMS ${CLANG_TEST_PARAMS}
+                adapter=${TAEF_EXEC_ADAPTER}
+            DEPENDS ExecHLSLTests dxexp WarpFromNuget
+            ARGS ${CLANG_TEST_EXTRA_ARGS}
+          )
 endif()
 # HLSL Change End
diff --git a/tools/clang/test/CodeGenHLSL/dot.hlsl b/tools/clang/test/CodeGenHLSL/dot.hlsl
new file mode 100644
index 0000000000..25dad7c8d8
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/dot.hlsl
@@ -0,0 +1,161 @@
+// RUN: %dxc -T vs_6_0 -DFUNC=dot %s | FileCheck %s
+// RUN: %dxc -T vs_6_0 -DFUNC=mul %s | FileCheck %s
+// RUN: %dxc -T vs_6_0 -DFUNC=dot -fcgl %s | FileCheck %s --check-prefix=CGLDOT
+// RUN: %dxc -T vs_6_0 -DFUNC=mul -fcgl %s | FileCheck %s --check-prefix=CGLMUL
+
+// Verifies correct implementation of dot and mul with vectors for various sizes and types.
+
+// Partially pilfered from SPIRV's intrinsic.dot.hlsl
+
+float4 main(int1 i1[2] : IO, int2 i2[2] : IT, int3 i3[2] : IH, int4 i4[2] : IF,
+            float1 f1[2] : FO, float2 f2[2] : FT, float3 f3[2] : FH, float4 f4[2] : FF,
+            uint1 u1[2] : UO, uint2 u2[2] : UT, uint3 u3[2] : UH, uint4 u4[2] : UF) : SV_Position {
+  int i = 0;
+  // CHECK-DAG: [[I0:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 0, i32 0, i8 0, i32 undef)
+  // CHECK-DAG: [[I1:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 0, i32 1, i8 0, i32 undef)
+  // CHECK: mul i32 [[I0]], [[I1]]
+  // CGLDOT: call i32 @"dx.hl.op.rn.i32 (i32, <1 x i32>, <1 x i32>)"(i32 134, <1 x i32> %{{.*}}, <1 x i32> %{{.*}})
+  // CGLMUL: call i32 @"dx.hl.op.rn.i32 (i32, <1 x i32>, <1 x i32>)"(i32 167, <1 x i32> %{{.*}}, <1 x i32> %{{.*}})
+  i += FUNC(i1[0], i1[1]);
+
+  // CHECK-DAG: [[I00:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 1, i32 0, i8 0, i32 undef)
+  // CHECK-DAG: [[I01:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 1, i32 0, i8 1, i32 undef)
+  // CHECK-DAG: [[I10:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 1, i32 1, i8 0, i32 undef)
+  // CHECK-DAG: [[I11:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 1, i32 1, i8 1, i32 undef)
+
+  // CHECK: [[MUL:%.*]] = mul i32 [[I00]], [[I10]]
+  // CHECK: call i32 @dx.op.tertiary.i32(i32 48, i32 [[I01]], i32 [[I11]], i32 [[MUL]])  ; IMad(a,b,c)
+  // CGLDOT: call i32 @"dx.hl.op.rn.i32 (i32, <2 x i32>, <2 x i32>)"(i32 134, <2 x i32> %{{.*}}, <2 x i32> %{{.*}})
+  // CGLMUL: call i32 @"dx.hl.op.rn.i32 (i32, <2 x i32>, <2 x i32>)"(i32 167, <2 x i32> %{{.*}}, <2 x i32> %{{.*}})
+  i += FUNC(i2[0], i2[1]);
+
+  // CHECK-DAG: [[I00:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 2, i32 0, i8 0, i32 undef)
+  // CHECK-DAG: [[I01:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 2, i32 0, i8 1, i32 undef)
+  // CHECK-DAG: [[I02:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 2, i32 0, i8 2, i32 undef)
+  // CHECK-DAG: [[I10:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 2, i32 1, i8 0, i32 undef)
+  // CHECK-DAG: [[I11:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 2, i32 1, i8 1, i32 undef)
+  // CHECK-DAG: [[I12:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 2, i32 1, i8 2, i32 undef)
+
+  // PING and PONG are just conveniences to track the result as it accumulates,
+  // since we can't capture and match the source and result in the same line with the same variable.
+  // CHECK: [[PING:%.*]] = mul i32 [[I00]], [[I10]]
+  // CHECK: [[PONG:%.*]] = call i32 @dx.op.tertiary.i32(i32 48, i32 [[I01]], i32 [[I11]], i32 [[PING]])  ; IMad(a,b,c)
+  // CHECK: [[PING:%.*]] = call i32 @dx.op.tertiary.i32(i32 48, i32 [[I02]], i32 [[I12]], i32 [[PONG]])  ; IMad(a,b,c)
+  // CGLDOT: call i32 @"dx.hl.op.rn.i32 (i32, <3 x i32>, <3 x i32>)"(i32 134, <3 x i32> %{{.*}}, <3 x i32> %{{.*}})
+  // CGLMUL: call i32 @"dx.hl.op.rn.i32 (i32, <3 x i32>, <3 x i32>)"(i32 167, <3 x i32> %{{.*}}, <3 x i32> %{{.*}})
+  i += FUNC(i3[0], i3[1]);
+
+  // CHECK-DAG: [[I00:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 3, i32 0, i8 0, i32 undef)
+  // CHECK-DAG: [[I01:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 3, i32 0, i8 1, i32 undef)
+  // CHECK-DAG: [[I02:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 3, i32 0, i8 2, i32 undef)
+  // CHECK-DAG: [[I03:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 3, i32 0, i8 3, i32 undef)
+  // CHECK-DAG: [[I10:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 3, i32 1, i8 0, i32 undef)
+  // CHECK-DAG: [[I11:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 3, i32 1, i8 1, i32 undef)
+  // CHECK-DAG: [[I12:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 3, i32 1, i8 2, i32 undef)
+  // CHECK-DAG: [[I13:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 3, i32 1, i8 3, i32 undef)
+
+  // CHECK: [[PING:%.*]] = mul i32 [[I00]], [[I10]]
+  // CHECK: [[PONG:%.*]] = call i32 @dx.op.tertiary.i32(i32 48, i32 [[I01]], i32 [[I11]], i32 [[PING]])  ; IMad(a,b,c)
+  // CHECK: [[PING:%.*]] = call i32 @dx.op.tertiary.i32(i32 48, i32 [[I02]], i32 [[I12]], i32 [[PONG]])  ; IMad(a,b,c)
+  // CHECK: [[PONG:%.*]] = call i32 @dx.op.tertiary.i32(i32 48, i32 [[I03]], i32 [[I13]], i32 [[PING]])  ; IMad(a,b,c)
+  // CGLDOT: call i32 @"dx.hl.op.rn.i32 (i32, <4 x i32>, <4 x i32>)"(i32 134, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CGLMUL: call i32 @"dx.hl.op.rn.i32 (i32, <4 x i32>, <4 x i32>)"(i32 167, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  i += FUNC(i4[0], i4[1]);
+
+  float f = 0.0;
+
+  // CHECK-DAG: [[F0:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 4, i32 0, i8 0, i32 undef)
+  // CHECK-DAG: [[F1:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 4, i32 1, i8 0, i32 undef)
+  // CHECK: mul fast float [[F0]], [[F1]]
+  // CGLDOT: call float @"dx.hl.op.rn.float (i32, <1 x float>, <1 x float>)"(i32 134, <1 x float> %{{.*}}, <1 x float> %{{.*}})
+  // CGLMUL: call float @"dx.hl.op.rn.float (i32, <1 x float>, <1 x float>)"(i32 167, <1 x float> %{{.*}}, <1 x float> %{{.*}})
+  f += FUNC(f1[0], f1[1]);
+
+  // CHECK-DAG: [[F00:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 5, i32 0, i8 0, i32 undef)
+  // CHECK-DAG: [[F01:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 5, i32 0, i8 1, i32 undef)
+  // CHECK-DAG: [[F10:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 5, i32 1, i8 0, i32 undef)
+  // CHECK-DAG: [[F11:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 5, i32 1, i8 1, i32 undef)
+
+  // CHECK: call float @dx.op.dot2.f32(i32 54, float [[F00]], float [[F01]], float [[F10]], float [[F11]])
+  // CGLDOT: call float @"dx.hl.op.rn.float (i32, <2 x float>, <2 x float>)"(i32 134, <2 x float> %{{.*}}, <2 x float> %{{.*}})
+  // CGLMUL: call float @"dx.hl.op.rn.float (i32, <2 x float>, <2 x float>)"(i32 167, <2 x float> %{{.*}}, <2 x float> %{{.*}})
+  f += FUNC(f2[0], f2[1]);
+
+  // CHECK-DAG: [[F00:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 6, i32 0, i8 0, i32 undef)
+  // CHECK-DAG: [[F01:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 6, i32 0, i8 1, i32 undef)
+  // CHECK-DAG: [[F02:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 6, i32 0, i8 2, i32 undef)
+  // CHECK-DAG: [[F10:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 6, i32 1, i8 0, i32 undef)
+  // CHECK-DAG: [[F11:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 6, i32 1, i8 1, i32 undef)
+  // CHECK-DAG: [[F12:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 6, i32 1, i8 2, i32 undef)
+
+  // CHECK: call float @dx.op.dot3.f32(i32 55, float [[F00]], float [[F01]], float [[F02]], float [[F10]], float [[F11]], float [[F12]])
+  // CGLDOT: call float @"dx.hl.op.rn.float (i32, <3 x float>, <3 x float>)"(i32 134, <3 x float> %{{.*}}, <3 x float> %{{.*}})
+  // CGLMUL: call float @"dx.hl.op.rn.float (i32, <3 x float>, <3 x float>)"(i32 167, <3 x float> %{{.*}}, <3 x float> %{{.*}})
+  f += FUNC(f3[0], f3[1]);
+
+  // CHECK-DAG: [[F00:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 7, i32 0, i8 0, i32 undef)
+  // CHECK-DAG: [[F01:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 7, i32 0, i8 1, i32 undef)
+  // CHECK-DAG: [[F02:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 7, i32 0, i8 2, i32 undef)
+  // CHECK-DAG: [[F03:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 7, i32 0, i8 3, i32 undef)
+  // CHECK-DAG: [[F10:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 7, i32 1, i8 0, i32 undef)
+  // CHECK-DAG: [[F11:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 7, i32 1, i8 1, i32 undef)
+  // CHECK-DAG: [[F12:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 7, i32 1, i8 2, i32 undef)
+  // CHECK-DAG: [[F13:%.*]] = call float @dx.op.loadInput.f32(i32 4, i32 7, i32 1, i8 3, i32 undef)
+
+  // CHECK: call float @dx.op.dot4.f32(i32 56, float [[F00]], float [[F01]], float [[F02]], float [[F03]], float [[F10]], float [[F11]], float [[F12]], float [[F13]])
+  // CGLDOT: call float @"dx.hl.op.rn.float (i32, <4 x float>, <4 x float>)"(i32 134, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CGLMUL: call float @"dx.hl.op.rn.float (i32, <4 x float>, <4 x float>)"(i32 167, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  f += FUNC(f4[0], f4[1]);
+
+  int u = 0;
+  // CHECK-DAG: [[I0:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 8, i32 0, i8 0, i32 undef)
+  // CHECK-DAG: [[I1:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 8, i32 1, i8 0, i32 undef)
+  // CHECK: mul i32 [[I0]], [[I1]]
+  // CGLDOT: call i32 @"dx.hl.op.rn.i32 (i32, <1 x i32>, <1 x i32>)"(i32 349, <1 x i32> %{{.*}}, <1 x i32> %{{.*}})
+  // CGLMUL: call i32 @"dx.hl.op.rn.i32 (i32, <1 x i32>, <1 x i32>)"(i32 354, <1 x i32> %{{.*}}, <1 x i32> %{{.*}})
+  u += FUNC(u1[0], u1[1]);
+
+  // CHECK-DAG: [[I00:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 9, i32 0, i8 0, i32 undef)
+  // CHECK-DAG: [[I01:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 9, i32 0, i8 1, i32 undef)
+  // CHECK-DAG: [[I10:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 9, i32 1, i8 0, i32 undef)
+  // CHECK-DAG: [[I11:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 9, i32 1, i8 1, i32 undef)
+
+  // CHECK: [[MUL:%.*]] = mul i32 [[I00]], [[I10]]
+  // CHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 [[I01]], i32 [[I11]], i32 [[MUL]])  ; UMad(a,b,c)
+  // CGLDOT: call i32 @"dx.hl.op.rn.i32 (i32, <2 x i32>, <2 x i32>)"(i32 349, <2 x i32> %{{.*}}, <2 x i32> %{{.*}})
+  // CGLMUL: call i32 @"dx.hl.op.rn.i32 (i32, <2 x i32>, <2 x i32>)"(i32 354, <2 x i32> %{{.*}}, <2 x i32> %{{.*}})
+  u += FUNC(u2[0], u2[1]);
+
+  // CHECK-DAG: [[I00:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 10, i32 0, i8 0, i32 undef)
+  // CHECK-DAG: [[I01:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 10, i32 0, i8 1, i32 undef)
+  // CHECK-DAG: [[I02:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 10, i32 0, i8 2, i32 undef)
+  // CHECK-DAG: [[I10:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 10, i32 1, i8 0, i32 undef)
+  // CHECK-DAG: [[I11:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 10, i32 1, i8 1, i32 undef)
+  // CHECK-DAG: [[I12:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 10, i32 1, i8 2, i32 undef)
+
+  // CHECK: [[PING:%.*]] = mul i32 [[I00]], [[I10]]
+  // CHECK: [[PONG:%.*]] = call i32 @dx.op.tertiary.i32(i32 49, i32 [[I01]], i32 [[I11]], i32 [[PING]])  ; UMad(a,b,c)
+  // CHECK: [[PING:%.*]] = call i32 @dx.op.tertiary.i32(i32 49, i32 [[I02]], i32 [[I12]], i32 [[PONG]])  ; UMad(a,b,c)
+  // CGLDOT: call i32 @"dx.hl.op.rn.i32 (i32, <3 x i32>, <3 x i32>)"(i32 349, <3 x i32> %{{.*}}, <3 x i32> %{{.*}})
+  // CGLMUL: call i32 @"dx.hl.op.rn.i32 (i32, <3 x i32>, <3 x i32>)"(i32 354, <3 x i32> %{{.*}}, <3 x i32> %{{.*}})
+  u += FUNC(u3[0], u3[1]);
+
+  // CHECK-DAG: [[I00:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 11, i32 0, i8 0, i32 undef)
+  // CHECK-DAG: [[I01:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 11, i32 0, i8 1, i32 undef)
+  // CHECK-DAG: [[I02:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 11, i32 0, i8 2, i32 undef)
+  // CHECK-DAG: [[I03:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 11, i32 0, i8 3, i32 undef)
+  // CHECK-DAG: [[I10:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 11, i32 1, i8 0, i32 undef)
+  // CHECK-DAG: [[I11:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 11, i32 1, i8 1, i32 undef)
+  // CHECK-DAG: [[I12:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 11, i32 1, i8 2, i32 undef)
+  // CHECK-DAG: [[I13:%.*]] = call i32 @dx.op.loadInput.i32(i32 4, i32 11, i32 1, i8 3, i32 undef)
+
+  // CHECK: [[PING:%.*]] = mul i32 [[I00]], [[I10]]
+  // CHECK: [[PONG:%.*]] = call i32 @dx.op.tertiary.i32(i32 49, i32 [[I01]], i32 [[I11]], i32 [[PING]])  ; UMad(a,b,c)
+  // CHECK: [[PING:%.*]] = call i32 @dx.op.tertiary.i32(i32 49, i32 [[I02]], i32 [[I12]], i32 [[PONG]])  ; UMad(a,b,c)
+  // CHECK: [[PONG:%.*]] = call i32 @dx.op.tertiary.i32(i32 49, i32 [[I03]], i32 [[I13]], i32 [[PING]])  ; UMad(a,b,c)
+  // CGLDOT: call i32 @"dx.hl.op.rn.i32 (i32, <4 x i32>, <4 x i32>)"(i32 349, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CGLMUL: call i32 @"dx.hl.op.rn.i32 (i32, <4 x i32>, <4 x i32>)"(i32 354, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  u += FUNC(u4[0], u4[1]);
+
+  return float4(i, f, u, 0);
+}
diff --git a/tools/clang/test/CodeGenHLSL/minimal2.ll b/tools/clang/test/CodeGenHLSL/minimal2.ll
new file mode 100644
index 0000000000..846ce51f06
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/minimal2.ll
@@ -0,0 +1,49 @@
+; RUN: opt -simplify-inst -S < %s | FileCheck %s
+
+%"class.Texture3D<unsigned int>" = type { i32, %"class.Texture3D<unsigned int>::mips_type" }
+%"class.Texture3D<unsigned int>::mips_type" = type { i32 }
+%dx.types.Handle = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+
+@"\01?t1@@3V?$Texture3D@I@@A" = external global %"class.Texture3D<unsigned int>", align 4
+
+; Function Attrs: nounwind
+define void @main(<2 x i32> %dtid, i32 %laneID) #0 { ; simplify-inst input whose two while.* blocks are not reachable from entry
+"\01?f1@@s1@@U1@@Z.exit":
+  br label %if.end.13 ; entry jumps directly to the exit block
+
+while.cond.6.preheader:                           ; No predecessors!
+  ; CHECK: %0 = extractelement <2 x i32> %1, i32 1
+  %0 = extractelement <2 x i32> %2, i32 1 ; reads %2, which is only defined in while.body below
+  %cmp7.14 = icmp ne i32 %0, 0
+  br i1 %cmp7.14, label %while.body, label %if.end.13
+
+while.body:                                       ; preds = %while.cond.6.preheader
+  %1 = extractelement <2 x i32> %2, i32 0 ; element 0 of %2, the same element the insertelement below writes
+  ; CHECK: %sub = sub i32 %sub, 1
+  ; CHECK: %1 = insertelement <2 x i32> %1, i32 %sub, i32 0
+  %sub = sub i32 %1, 1 ; expected output above has this sub using its own result as operand
+  %2 = insertelement <2 x i32> %2, i32 %sub, i32 0 ; %2 is its own vector operand (self-reference)
+  br label %if.end.13
+
+if.end.13:                                        ; preds = %while.body, %while.cond.6.preheader, %"\01?f1@@s1@@U1@@Z.exit"
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind readnone
+declare i1 @"dx.hl.op.rn.i1 (i32, i1)"(i32, i1) #1
+
+; Function Attrs: nounwind readonly
+declare i32 @"dx.hl.op.ro.i32 (i32, %dx.types.Handle, <4 x i32>)"(i32, %dx.types.Handle, <4 x i32>) #2
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.Texture3D<unsigned int>\22)"(i32, %"class.Texture3D<unsigned int>") #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.Texture3D<unsigned int>\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"class.Texture3D<unsigned int>") #1
diff --git a/tools/clang/test/CodeGenHLSL/vector-and.hlsl b/tools/clang/test/CodeGenHLSL/vector-and.hlsl
new file mode 100644
index 0000000000..d6ae00af2f
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/vector-and.hlsl
@@ -0,0 +1,141 @@
+// RUN: %dxc -T ps_6_0 -HV 2021 -DTYPE=bool %s | FileCheck %s --check-prefixes=CHECK,I32
+// RUN: %dxc -T ps_6_0 -HV 2018 -DTYPE=bool %s | FileCheck %s --check-prefixes=CHECK,I32
+// RUN: %dxc -T ps_6_0 -HV 2021 -DTYPE=int %s | FileCheck %s --check-prefixes=CHECK,I32
+// RUN: %dxc -T ps_6_0 -HV 2018 -DTYPE=int %s | FileCheck %s --check-prefixes=CHECK,I32
+// RUN: %dxc -T ps_6_0 -HV 2021 -DTYPE=float %s | FileCheck %s --check-prefixes=CHECK,F32
+// RUN: %dxc -T ps_6_0 -HV 2018 -DTYPE=float %s | FileCheck %s --check-prefixes=CHECK,F32
+
+// I32: %dx.types.ResRet.[[TY:i32]] = type { [[TYPE:i32]]
+// F32: %dx.types.ResRet.[[TY:f32]] = type { [[TYPE:float]]
+
+// CHECK-LABEL: define void @main
+
+ByteAddressBuffer buf;
+
+float4 main() : SV_Target {
+  // Calls and() on scalar, 1-vector, 3-vector and 2x3 matrix operands of TYPE loaded from buf.
+  // CHECK-DAG: [[SAR:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.bufferLoad.[[TY]](i32 68, %dx.types.Handle %{{.*}}, i32 0
+  // CHECK-DAG: [[SAX:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[SAR]], 0
+  // I32-DAG: [[SA:%.*]] = icmp ne i32 [[SAX]], 0
+  // F32-DAG: [[SA:%.*]] = fcmp fast une float [[SAX]], 0.000000e+00
+
+  // CHECK-DAG: [[SBR:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.bufferLoad.[[TY]](i32 68, %dx.types.Handle %{{.*}}, i32 8
+  // CHECK-DAG: [[SBX:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[SBR]], 0
+  // I32-DAG: [[SB:%.*]] = icmp ne i32 [[SBX]], 0
+  // F32-DAG: [[SB:%.*]] = fcmp fast une float [[SBX]], 0.000000e+00
+
+  TYPE sb = buf.Load<TYPE>(8);  // scalar operand b (offset 8), loaded before a
+  TYPE sa = buf.Load<TYPE>(0);  // scalar operand a (offset 0)
+
+  // CHECK: and i1 [[SB]], [[SA]]
+  TYPE res = and(sa, sb);  // expected IR: both operands compared against zero, then one i1 and
+
+  // CHECK-DAG: [[V1AR:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.bufferLoad.[[TY]](i32 68, %dx.types.Handle %{{.*}}, i32 16
+  // CHECK-DAG: [[V1AX:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[V1AR]], 0
+  // I32-DAG: [[V1A:%.*]] = icmp ne i32 [[V1AX]], 0
+  // F32-DAG: [[V1A:%.*]] = fcmp fast une float [[V1AX]], 0.000000e+00
+
+  // CHECK-DAG: [[V1BR:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.bufferLoad.[[TY]](i32 68, %dx.types.Handle %{{.*}}, i32 24
+  // CHECK-DAG: [[V1BX:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[V1BR]], 0
+  // I32-DAG: [[V1B:%.*]] = icmp ne i32 [[V1BX]], 0
+  // F32-DAG: [[V1B:%.*]] = fcmp fast une float [[V1BX]], 0.000000e+00
+
+  vector<TYPE, 1> v1b = buf.Load< vector<TYPE, 1> >(24);  // 1-element vector operand b (offset 24)
+  vector<TYPE, 1> v1a = buf.Load< vector<TYPE, 1> >(16);  // 1-element vector operand a (offset 16)
+
+  // CHECK: and i1 [[V1B]], [[V1A]]
+  vector<TYPE, 1> res1 = and(v1a, v1b);  // expected IR matches the scalar case: a single i1 and
+
+  // CHECK-DAG: [[V3AR:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.bufferLoad.[[TY]](i32 68, %dx.types.Handle %{{.*}}, i32 32
+  // CHECK-DAG: [[V3AX0:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[V3AR]], 0
+  // CHECK-DAG: [[V3AX1:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[V3AR]], 1
+  // CHECK-DAG: [[V3AX2:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[V3AR]], 2
+
+  // I32-DAG: [[V3A0:%.*]] = icmp ne i32 [[V3AX0]], 0
+  // I32-DAG: [[V3A1:%.*]] = icmp ne i32 [[V3AX1]], 0
+  // I32-DAG: [[V3A2:%.*]] = icmp ne i32 [[V3AX2]], 0
+
+  // F32-DAG: [[V3A0:%.*]] = fcmp fast une float [[V3AX0]], 0.000000e+00
+  // F32-DAG: [[V3A1:%.*]] = fcmp fast une float [[V3AX1]], 0.000000e+00
+  // F32-DAG: [[V3A2:%.*]] = fcmp fast une float [[V3AX2]], 0.000000e+00
+
+  // CHECK-DAG: [[V3BR:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.bufferLoad.[[TY]](i32 68, %dx.types.Handle %{{.*}}, i32 56
+  // CHECK-DAG: [[V3BX0:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[V3BR]], 0
+  // CHECK-DAG: [[V3BX1:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[V3BR]], 1
+  // CHECK-DAG: [[V3BX2:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[V3BR]], 2
+
+  // I32-DAG: [[V3B0:%.*]] = icmp ne i32 [[V3BX0]], 0
+  // I32-DAG: [[V3B1:%.*]] = icmp ne i32 [[V3BX1]], 0
+  // I32-DAG: [[V3B2:%.*]] = icmp ne i32 [[V3BX2]], 0
+
+  // F32-DAG: [[V3B0:%.*]] = fcmp fast une float [[V3BX0]], 0.000000e+00
+  // F32-DAG: [[V3B1:%.*]] = fcmp fast une float [[V3BX1]], 0.000000e+00
+  // F32-DAG: [[V3B2:%.*]] = fcmp fast une float [[V3BX2]], 0.000000e+00
+
+  vector<TYPE, 3> v3b = buf.Load< vector<TYPE, 3> >(56);  // 3-element vector operand b (offset 56)
+  vector<TYPE, 3> v3a = buf.Load< vector<TYPE, 3> >(32);  // 3-element vector operand a (offset 32)
+
+  // CHECK: and i1 [[V3B0]], [[V3A0]]
+  // CHECK: and i1 [[V3B1]], [[V3A1]]
+  // CHECK: and i1 [[V3B2]], [[V3A2]]
+  vector<TYPE, 3> res3 = and(v3a, v3b);  // expected IR: one i1 and per component, in component order
+
+  // CHECK-DAG: [[MAR:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.bufferLoad.[[TY]](i32 68, %dx.types.Handle %{{.*}}, i32 80
+  // CHECK-DAG: [[MAX0:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MAR]], 0
+  // CHECK-DAG: [[MAX1:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MAR]], 1
+  // CHECK-DAG: [[MAX2:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MAR]], 2
+  // CHECK-DAG: [[MAX3:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MAR]], 3
+  // CHECK-DAG: [[MAR:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.bufferLoad.[[TY]](i32 68, %dx.types.Handle %{{.*}}, i32 96
+  // CHECK-DAG: [[MAX4:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MAR]], 0
+  // CHECK-DAG: [[MAX5:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MAR]], 1
+
+  // I32-DAG: [[MA0:%.*]] = icmp ne i32 [[MAX0]], 0
+  // I32-DAG: [[MA1:%.*]] = icmp ne i32 [[MAX1]], 0
+  // I32-DAG: [[MA2:%.*]] = icmp ne i32 [[MAX2]], 0
+  // I32-DAG: [[MA3:%.*]] = icmp ne i32 [[MAX3]], 0
+  // I32-DAG: [[MA4:%.*]] = icmp ne i32 [[MAX4]], 0
+  // I32-DAG: [[MA5:%.*]] = icmp ne i32 [[MAX5]], 0
+
+  // F32-DAG: [[MA0:%.*]] = fcmp fast une float [[MAX0]], 0.000000e+00
+  // F32-DAG: [[MA1:%.*]] = fcmp fast une float [[MAX1]], 0.000000e+00
+  // F32-DAG: [[MA2:%.*]] = fcmp fast une float [[MAX2]], 0.000000e+00
+  // F32-DAG: [[MA3:%.*]] = fcmp fast une float [[MAX3]], 0.000000e+00
+  // F32-DAG: [[MA4:%.*]] = fcmp fast une float [[MAX4]], 0.000000e+00
+  // F32-DAG: [[MA5:%.*]] = fcmp fast une float [[MAX5]], 0.000000e+00
+
+  // CHECK-DAG: [[MBR:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.bufferLoad.[[TY]](i32 68, %dx.types.Handle %{{.*}}, i32 128
+  // CHECK-DAG: [[MBX0:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MBR]], 0
+  // CHECK-DAG: [[MBX1:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MBR]], 1
+  // CHECK-DAG: [[MBX2:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MBR]], 2
+  // CHECK-DAG: [[MBX3:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MBR]], 3
+  // CHECK-DAG: [[MBR:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.bufferLoad.[[TY]](i32 68, %dx.types.Handle %{{.*}}, i32 144
+  // CHECK-DAG: [[MBX4:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MBR]], 0
+  // CHECK-DAG: [[MBX5:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MBR]], 1
+
+  // I32-DAG: [[MB0:%.*]] = icmp ne i32 [[MBX0]], 0
+  // I32-DAG: [[MB1:%.*]] = icmp ne i32 [[MBX1]], 0
+  // I32-DAG: [[MB2:%.*]] = icmp ne i32 [[MBX2]], 0
+  // I32-DAG: [[MB3:%.*]] = icmp ne i32 [[MBX3]], 0
+  // I32-DAG: [[MB4:%.*]] = icmp ne i32 [[MBX4]], 0
+  // I32-DAG: [[MB5:%.*]] = icmp ne i32 [[MBX5]], 0
+
+  // F32-DAG: [[MB0:%.*]] = fcmp fast une float [[MBX0]], 0.000000e+00
+  // F32-DAG: [[MB1:%.*]] = fcmp fast une float [[MBX1]], 0.000000e+00
+  // F32-DAG: [[MB2:%.*]] = fcmp fast une float [[MBX2]], 0.000000e+00
+  // F32-DAG: [[MB3:%.*]] = fcmp fast une float [[MBX3]], 0.000000e+00
+  // F32-DAG: [[MB4:%.*]] = fcmp fast une float [[MBX4]], 0.000000e+00
+  // F32-DAG: [[MB5:%.*]] = fcmp fast une float [[MBX5]], 0.000000e+00
+
+  matrix<TYPE, 2, 3> matb = buf.Load< matrix<TYPE, 2, 3> >(128);  // matrix operand b; expected IR uses two bufferLoads (128, 144)
+  matrix<TYPE, 2, 3> mata = buf.Load< matrix<TYPE, 2, 3> >(80);  // matrix operand a; expected IR uses two bufferLoads (80, 96)
+
+  // CHECK: and i1 [[MB0]], [[MA0]]
+  // CHECK: and i1 [[MB1]], [[MA1]]
+  // CHECK: and i1 [[MB2]], [[MA2]]
+  // CHECK: and i1 [[MB3]], [[MA3]]
+  // CHECK: and i1 [[MB4]], [[MA4]]
+  // CHECK: and i1 [[MB5]], [[MA5]]
+  matrix<TYPE, 2, 3> resmat = and(mata, matb);  // expected IR: six i1 ands, one per matrix element
+
+  return float4(res3 + resmat[0] + resmat[1], res + res1.x);  // folds all four results into the output (presumably to keep them live; confirm)
+}
diff --git a/tools/clang/test/CodeGenHLSL/vector-or.hlsl b/tools/clang/test/CodeGenHLSL/vector-or.hlsl
new file mode 100644
index 0000000000..2fe6c72434
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/vector-or.hlsl
@@ -0,0 +1,164 @@
+// RUN: %dxc -T ps_6_0 -HV 2021 -DTYPE=bool %s | FileCheck %s --check-prefixes=CHECK,I32
+// RUN: %dxc -T ps_6_0 -HV 2018 -DTYPE=bool %s | FileCheck %s --check-prefixes=CHECK,I32
+// RUN: %dxc -T ps_6_0 -HV 2021 -DTYPE=int %s | FileCheck %s --check-prefixes=CHECK,I32
+// RUN: %dxc -T ps_6_0 -HV 2018 -DTYPE=int %s | FileCheck %s --check-prefixes=CHECK,I32
+// RUN: %dxc -T ps_6_0 -HV 2021 -DTYPE=float %s | FileCheck %s --check-prefixes=CHECK,F32
+// RUN: %dxc -T ps_6_0 -HV 2018 -DTYPE=float %s | FileCheck %s --check-prefixes=CHECK,F32
+
+// I32: %dx.types.ResRet.[[TY:i32]] = type { [[TYPE:i32]]
+// F32: %dx.types.ResRet.[[TY:f32]] = type { [[TYPE:float]]
+
+// CHECK-LABEL: define void @main
+
+ByteAddressBuffer buf;
+
+float4 main() : SV_Target {
+  // Calls or() on scalar, 1-vector, 3-vector and 2x3 matrix operands of TYPE loaded from buf.
+  // CHECK: [[SAR:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.bufferLoad.[[TY]](i32 68, %dx.types.Handle %{{.*}}, i32 0
+  // F32: [[SAX:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[SAR]], 0
+  // I32: [[SA:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[SAR]], 0
+
+  // CHECK: [[SBR:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.bufferLoad.[[TY]](i32 68, %dx.types.Handle %{{.*}}, i32 8
+  // F32: [[SBX:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[SBR]], 0
+  // I32: [[SB:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[SBR]], 0
+
+  // F32: [[SA:%.*]] = fcmp fast une float [[SAX]], 0.000000e+00
+  // F32: [[SB:%.*]] = fcmp fast une float [[SBX]], 0.000000e+00
+
+  TYPE sa = buf.Load<TYPE>(0);  // scalar operand a (offset 0)
+  TYPE sb = buf.Load<TYPE>(8);  // scalar operand b (offset 8)
+
+  // I32: or i32 [[SB]], [[SA]]
+  // F32: or i1 [[SA]], [[SB]]
+
+  TYPE res = or(sb, sa);  // note: arguments passed as (sb, sa) here, unlike the (a, b) order used below
+
+  // CHECK: [[V1AR:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.bufferLoad.[[TY]](i32 68, %dx.types.Handle %{{.*}}, i32 16
+  // F32: [[V1AX:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[V1AR]], 0
+  // I32: [[V1A:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[V1AR]], 0
+
+  // CHECK: [[V1BR:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.bufferLoad.[[TY]](i32 68, %dx.types.Handle %{{.*}}, i32 24
+  // F32: [[V1BX:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[V1BR]], 0
+  // I32: [[V1B:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[V1BR]], 0
+
+  // F32: [[V1B:%.*]] = fcmp fast une float [[V1BX]], 0.000000e+00
+  // F32: [[V1A:%.*]] = fcmp fast une float [[V1AX]], 0.000000e+00
+
+  vector<TYPE, 1> v1a = buf.Load< vector<TYPE, 1> >(16);  // 1-element vector operand a (offset 16)
+  vector<TYPE, 1> v1b = buf.Load< vector<TYPE, 1> >(24);  // 1-element vector operand b (offset 24)
+
+  // I32: or i32 [[V1B]], [[V1A]]
+  // F32: or i1 [[V1A]], [[V1B]]
+
+  vector<TYPE, 1> res1 = or(v1a, v1b);  // expected IR: i32 or for int/bool runs, zero-compare then i1 or for float runs
+
+  // CHECK: [[V3AR:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.bufferLoad.[[TY]](i32 68, %dx.types.Handle %{{.*}}, i32 32
+  // F32: [[V3AX0:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[V3AR]], 0
+  // F32: [[V3AX1:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[V3AR]], 1
+  // F32: [[V3AX2:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[V3AR]], 2
+
+  // I32: [[V3A0:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[V3AR]], 0
+  // I32: [[V3A1:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[V3AR]], 1
+  // I32: [[V3A2:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[V3AR]], 2
+
+  // CHECK: [[V3BR:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.bufferLoad.[[TY]](i32 68, %dx.types.Handle %{{.*}}, i32 56
+  // F32: [[V3BX0:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[V3BR]], 0
+  // F32: [[V3BX1:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[V3BR]], 1
+  // F32: [[V3BX2:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[V3BR]], 2
+
+  // I32: [[V3B0:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[V3BR]], 0
+  // I32: [[V3B1:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[V3BR]], 1
+  // I32: [[V3B2:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[V3BR]], 2
+
+  // F32: [[V3B0:%.*]] = fcmp fast une float [[V3BX0]], 0.000000e+00
+  // F32: [[V3B1:%.*]] = fcmp fast une float [[V3BX1]], 0.000000e+00
+  // F32: [[V3B2:%.*]] = fcmp fast une float [[V3BX2]], 0.000000e+00
+
+  // F32: [[V3A0:%.*]] = fcmp fast une float [[V3AX0]], 0.000000e+00
+  // F32: [[V3A1:%.*]] = fcmp fast une float [[V3AX1]], 0.000000e+00
+  // F32: [[V3A2:%.*]] = fcmp fast une float [[V3AX2]], 0.000000e+00
+
+  vector<TYPE, 3> v3a = buf.Load< vector<TYPE, 3> >(32);  // 3-element vector operand a (offset 32)
+  vector<TYPE, 3> v3b = buf.Load< vector<TYPE, 3> >(56);  // 3-element vector operand b (offset 56)
+
+  // I32: or i32 [[V3B0]], [[V3A0]]
+  // I32: or i32 [[V3B1]], [[V3A1]]
+  // I32: or i32 [[V3B2]], [[V3A2]]
+
+  // F32: or i1 [[V3A0]], [[V3B0]]
+  // F32: or i1 [[V3A1]], [[V3B1]]
+  // F32: or i1 [[V3A2]], [[V3B2]]
+
+  vector<TYPE, 3> res3 = or(v3a, v3b);  // expected IR: one or per component, in component order
+
+  // CHECK: [[MAR:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.bufferLoad.[[TY]](i32 68, %dx.types.Handle %{{.*}}, i32 80
+  // F32: [[MAX0:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MAR]], 0
+  // F32: [[MAX1:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MAR]], 1
+  // F32: [[MAX2:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MAR]], 2
+  // F32: [[MAX3:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MAR]], 3
+
+  // I32: [[MA0:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MAR]], 0
+  // I32: [[MA1:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MAR]], 1
+  // I32: [[MA2:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MAR]], 2
+  // I32: [[MA3:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MAR]], 3
+
+  // CHECK: [[MAR:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.bufferLoad.[[TY]](i32 68, %dx.types.Handle %{{.*}}, i32 96
+  // F32: [[MAX4:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MAR]], 0
+  // F32: [[MAX5:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MAR]], 1
+
+  // I32: [[MA4:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MAR]], 0
+  // I32: [[MA5:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MAR]], 1
+
+  // CHECK: [[MBR:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.bufferLoad.[[TY]](i32 68, %dx.types.Handle %{{.*}}, i32 128
+  // F32: [[MBX0:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MBR]], 0
+  // F32: [[MBX1:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MBR]], 1
+  // F32: [[MBX2:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MBR]], 2
+  // F32: [[MBX3:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MBR]], 3
+
+  // I32: [[MB0:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MBR]], 0
+  // I32: [[MB1:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MBR]], 1
+  // I32: [[MB2:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MBR]], 2
+  // I32: [[MB3:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MBR]], 3
+
+  // CHECK: [[MBR:%.*]] = call %dx.types.ResRet.[[TY]] @dx.op.bufferLoad.[[TY]](i32 68, %dx.types.Handle %{{.*}}, i32 144
+  // F32: [[MBX4:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MBR]], 0
+  // F32: [[MBX5:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MBR]], 1
+
+  // I32: [[MB4:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MBR]], 0
+  // I32: [[MB5:%.*]] = extractvalue %dx.types.ResRet.[[TY]] [[MBR]], 1
+
+  // F32: [[MB0:%.*]] = fcmp fast une float [[MBX0]], 0.000000e+00
+  // F32: [[MB1:%.*]] = fcmp fast une float [[MBX1]], 0.000000e+00
+  // F32: [[MB2:%.*]] = fcmp fast une float [[MBX2]], 0.000000e+00
+  // F32: [[MB3:%.*]] = fcmp fast une float [[MBX3]], 0.000000e+00
+  // F32: [[MB4:%.*]] = fcmp fast une float [[MBX4]], 0.000000e+00
+  // F32: [[MB5:%.*]] = fcmp fast une float [[MBX5]], 0.000000e+00
+
+  // F32: [[MA0:%.*]] = fcmp fast une float [[MAX0]], 0.000000e+00
+  // F32: [[MA1:%.*]] = fcmp fast une float [[MAX1]], 0.000000e+00
+  // F32: [[MA2:%.*]] = fcmp fast une float [[MAX2]], 0.000000e+00
+  // F32: [[MA3:%.*]] = fcmp fast une float [[MAX3]], 0.000000e+00
+  // F32: [[MA4:%.*]] = fcmp fast une float [[MAX4]], 0.000000e+00
+  // F32: [[MA5:%.*]] = fcmp fast une float [[MAX5]], 0.000000e+00
+
+  matrix<TYPE, 2, 3> mata = buf.Load< matrix<TYPE, 2, 3> >(80);  // matrix operand a; expected IR uses two bufferLoads (80, 96)
+  matrix<TYPE, 2, 3> matb = buf.Load< matrix<TYPE, 2, 3> >(128);  // matrix operand b; expected IR uses two bufferLoads (128, 144)
+
+  // I32: or i32 [[MB0]], [[MA0]]
+  // I32: or i32 [[MB1]], [[MA1]]
+  // I32: or i32 [[MB2]], [[MA2]]
+  // I32: or i32 [[MB3]], [[MA3]]
+  // I32: or i32 [[MB4]], [[MA4]]
+  // I32: or i32 [[MB5]], [[MA5]]
+
+  // F32: or i1 [[MA0]], [[MB0]]
+  // F32: or i1 [[MA1]], [[MB1]]
+  // F32: or i1 [[MA2]], [[MB2]]
+  // F32: or i1 [[MA3]], [[MB3]]
+  // F32: or i1 [[MA4]], [[MB4]]
+  // F32: or i1 [[MA5]], [[MB5]]
+
+  matrix<TYPE, 2, 3> resmat = or(mata, matb);  // expected IR: six ors, one per matrix element
+
+  return float4(res3 + resmat[0] + resmat[1], res + res1.x);  // folds all four results into the output (presumably to keep them live; confirm)
+}
diff --git a/tools/clang/test/CodeGenSPIRV/ext_builtin_input.lib.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/ext_builtin_input.lib.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/ext_builtin_input.lib.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/ext_builtin_input.lib.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.inline.builtin.both.error.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.builtin.both.error.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.inline.builtin.both.error.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.builtin.both.error.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.inline.builtin.enum.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.builtin.enum.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.inline.builtin.enum.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.builtin.enum.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.inline.builtin.input.flat.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.builtin.input.flat.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.inline.builtin.input.flat.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.builtin.input.flat.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.inline.builtin.input.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.builtin.input.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.inline.builtin.input.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.builtin.input.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.inline.builtin.input.nonconst.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.builtin.input.nonconst.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.inline.builtin.input.nonconst.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.builtin.input.nonconst.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.inline.builtin.input.nonstatic.error.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.builtin.input.nonstatic.error.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.inline.builtin.input.nonstatic.error.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.builtin.input.nonstatic.error.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.inline.builtin.output.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.builtin.output.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.inline.builtin.output.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.builtin.output.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.inline.builtin.output.nonstatic.error.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.builtin.output.nonstatic.error.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.inline.builtin.output.nonstatic.error.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.builtin.output.nonstatic.error.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.inline.builtin.redefine.error.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.builtin.redefine.error.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.inline.builtin.redefine.error.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.builtin.redefine.error.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.inline.capability.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.capability.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.inline.capability.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.capability.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.inline.executionmode.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.executionmode.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.inline.executionmode.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.executionmode.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.inline.executionmode.undefined.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.executionmode.undefined.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.inline.executionmode.undefined.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.executionmode.undefined.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.inline.extension.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.extension.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.inline.extension.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.extension.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.inline.extension.unused.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.extension.unused.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.inline.extension.unused.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.extension.unused.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.inline.type.alignment.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.type.alignment.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.inline.type.alignment.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.type.alignment.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.inline.type.enum-class.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.type.enum-class.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.inline.type.enum-class.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.type.enum-class.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.inline.type.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.type.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.inline.type.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.type.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.inline.type.literal.error.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.type.literal.error.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.inline.type.literal.error.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.inline.type.literal.error.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.intrinsic.reference.error.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsic.reference.error.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.intrinsic.reference.error.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsic.reference.error.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.intrinsic.result_id.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsic.result_id.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.intrinsic.result_id.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsic.result_id.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.intrinsicDecorate.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicDecorate.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.intrinsicDecorate.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicDecorate.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.intrinsicExecutionMode.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicExecutionMode.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.intrinsicExecutionMode.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicExecutionMode.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.intrinsicExecutionMode.template.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicExecutionMode.template.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.intrinsicExecutionMode.template.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicExecutionMode.template.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.intrinsicExecutionModeId.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicExecutionModeId.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.intrinsicExecutionModeId.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicExecutionModeId.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.intrinsicInstruction.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicInstruction.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.intrinsicInstruction.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicInstruction.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.intrinsicLiteral.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicLiteral.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.intrinsicLiteral.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicLiteral.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.intrinsicLiteralVariable.error.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicLiteralVariable.error.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.intrinsicLiteralVariable.error.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicLiteralVariable.error.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.intrinsicLiteralVariable.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicLiteralVariable.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.intrinsicLiteralVariable.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicLiteralVariable.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.intrinsicStorageClass.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicStorageClass.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.intrinsicStorageClass.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicStorageClass.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/spv.intrinsicTypeInteger.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicTypeInteger.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/spv.intrinsicTypeInteger.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/spv.intrinsicTypeInteger.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/workgroupspirvpointer.const.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/workgroupspirvpointer.const.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/workgroupspirvpointer.const.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/workgroupspirvpointer.const.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/workgroupspirvpointer.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/workgroupspirvpointer.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/workgroupspirvpointer.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/workgroupspirvpointer.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/workgroupspirvpointer.varpointer.hlsl b/tools/clang/test/CodeGenSPIRV/inline-spirv/workgroupspirvpointer.varpointer.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenSPIRV/workgroupspirvpointer.varpointer.hlsl
rename to tools/clang/test/CodeGenSPIRV/inline-spirv/workgroupspirvpointer.varpointer.hlsl
diff --git a/tools/clang/test/CodeGenSPIRV/intrinsics.and.hlsl b/tools/clang/test/CodeGenSPIRV/intrinsics.and.hlsl
index a983e20c45..970faf6a8d 100644
--- a/tools/clang/test/CodeGenSPIRV/intrinsics.and.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/intrinsics.and.hlsl
@@ -1,6 +1,9 @@
 // RUN: %dxc -T ps_6_0 -E main -HV 2021 -fcgl  %s -spirv | FileCheck %s
 // RUN: %dxc -T ps_6_0 -E main -HV 2018 -fcgl  %s -spirv | FileCheck %s
 
+// CHECK-DAG: [[v3_0:%[0-9]+]] = OpConstantComposite %v3int %int_0 %int_0 %int_0
+// CHECK-DAG: [[v3_1:%[0-9]+]] = OpConstantComposite %v3int %int_1 %int_1 %int_1
+
 void main() {
 // CHECK-LABEL: %bb_entry = OpLabel
 
@@ -33,4 +36,46 @@ void main() {
 // CHECK-NEXT: [[and3:%[0-9]+]] = OpLogicalAnd %bool [[a1]] [[b1]]
 // CHECK-NEXT:      {{%[0-9]+}} = OpCompositeConstruct %v2bool [[and3]] %true
     bool2 t = bool2(and(a, b), true);
+
+    int a_0, b_0, c_0;
+    // Plain assign (scalar)
+// CHECK:      [[a0_int:%[0-9]+]] = OpLoad %int %a_0
+// CHECK-NEXT: [[a0:%[0-9]+]] = OpINotEqual %bool [[a0_int]] %int_0
+// CHECK-NEXT: [[b0_int:%[0-9]+]] = OpLoad %int %b_0
+// CHECK-NEXT: [[b0:%[0-9]+]] = OpINotEqual %bool [[b0_int]] %int_0
+// CHECK-NEXT: [[and0:%[0-9]+]] = OpLogicalAnd %bool [[a0]] [[b0]]
+// CHECK-NEXT: [[sel:%[0-9]+]] = OpSelect %int [[and0]] %int_1 %int_0
+// CHECK-NEXT: OpStore %c_0 [[sel]]
+    c_0 = and(a_0, b_0);
+
+    int1 i_0, j_0, k_0;
+    int3 o_0, p_0, q_0;
+    // Plain assign (vector)
+// CHECK-NEXT: [[i0_int:%[0-9]+]] = OpLoad %int %i_0
+// CHECK-NEXT: [[i0:%[0-9]+]] = OpINotEqual %bool [[i0_int]] %int_0
+// CHECK-NEXT: [[j0_int:%[0-9]+]] = OpLoad %int %j_0
+// CHECK-NEXT: [[j0:%[0-9]+]] = OpINotEqual %bool [[j0_int]] %int_0
+// CHECK-NEXT: [[and1:%[0-9]+]] = OpLogicalAnd %bool [[i0]] [[j0]]
+// CHECK-NEXT: [[sel:%[0-9]+]] = OpSelect %int [[and1]] %int_1 %int_0
+// CHECK-NEXT: OpStore %k_0 [[sel]]
+    k_0 = and(i_0, j_0);
+
+// CHECK-NEXT: [[o0_int:%[0-9]+]] = OpLoad %v3int %o_0
+// CHECK-NEXT: [[o0:%[0-9]+]] = OpINotEqual %v3bool [[o0_int]] [[v3_0]]
+// CHECK-NEXT: [[p0_int:%[0-9]+]] = OpLoad %v3int %p_0
+// CHECK-NEXT: [[p0:%[0-9]+]] = OpINotEqual %v3bool [[p0_int]] [[v3_0]]
+// CHECK-NEXT: [[and2:%[0-9]+]] = OpLogicalAnd %v3bool [[o0]] [[p0]]
+// CHECK-NEXT: [[sel:%[0-9]+]] = OpSelect %v3int [[and2]] [[v3_1]] [[v3_0]]
+// CHECK-NEXT: OpStore %q_0 [[sel]]
+    q_0 = and(o_0, p_0);
+
+// The result of '&&' could be 'const bool'. In such cases, make sure
+// the result type is correct.
+// CHECK:      [[a0_int:%[0-9]+]] = OpLoad %int %a_0
+// CHECK-NEXT: [[a0:%[0-9]+]] = OpINotEqual %bool [[a0_int]] %int_0
+// CHECK-NEXT: [[b0_int:%[0-9]+]] = OpLoad %int %b_0
+// CHECK-NEXT: [[b0:%[0-9]+]] = OpINotEqual %bool [[b0_int]] %int_0
+// CHECK-NEXT: [[and0:%[0-9]+]] = OpLogicalAnd %bool [[a0]] [[b0]]
+// CHECK-NEXT:      {{%[0-9]+}} = OpCompositeConstruct %v2bool [[and0]] %true
+    t = bool2(and(a_0, b_0), true);
 }
diff --git a/tools/clang/test/CodeGenSPIRV/intrinsics.or.hlsl b/tools/clang/test/CodeGenSPIRV/intrinsics.or.hlsl
index a61272211e..39a39062f1 100644
--- a/tools/clang/test/CodeGenSPIRV/intrinsics.or.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/intrinsics.or.hlsl
@@ -1,6 +1,8 @@
 // RUN: %dxc -T ps_6_0 -E main -HV 2021 -fcgl  %s -spirv | FileCheck %s
 // RUN: %dxc -T ps_6_0 -E main -HV 2018 -fcgl  %s -spirv | FileCheck %s
 
+// CHECK: [[v3_0:%[0-9]+]] = OpConstantComposite %v3int %int_0 %int_0 %int_0
+
 void main() {
 // CHECK-LABEL: %bb_entry = OpLabel
 
@@ -25,4 +27,36 @@ void main() {
 // CHECK-NEXT: OpStore %q [[or2]]
     k = or(i, j);
     q = or(o, p);
+
+    int r, s;
+    bool t;
+    // Plain assign (scalar)
+// CHECK:      [[r0_int:%[0-9]+]] = OpLoad %int %r
+// CHECK-NEXT: [[r0:%[0-9]+]] = OpINotEqual %bool [[r0_int]] %int_0
+// CHECK-NEXT: [[s0_int:%[0-9]+]] = OpLoad %int %s
+// CHECK-NEXT: [[s0:%[0-9]+]] = OpINotEqual %bool [[s0_int]] %int_0
+// CHECK-NEXT: [[or0:%[0-9]+]] = OpLogicalOr %bool [[r0]] [[s0]]
+// CHECK-NEXT: OpStore %t [[or0]]
+    t = or(r, s);
+
+    int1 u, v;
+    bool1 w;
+    // Plain assign (vector)
+// CHECK-NEXT: [[u0_int:%[0-9]+]] = OpLoad %int %u
+// CHECK-NEXT: [[u0:%[0-9]+]] = OpINotEqual %bool [[u0_int]] %int_0
+// CHECK-NEXT: [[v0_int:%[0-9]+]] = OpLoad %int %v
+// CHECK-NEXT: [[v0:%[0-9]+]] = OpINotEqual %bool [[v0_int]] %int_0
+// CHECK-NEXT: [[or1:%[0-9]+]] = OpLogicalOr %bool [[u0]] [[v0]]
+// CHECK-NEXT: OpStore %w [[or1]]
+    w = or(u, v);
+
+    int3 x, y;
+    bool3 z;
+// CHECK-NEXT: [[x0_int:%[0-9]+]] = OpLoad %v3int %x
+// CHECK-NEXT: [[x0:%[0-9]+]] = OpINotEqual %v3bool [[x0_int]] [[v3_0]]
+// CHECK-NEXT: [[y0_int:%[0-9]+]] = OpLoad %v3int %y
+// CHECK-NEXT: [[y0:%[0-9]+]] = OpINotEqual %v3bool [[y0_int]] [[v3_0]]
+// CHECK-NEXT: [[or2:%[0-9]+]] = OpLogicalOr %v3bool [[x0]] [[y0]]
+// CHECK-NEXT: OpStore %z [[or2]]
+    z = or(x, y);
 }
diff --git a/tools/clang/test/CodeGenSPIRV/meshshading.ext.cullprimative.hlsl b/tools/clang/test/CodeGenSPIRV/meshshading.ext.cullprimative.hlsl
index d45f4bdb6e..2a143afab2 100644
--- a/tools/clang/test/CodeGenSPIRV/meshshading.ext.cullprimative.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/meshshading.ext.cullprimative.hlsl
@@ -9,7 +9,7 @@ struct MeshletPrimitiveOut
 // specification says that externally visible variables cannot be bool.
 // CHECK: OpDecorate [[var:%[0-9]+]] BuiltIn CullPrimitiveEXT
 // CHECK: OpDecorate [[var]] PerPrimitiveEXT
-// CHECK: [[var]] = OpVariable %_ptr_Output__arr_uint_uint_2 Output
+// CHECK: [[var]] = OpVariable %_ptr_Output__arr_bool_uint_2 Output
 
 struct VertOut
 {
@@ -28,7 +28,7 @@ void main(uint svGroupIndex : SV_GROUPINDEX,
 
 // Make sure that the references to m_cullPrimitive use uints.
 // CHECK: [[idx:%[0-9]+]] = OpLoad %uint %gl_LocalInvocationIndex
-// CHECK: [[ac:%[0-9]+]] = OpAccessChain %_ptr_Output_uint [[var]] [[idx]]
-// CHECK: OpStore [[ac]] %uint_0
+// CHECK: [[ac:%[0-9]+]] = OpAccessChain %_ptr_Output_bool [[var]] [[idx]]
+// CHECK: OpStore [[ac]] %false
     primitives[svGroupIndex].m_cullPrimitive = false;
 }
diff --git a/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.load.double.capability.hlsl b/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.load.double.capability.hlsl
new file mode 100644
index 0000000000..535bbecfe6
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.load.double.capability.hlsl
@@ -0,0 +1,39 @@
+// RUN: %dxc -T cs_6_0 -E main -O0 %s -spirv | FileCheck %s
+
+// CHECK-NOT: OpCapability Int64
+// CHECK-DAG: OpCapability Float64
+// CHECK-NOT: OpCapability Int64
+
+RWByteAddressBuffer buffer;
+
+[numthreads(1, 1, 1)]
+void main() {
+  double tmp;
+
+// CHECK: [[addr1:%[0-9]+]] = OpShiftRightLogical %uint %uint_0 %uint_2
+// CHECK:   [[ptr:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buffer %uint_0 [[addr1]]
+// CHECK: [[word0:%[0-9]+]] = OpLoad %uint [[ptr]]
+// CHECK: [[addr2:%[0-9]+]] = OpIAdd %uint [[addr1]] %uint_1
+// CHECK:   [[ptr:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buffer %uint_0 [[addr2]]
+// CHECK: [[word1:%[0-9]+]] = OpLoad %uint [[ptr]]
+// CHECK: [[addr3:%[0-9]+]] = OpIAdd %uint [[addr2]] %uint_1
+// CHECK: [[merge:%[0-9]+]] = OpCompositeConstruct %v2uint [[word0]] [[word1]]
+// CHECK: [[value:%[0-9]+]] = OpBitcast %double [[merge]]
+// CHECK:                     OpStore %tmp [[value]]
+  tmp = buffer.Load<double>(0);
+
+// CHECK: [[value:%[0-9]+]] = OpLoad %double %tmp
+// CHECK: [[merge:%[0-9]+]] = OpBitcast %v2uint [[value]]
+// CHECK: [[word0:%[0-9]+]] = OpCompositeExtract %uint [[merge]] 0
+// CHECK: [[word1:%[0-9]+]] = OpCompositeExtract %uint [[merge]] 1
+
+// CHECK: [[addr1:%[0-9]+]] = OpShiftRightLogical %uint %uint_0 %uint_2
+// CHECK:   [[ptr:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buffer %uint_0 [[addr1]]
+// CHECK:                     OpStore [[ptr]] [[word0]]
+// CHECK: [[addr2:%[0-9]+]] = OpIAdd %uint [[addr1]] %uint_1
+// CHECK:   [[ptr:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buffer %uint_0 [[addr2]]
+// CHECK:                     OpStore [[ptr]] [[word1]]
+// CHECK: [[addr3:%[0-9]+]] = OpIAdd %uint [[addr2]] %uint_1
+  buffer.Store<double>(0, tmp);
+}
+
diff --git a/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-load.matrix.hlsl b/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-load.matrix.hlsl
index c4ac7bca5a..7a4c968f42 100644
--- a/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-load.matrix.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-load.matrix.hlsl
@@ -98,53 +98,46 @@ void main(uint3 tid : SV_DispatchThreadId)
 // ********* 64-bit matrix ********************
 
 // CHECK:             [[index_1:%[0-9]+]] = OpShiftRightLogical %uint [[addr0_1:%[0-9]+]] %uint_2
-// CHECK:                 [[ptr_11:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_1]]
-// CHECK:               [[word0_2:%[0-9]+]] = OpLoad %uint [[ptr_11]]
-// CHECK:             [[index_1_2:%[0-9]+]] = OpIAdd %uint [[index_1]] %uint_1
-// CHECK:                 [[ptr_12:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_1_2]]
-// CHECK:               [[word1_3:%[0-9]+]] = OpLoad %uint [[ptr_12]]
-// CHECK:         [[word0_ulong:%[0-9]+]] = OpUConvert %ulong [[word0_2]]
-// CHECK:         [[word1_ulong:%[0-9]+]] = OpUConvert %ulong [[word1_3]]
-// CHECK: [[word1_ulong_shifted:%[0-9]+]] = OpShiftLeftLogical %ulong [[word1_ulong]] %uint_32
-// CHECK:          [[val0_ulong:%[0-9]+]] = OpBitwiseOr %ulong [[word0_ulong]] [[word1_ulong_shifted]]
-// CHECK:                [[val0_1:%[0-9]+]] = OpBitcast %double [[val0_ulong]]
-// CHECK:             [[index_2_2:%[0-9]+]] = OpIAdd %uint [[index_1_2]] %uint_1
-// CHECK:                 [[ptr_13:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_2_2]]
-// CHECK:               [[word2_2:%[0-9]+]] = OpLoad %uint [[ptr_13]]
-// CHECK:             [[index_3_0:%[0-9]+]] = OpIAdd %uint [[index_2_2]] %uint_1
-// CHECK:                 [[ptr_14:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_3_0]]
-// CHECK:               [[word3_0:%[0-9]+]] = OpLoad %uint [[ptr_14]]
-// CHECK:         [[word2_ulong:%[0-9]+]] = OpUConvert %ulong [[word2_2]]
-// CHECK:         [[word3_ulong:%[0-9]+]] = OpUConvert %ulong [[word3_0]]
-// CHECK: [[word3_ulong_shifted:%[0-9]+]] = OpShiftLeftLogical %ulong [[word3_ulong]] %uint_32
-// CHECK:          [[val1_ulong:%[0-9]+]] = OpBitwiseOr %ulong [[word2_ulong]] [[word3_ulong_shifted]]
-// CHECK:                [[val1_1:%[0-9]+]] = OpBitcast %double [[val1_ulong]]
-// CHECK:             [[index_4_0:%[0-9]+]] = OpIAdd %uint [[index_3_0]] %uint_1
-// CHECK:                 [[ptr_15:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_4_0]]
-// CHECK:               [[word4_0:%[0-9]+]] = OpLoad %uint [[ptr_15]]
-// CHECK:             [[index_5_0:%[0-9]+]] = OpIAdd %uint [[index_4_0]] %uint_1
-// CHECK:                 [[ptr_16:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_5_0]]
-// CHECK:               [[word5_0:%[0-9]+]] = OpLoad %uint [[ptr_16]]
-// CHECK:         [[word4_ulong:%[0-9]+]] = OpUConvert %ulong [[word4_0]]
-// CHECK:         [[word5_ulong:%[0-9]+]] = OpUConvert %ulong [[word5_0]]
-// CHECK: [[word5_ulong_shifted:%[0-9]+]] = OpShiftLeftLogical %ulong [[word5_ulong]] %uint_32
-// CHECK:          [[val2_ulong:%[0-9]+]] = OpBitwiseOr %ulong [[word4_ulong]] [[word5_ulong_shifted]]
-// CHECK:                [[val2_1:%[0-9]+]] = OpBitcast %double [[val2_ulong]]
+// CHECK:              [[ptr_11:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_1]]
+// CHECK:             [[word0_2:%[0-9]+]] = OpLoad %uint [[ptr_11]]
+// CHECK:           [[index_1_2:%[0-9]+]] = OpIAdd %uint [[index_1]] %uint_1
+// CHECK:              [[ptr_12:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_1_2]]
+// CHECK:             [[word1_3:%[0-9]+]] = OpLoad %uint [[ptr_12]]
+// CHECK:           [[index_2_2:%[0-9]+]] = OpIAdd %uint [[index_1_2]] %uint_1
+// CHECK:               [[merge:%[0-9]+]] = OpCompositeConstruct %v2uint [[word0_2]] [[word1_3]]
+// CHECK:              [[val0_1:%[0-9]+]] = OpBitcast %double [[merge]]
+
+// CHECK:              [[ptr_13:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_2_2]]
+// CHECK:             [[word2_2:%[0-9]+]] = OpLoad %uint [[ptr_13]]
+// CHECK:           [[index_3_0:%[0-9]+]] = OpIAdd %uint [[index_2_2]] %uint_1
+// CHECK:              [[ptr_14:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_3_0]]
+// CHECK:             [[word3_0:%[0-9]+]] = OpLoad %uint [[ptr_14]]
+// CHECK:           [[index_4_0:%[0-9]+]] = OpIAdd %uint [[index_3_0]] %uint_1
+// CHECK:               [[merge:%[0-9]+]] = OpCompositeConstruct %v2uint [[word2_2]] [[word3_0]]
+// CHECK:              [[val1_1:%[0-9]+]] = OpBitcast %double [[merge]]
+
+// CHECK:              [[ptr_15:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_4_0]]
+// CHECK:             [[word4_0:%[0-9]+]] = OpLoad %uint [[ptr_15]]
+// CHECK:           [[index_5_0:%[0-9]+]] = OpIAdd %uint [[index_4_0]] %uint_1
+// CHECK:              [[ptr_16:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_5_0]]
+// CHECK:             [[word5_0:%[0-9]+]] = OpLoad %uint [[ptr_16]]
 // CHECK:             [[index_6:%[0-9]+]] = OpIAdd %uint [[index_5_0]] %uint_1
-// CHECK:                 [[ptr_17:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_6]]
+// CHECK:               [[merge:%[0-9]+]] = OpCompositeConstruct %v2uint [[word4_0]] [[word5_0]]
+// CHECK:              [[val2_1:%[0-9]+]] = OpBitcast %double [[merge]]
+
+// CHECK:              [[ptr_17:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_6]]
 // CHECK:               [[word6:%[0-9]+]] = OpLoad %uint [[ptr_17]]
 // CHECK:             [[index_7:%[0-9]+]] = OpIAdd %uint [[index_6]] %uint_1
-// CHECK:                 [[ptr_18:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_7]]
+// CHECK:              [[ptr_18:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_7]]
 // CHECK:               [[word7:%[0-9]+]] = OpLoad %uint [[ptr_18]]
-// CHECK:         [[word6_ulong:%[0-9]+]] = OpUConvert %ulong [[word6]]
-// CHECK:         [[word7_ulong:%[0-9]+]] = OpUConvert %ulong [[word7]]
-// CHECK: [[word7_ulong_shifted:%[0-9]+]] = OpShiftLeftLogical %ulong [[word7_ulong]] %uint_32
-// CHECK:          [[val3_ulong:%[0-9]+]] = OpBitwiseOr %ulong [[word6_ulong]] [[word7_ulong_shifted]]
-// CHECK:                [[val3_1:%[0-9]+]] = OpBitcast %double [[val3_ulong]]
-// CHECK:                [[row0_1:%[0-9]+]] = OpCompositeConstruct %v2double [[val0_1]] [[val2_1]]
-// CHECK:                [[row1_1:%[0-9]+]] = OpCompositeConstruct %v2double [[val1_1]] [[val3_1]]
-// CHECK:              [[matrix_1:%[0-9]+]] = OpCompositeConstruct %mat2v2double [[row0_1]] [[row1_1]]
-// CHECK:                                OpStore %f64 [[matrix_1]]
+// CHECK:             [[index_8:%[0-9]+]] = OpIAdd %uint [[index_7]] %uint_1
+// CHECK:               [[merge:%[0-9]+]] = OpCompositeConstruct %v2uint [[word6]] [[word7]]
+// CHECK:              [[val3_1:%[0-9]+]] = OpBitcast %double [[merge]]
+
+// CHECK:              [[row0_1:%[0-9]+]] = OpCompositeConstruct %v2double [[val0_1]] [[val2_1]]
+// CHECK:              [[row1_1:%[0-9]+]] = OpCompositeConstruct %v2double [[val1_1]] [[val3_1]]
+// CHECK:            [[matrix_1:%[0-9]+]] = OpCompositeConstruct %mat2v2double [[row0_1]] [[row1_1]]
+// CHECK:                                   OpStore %f64 [[matrix_1]]
   float64_t2x2 f64 = buf.Load<float64_t2x2>(tid.x);
 
 // ********* array of matrices ********************
diff --git a/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-load.scalar.hlsl b/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-load.scalar.hlsl
index a1a3e7694a..96b20034b3 100644
--- a/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-load.scalar.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-load.scalar.hlsl
@@ -80,42 +80,34 @@ ByteAddressBuffer buf;
 
   // ********* 64-bit scalar ********************
 
-// CHECK:              [[ptr_9:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[addr:%[0-9]+]]
-// CHECK:            [[word0:%[0-9]+]] = OpLoad %uint [[ptr_9]]
-// CHECK:          [[newAddr:%[0-9]+]] = OpIAdd %uint [[addr]] %uint_1
-// CHECK:              [[ptr_10:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[newAddr]]
-// CHECK:            [[word1:%[0-9]+]] = OpLoad %uint [[ptr_10]]
-// CHECK:       [[word0ULong:%[0-9]+]] = OpUConvert %ulong [[word0]]
-// CHECK:       [[word1ULong:%[0-9]+]] = OpUConvert %ulong [[word1]]
-// CHECK:[[shiftedWord1ULong:%[0-9]+]] = OpShiftLeftLogical %ulong [[word1ULong]] %uint_32
-// CHECK:              [[val:%[0-9]+]] = OpBitwiseOr %ulong [[word0ULong]] [[shiftedWord1ULong]]
-// CHECK:                             OpStore %u64 [[val]]
+// CHECK:    [[ptr_9:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[addr:%[0-9]+]]
+// CHECK:    [[word0:%[0-9]+]] = OpLoad %uint [[ptr_9]]
+// CHECK:  [[newAddr:%[0-9]+]] = OpIAdd %uint [[addr]] %uint_1
+// CHECK:   [[ptr_10:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[newAddr]]
+// CHECK:    [[word1:%[0-9]+]] = OpLoad %uint [[ptr_10]]
+// CHECK:    [[merge:%[0-9]+]] = OpCompositeConstruct %v2uint [[word0]] [[word1]]
+// CHECK:      [[val:%[0-9]+]] = OpBitcast %ulong [[merge]]
+// CHECK:                        OpStore %u64 [[val]]
   uint64_t u64 = buf.Load<uint64_t>(tid.x);
 
-// CHECK:              [[ptr_11:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[addr_0:%[0-9]+]]
-// CHECK:            [[word0_0:%[0-9]+]] = OpLoad %uint [[ptr_11]]
-// CHECK:          [[newAddr_0:%[0-9]+]] = OpIAdd %uint [[addr_0]] %uint_1
-// CHECK:              [[ptr_12:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[newAddr_0]]
-// CHECK:            [[word1_0:%[0-9]+]] = OpLoad %uint [[ptr_12]]
-// CHECK:        [[word0Long:%[0-9]+]] = OpUConvert %ulong [[word0_0]]
-// CHECK:        [[word1Long:%[0-9]+]] = OpUConvert %ulong [[word1_0]]
-// CHECK: [[shiftedWord1Long:%[0-9]+]] = OpShiftLeftLogical %ulong [[word1Long]] %uint_32
-// CHECK:        [[val_ulong:%[0-9]+]] = OpBitwiseOr %ulong [[word0Long]] [[shiftedWord1Long]]
-// CHECK:         [[val_long:%[0-9]+]] = OpBitcast %long [[val_ulong]]
-// CHECK:                             OpStore %i64 [[val_long]]
+// CHECK:     [[ptr_11:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[addr_0:%[0-9]+]]
+// CHECK:    [[word0_0:%[0-9]+]] = OpLoad %uint [[ptr_11]]
+// CHECK:  [[newAddr_0:%[0-9]+]] = OpIAdd %uint [[addr_0]] %uint_1
+// CHECK:     [[ptr_12:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[newAddr_0]]
+// CHECK:    [[word1_0:%[0-9]+]] = OpLoad %uint [[ptr_12]]
+// CHECK:      [[merge:%[0-9]+]] = OpCompositeConstruct %v2uint [[word0_0]] [[word1_0]]
+// CHECK:   [[val_long:%[0-9]+]] = OpBitcast %long [[merge]]
+// CHECK:                          OpStore %i64 [[val_long]]
   int64_t i64 = buf.Load<int64_t>(tid.x);
 
-// CHECK:              [[ptr_13:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[addr_1:%[0-9]+]]
-// CHECK:            [[word0_1:%[0-9]+]] = OpLoad %uint [[ptr_13]]
-// CHECK:          [[newAddr_1:%[0-9]+]] = OpIAdd %uint [[addr_1]] %uint_1
-// CHECK:              [[ptr_14:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[newAddr_1]]
-// CHECK:            [[word1_1:%[0-9]+]] = OpLoad %uint [[ptr_14]]
-// CHECK:        [[word0Long_0:%[0-9]+]] = OpUConvert %ulong [[word0_1]]
-// CHECK:        [[word1Long_0:%[0-9]+]] = OpUConvert %ulong [[word1_1]]
-// CHECK: [[shiftedWord1Long_0:%[0-9]+]] = OpShiftLeftLogical %ulong [[word1Long_0]] %uint_32
-// CHECK:        [[val_ulong_0:%[0-9]+]] = OpBitwiseOr %ulong [[word0Long_0]] [[shiftedWord1Long_0]]
-// CHECK:       [[val_double:%[0-9]+]] = OpBitcast %double [[val_ulong_0]]
-// CHECK:                             OpStore %f64 [[val_double]]
+// CHECK:      [[ptr_13:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[addr_1:%[0-9]+]]
+// CHECK:     [[word0_1:%[0-9]+]] = OpLoad %uint [[ptr_13]]
+// CHECK:   [[newAddr_1:%[0-9]+]] = OpIAdd %uint [[addr_1]] %uint_1
+// CHECK:      [[ptr_14:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[newAddr_1]]
+// CHECK:     [[word1_1:%[0-9]+]] = OpLoad %uint [[ptr_14]]
+// CHECK:       [[merge:%[0-9]+]] = OpCompositeConstruct %v2uint [[word0_1]] [[word1_1]]
+// CHECK:  [[val_double:%[0-9]+]] = OpBitcast %double [[merge]]
+// CHECK:                           OpStore %f64 [[val_double]]
   double f64 = buf.Load<double>(tid.x);
 
   // ********* array of scalars *****************
@@ -124,68 +116,63 @@ ByteAddressBuffer buf;
 // CHECK:   [[index0:%[0-9]+]] = OpShiftRightLogical %uint [[addr0:%[0-9]+]] %uint_2
 // CHECK: [[byteOff0:%[0-9]+]] = OpUMod %uint [[addr0]] %uint_4
 // CHECK:  [[bitOff0:%[0-9]+]] = OpShiftLeftLogical %uint [[byteOff0]] %uint_3
-// CHECK:      [[ptr_15:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index0]]
-// CHECK:    [[word0_2:%[0-9]+]] = OpLoad %uint [[ptr_15]]
-// CHECK:    [[shift_4:%[0-9]+]] = OpShiftRightLogical %uint [[word0_2]] [[bitOff0]]
+// CHECK:   [[ptr_15:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index0]]
+// CHECK:  [[word0_2:%[0-9]+]] = OpLoad %uint [[ptr_15]]
+// CHECK:  [[shift_4:%[0-9]+]] = OpShiftRightLogical %uint [[word0_2]] [[bitOff0]]
 // CHECK:     [[val0:%[0-9]+]] = OpUConvert %ushort [[shift_4]]
 // CHECK:    [[addr1:%[0-9]+]] = OpIAdd %uint [[addr0]] %uint_2
 // CHECK:   [[index1:%[0-9]+]] = OpShiftRightLogical %uint [[addr1]] %uint_2
 // CHECK: [[byteOff1:%[0-9]+]] = OpUMod %uint [[addr1]] %uint_4
 // CHECK:  [[bitOff1:%[0-9]+]] = OpShiftLeftLogical %uint [[byteOff1]] %uint_3
-// CHECK:      [[ptr_16:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index1]]
-// CHECK:    [[word0_3:%[0-9]+]] = OpLoad %uint [[ptr_16]]
+// CHECK:   [[ptr_16:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index1]]
+// CHECK:  [[word0_3:%[0-9]+]] = OpLoad %uint [[ptr_16]]
 // CHECK: [[val1uint:%[0-9]+]] = OpShiftRightLogical %uint [[word0_3]] [[bitOff1]]
 // CHECK:     [[val1:%[0-9]+]] = OpUConvert %ushort [[val1uint]]
 // CHECK:    [[addr2:%[0-9]+]] = OpIAdd %uint [[addr1]] %uint_2
 // CHECK:   [[index2:%[0-9]+]] = OpShiftRightLogical %uint [[addr2]] %uint_2
 // CHECK: [[byteOff2:%[0-9]+]] = OpUMod %uint [[addr2]] %uint_4
 // CHECK:  [[bitOff2:%[0-9]+]] = OpShiftLeftLogical %uint [[byteOff2]] %uint_3
-// CHECK:      [[ptr_17:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index2]]
-// CHECK:    [[word1_2:%[0-9]+]] = OpLoad %uint [[ptr_17]]
-// CHECK:    [[shift_5:%[0-9]+]] = OpShiftRightLogical %uint [[word1_2]] [[bitOff2]]
+// CHECK:   [[ptr_17:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index2]]
+// CHECK:  [[word1_2:%[0-9]+]] = OpLoad %uint [[ptr_17]]
+// CHECK:  [[shift_5:%[0-9]+]] = OpShiftRightLogical %uint [[word1_2]] [[bitOff2]]
 // CHECK:     [[val2:%[0-9]+]] = OpUConvert %ushort [[shift_5]]
 // CHECK:     [[uArr:%[0-9]+]] = OpCompositeConstruct %_arr_ushort_uint_3 [[val0]] [[val1]] [[val2]]
-// CHECK:                     OpStore %uArr [[uArr]]
+// CHECK:                        OpStore %uArr [[uArr]]
   uint16_t uArr[3] = buf.Load<uint16_t[3]>(tid.x);
 
 // CHECK:     [[index_1:%[0-9]+]] = OpShiftRightLogical %uint [[addr_2:%[0-9]+]] %uint_2
-// CHECK:       [[ptr_18:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_1]]
-// CHECK: [[val0_uint:%[0-9]+]] = OpLoad %uint [[ptr_18]]
+// CHECK:      [[ptr_18:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_1]]
+// CHECK:   [[val0_uint:%[0-9]+]] = OpLoad %uint [[ptr_18]]
 // CHECK:      [[val0_0:%[0-9]+]] = OpBitcast %int [[val0_uint]]
-// CHECK:  [[newIndex:%[0-9]+]] = OpIAdd %uint [[index_1]] %uint_1
-// CHECK:       [[ptr_19:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[newIndex]]
-// CHECK: [[val1_uint:%[0-9]+]] = OpLoad %uint [[ptr_19]]
+// CHECK:    [[newIndex:%[0-9]+]] = OpIAdd %uint [[index_1]] %uint_1
+// CHECK:      [[ptr_19:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[newIndex]]
+// CHECK:   [[val1_uint:%[0-9]+]] = OpLoad %uint [[ptr_19]]
 // CHECK:      [[val1_0:%[0-9]+]] = OpBitcast %int [[val1_uint]]
-// CHECK:      [[iArr:%[0-9]+]] = OpCompositeConstruct %_arr_int_uint_2 [[val0_0]] [[val1_0]]
-// CHECK:                      OpStore %iArr [[iArr]]
+// CHECK:        [[iArr:%[0-9]+]] = OpCompositeConstruct %_arr_int_uint_2 [[val0_0]] [[val1_0]]
+// CHECK:                           OpStore %iArr [[iArr]]
   int iArr[2] = buf.Load<int[2]>(tid.x);
 
-// CHECK:                  [[index_0:%[0-9]+]] = OpShiftRightLogical %uint [[addr_0:%[0-9]+]] %uint_2
-// CHECK:                      [[ptr_20:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_0]]
-// CHECK:          [[val0_word0_uint:%[0-9]+]] = OpLoad %uint [[ptr_20]]
-// CHECK:                  [[index_1:%[0-9]+]] = OpIAdd %uint [[index_0]] %uint_1
-// CHECK:                      [[ptr_21:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_1]]
-// CHECK:          [[val0_word1_uint:%[0-9]+]] = OpLoad %uint [[ptr_21]]
-// CHECK:         [[val0_word0_ulong:%[0-9]+]] = OpUConvert %ulong [[val0_word0_uint]]
-// CHECK:         [[val0_word1_ulong:%[0-9]+]] = OpUConvert %ulong [[val0_word1_uint]]
-// CHECK: [[shifted_val0_word1_ulong:%[0-9]+]] = OpShiftLeftLogical %ulong [[val0_word1_ulong]] %uint_32
-// CHECK:               [[val0_ulong:%[0-9]+]] = OpBitwiseOr %ulong [[val0_word0_ulong]] [[shifted_val0_word1_ulong]]
-// CHECK:              [[val0_double:%[0-9]+]] = OpBitcast %double [[val0_ulong]]
+// CHECK:          [[index_0:%[0-9]+]] = OpShiftRightLogical %uint [[addr_0:%[0-9]+]] %uint_2
+// CHECK:           [[ptr_20:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_0]]
+// CHECK:  [[val0_word0_uint:%[0-9]+]] = OpLoad %uint [[ptr_20]]
+// CHECK:          [[index_1:%[0-9]+]] = OpIAdd %uint [[index_0]] %uint_1
+// CHECK:           [[ptr_21:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_1]]
+// CHECK:  [[val0_word1_uint:%[0-9]+]] = OpLoad %uint [[ptr_21]]
+// CHECK:          [[index_2:%[0-9]+]] = OpIAdd %uint [[index_1]] %uint_1
+// CHECK:            [[merge:%[0-9]+]] = OpCompositeConstruct %v2uint [[val0_word0_uint]] [[val0_word1_uint]]
+// CHECK:      [[val0_double:%[0-9]+]] = OpBitcast %double [[merge]]
+
+// CHECK:           [[ptr_22:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_2]]
+// CHECK:  [[val1_word0_uint:%[0-9]+]] = OpLoad %uint [[ptr_22]]
+// CHECK:          [[index_3:%[0-9]+]] = OpIAdd %uint [[index_2]] %uint_1
+// CHECK:           [[ptr_23:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_3]]
+// CHECK:  [[val1_word1_uint:%[0-9]+]] = OpLoad %uint [[ptr_23]]
+// CHECK:          [[index_4:%[0-9]+]] = OpIAdd %uint [[index_3]] %uint_1
+// CHECK:            [[merge:%[0-9]+]] = OpCompositeConstruct %v2uint [[val1_word0_uint]] [[val1_word1_uint]]
+// CHECK:      [[val1_double:%[0-9]+]] = OpBitcast %double [[merge]]
 //
-// CHECK:                  [[index_2:%[0-9]+]] = OpIAdd %uint [[index_1]] %uint_1
-// CHECK:                      [[ptr_22:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_2]]
-// CHECK:          [[val1_word0_uint:%[0-9]+]] = OpLoad %uint [[ptr_22]]
-// CHECK:                  [[index_3:%[0-9]+]] = OpIAdd %uint [[index_2]] %uint_1
-// CHECK:                      [[ptr_23:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_3]]
-// CHECK:          [[val1_word1_uint:%[0-9]+]] = OpLoad %uint [[ptr_23]]
-// CHECK:         [[val1_word0_ulong:%[0-9]+]] = OpUConvert %ulong [[val1_word0_uint]]
-// CHECK:         [[val1_word1_ulong:%[0-9]+]] = OpUConvert %ulong [[val1_word1_uint]]
-// CHECK: [[shifted_val1_word1_ulong:%[0-9]+]] = OpShiftLeftLogical %ulong [[val1_word1_ulong]] %uint_32
-// CHECK:               [[val1_ulong:%[0-9]+]] = OpBitwiseOr %ulong [[val1_word0_ulong]] [[shifted_val1_word1_ulong]]
-// CHECK:              [[val1_double:%[0-9]+]] = OpBitcast %double [[val1_ulong]]
-//
-// CHECK:                     [[fArr:%[0-9]+]] = OpCompositeConstruct %_arr_double_uint_2 [[val0_double]] [[val1_double]]
-// CHECK:                                     OpStore %fArr [[fArr]]
+// CHECK:             [[fArr:%[0-9]+]] = OpCompositeConstruct %_arr_double_uint_2 [[val0_double]] [[val1_double]]
+// CHECK:                                OpStore %fArr [[fArr]]
   double fArr[2] = buf.Load<double[2]>(tid.x);
 }
 
diff --git a/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-load.vector.hlsl b/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-load.vector.hlsl
index 07f76aad6a..16702c0e37 100644
--- a/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-load.vector.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-load.vector.hlsl
@@ -65,30 +65,25 @@ void main(uint3 tid : SV_DispatchThreadId)
 
 // ********* 64-bit vector ********************
 
-// CHECK:              [[index_3:%[0-9]+]] = OpShiftRightLogical %uint [[addr0_2:%[0-9]+]] %uint_2
-// CHECK:                  [[ptr_6:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_3]]
-// CHECK:                [[word0_3:%[0-9]+]] = OpLoad %uint [[ptr_6]]
-// CHECK:              [[index_1_2:%[0-9]+]] = OpIAdd %uint [[index_3]] %uint_1
-// CHECK:                  [[ptr_7:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_1_2]]
-// CHECK:                [[word1_2:%[0-9]+]] = OpLoad %uint [[ptr_7]]
-// CHECK:          [[word0_ulong:%[0-9]+]] = OpUConvert %ulong [[word0_3]]
-// CHECK:          [[word1_ulong:%[0-9]+]] = OpUConvert %ulong [[word1_2]]
-// CHECK:  [[shifted_word1_ulong:%[0-9]+]] = OpShiftLeftLogical %ulong [[word1_ulong]] %uint_32
-// CHECK:           [[val0_ulong:%[0-9]+]] = OpBitwiseOr %ulong [[word0_ulong]] [[shifted_word1_ulong]]
-// CHECK:                 [[val0_2:%[0-9]+]] = OpBitcast %double [[val0_ulong]]
-// CHECK:              [[index_2_0:%[0-9]+]] = OpIAdd %uint [[index_1_2]] %uint_1
-// CHECK:                  [[ptr_8:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_2_0]]
-// CHECK:                [[word0_4:%[0-9]+]] = OpLoad %uint [[ptr_8]]
-// CHECK:              [[index_3:%[0-9]+]] = OpIAdd %uint [[index_2_0]] %uint_1
-// CHECK:                  [[ptr_9:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_3]]
-// CHECK:                [[word1_3:%[0-9]+]] = OpLoad %uint [[ptr_9]]
-// CHECK:          [[word0_ulong_0:%[0-9]+]] = OpUConvert %ulong [[word0_4]]
-// CHECK:          [[word1_ulong_0:%[0-9]+]] = OpUConvert %ulong [[word1_3]]
-// CHECK:  [[shifted_word1_ulong_0:%[0-9]+]] = OpShiftLeftLogical %ulong [[word1_ulong_0]] %uint_32
-// CHECK:           [[val1_ulong:%[0-9]+]] = OpBitwiseOr %ulong [[word0_ulong_0]] [[shifted_word1_ulong_0]]
-// CHECK:                 [[val1_2:%[0-9]+]] = OpBitcast %double [[val1_ulong]]
-// CHECK:                 [[fVec:%[0-9]+]] = OpCompositeConstruct %v2double [[val0_2]] [[val1_2]]
-// CHECK:                                 OpStore %f64 [[fVec]]
+// CHECK:   [[index_3:%[0-9]+]] = OpShiftRightLogical %uint [[addr0_2:%[0-9]+]] %uint_2
+// CHECK:     [[ptr_6:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_3]]
+// CHECK:   [[word0_3:%[0-9]+]] = OpLoad %uint [[ptr_6]]
+// CHECK: [[index_1_2:%[0-9]+]] = OpIAdd %uint [[index_3]] %uint_1
+// CHECK:     [[ptr_7:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_1_2]]
+// CHECK:   [[word1_2:%[0-9]+]] = OpLoad %uint [[ptr_7]]
+// CHECK: [[index_2_0:%[0-9]+]] = OpIAdd %uint [[index_1_2]] %uint_1
+// CHECK:     [[merge:%[0-9]+]] = OpCompositeConstruct %v2uint [[word0_3]] [[word1_2]]
+// CHECK:    [[val0_2:%[0-9]+]] = OpBitcast %double [[merge]]
+// CHECK:     [[ptr_8:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_2_0]]
+// CHECK:   [[word0_4:%[0-9]+]] = OpLoad %uint [[ptr_8]]
+// CHECK:   [[index_3:%[0-9]+]] = OpIAdd %uint [[index_2_0]] %uint_1
+// CHECK:     [[ptr_9:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_3]]
+// CHECK:   [[word1_3:%[0-9]+]] = OpLoad %uint [[ptr_9]]
+// CHECK:   [[index_4:%[0-9]+]] = OpIAdd %uint [[index_3]] %uint_1
+// CHECK:     [[merge:%[0-9]+]] = OpCompositeConstruct %v2uint [[word0_4]] [[word1_3]]
+// CHECK:    [[val1_2:%[0-9]+]] = OpBitcast %double [[merge]]
+// CHECK:      [[fVec:%[0-9]+]] = OpCompositeConstruct %v2double [[val0_2]] [[val1_2]]
+// CHECK:                         OpStore %f64 [[fVec]]
   float64_t2 f64 = buf.Load<float64_t2>(tid.x);
 
 // ********* array of vectors ********************
diff --git a/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-store.struct.hlsl b/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-store.struct.hlsl
index 3bf947afe2..10c978e44d 100644
--- a/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-store.struct.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-store.struct.hlsl
@@ -226,18 +226,20 @@ void main(uint3 tid : SV_DispatchThreadId) {
 // The second member of S starts at byte offset 24 (6 words)
 //
 // CHECK:        [[c_addr:%[0-9]+]] = OpIAdd %uint [[base_addr]] %uint_24
-//
+
 // CHECK:             [[c:%[0-9]+]] = OpCompositeExtract %double [[s0]] 1
+// CHECK:         [[merge:%[0-9]+]] = OpBitcast %v2uint [[c]]
+// CHECK:       [[c_word0:%[0-9]+]] = OpCompositeExtract %uint [[merge]] 0
+// CHECK:       [[c_word1:%[0-9]+]] = OpCompositeExtract %uint [[merge]] 1
+
 // CHECK:       [[c_index:%[0-9]+]] = OpShiftRightLogical %uint [[c_addr]] %uint_2
-// CHECK:           [[ptr_4:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[c_index]]
-// CHECK:         [[c_u64:%[0-9]+]] = OpBitcast %ulong [[c]]
-// CHECK:       [[c_word0:%[0-9]+]] = OpUConvert %uint [[c_u64]]
-// CHECK: [[c_u64_shifted:%[0-9]+]] = OpShiftRightLogical %ulong [[c_u64]] %uint_32
-// CHECK:       [[c_word1:%[0-9]+]] = OpUConvert %uint [[c_u64_shifted]]
-// CHECK:                          OpStore [[ptr_4]] [[c_word0]]
+// CHECK:         [[ptr_4:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[c_index]]
+// CHECK:                             OpStore [[ptr_4]] [[c_word0]]
 // CHECK:   [[c_msb_index:%[0-9]+]] = OpIAdd %uint [[c_index]] %uint_1
-// CHECK:           [[ptr_5:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[c_msb_index]]
-// CHECK:                          OpStore [[ptr_5]] [[c_word1]]
+
+// CHECK:         [[ptr_5:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[c_msb_index]]
+// CHECK:                             OpStore [[ptr_5]] [[c_word1]]
+// CHECK:    [[next_index:%[0-9]+]] = OpIAdd %uint [[c_msb_index]] %uint_1
 
 //
 // The third member of S starts at byte offset 32 (8 words)
@@ -305,16 +307,17 @@ void main(uint3 tid : SV_DispatchThreadId) {
 // CHECK:        [[b_addr:%[0-9]+]] = OpIAdd %uint [[base_addr]] %uint_48
 //
 // CHECK:             [[b:%[0-9]+]] = OpCompositeExtract %double [[s0]] 3
+// CHECK:         [[merge:%[0-9]+]] = OpBitcast %v2uint [[b]]
+// CHECK:       [[b_word0:%[0-9]+]] = OpCompositeExtract %uint [[merge]] 0
+// CHECK:       [[b_word1:%[0-9]+]] = OpCompositeExtract %uint [[merge]] 1
+
 // CHECK:       [[b_index:%[0-9]+]] = OpShiftRightLogical %uint [[b_addr]] %uint_2
-// CHECK:           [[ptr_9:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[b_index]]
-// CHECK:         [[b_u64:%[0-9]+]] = OpBitcast %ulong [[b]]
-// CHECK:       [[b_word0:%[0-9]+]] = OpUConvert %uint [[b_u64]]
-// CHECK: [[b_u64_shifted:%[0-9]+]] = OpShiftRightLogical %ulong [[b_u64]] %uint_32
-// CHECK:       [[b_word1:%[0-9]+]] = OpUConvert %uint [[b_u64_shifted]]
-// CHECK:                          OpStore [[ptr_9]] [[b_word0]]
+// CHECK:         [[ptr_9:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[b_index]]
+// CHECK:                             OpStore [[ptr_9]] [[b_word0]]
 // CHECK:   [[b_msb_index:%[0-9]+]] = OpIAdd %uint [[b_index]] %uint_1
-// CHECK:           [[ptr_10:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[b_msb_index]]
-// CHECK:                          OpStore [[ptr_10]] [[b_word1]]
+// CHECK:        [[ptr_10:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[b_msb_index]]
+// CHECK:                             OpStore [[ptr_10]] [[b_word1]]
+// CHECK:    [[next_index:%[0-9]+]] = OpIAdd %uint [[b_msb_index]] %uint_1
 
 //
 // The fifth member of S starts at byte offset 56 (14 words)
@@ -651,19 +654,20 @@ void main(uint3 tid : SV_DispatchThreadId) {
 //
 // The second member of S starts at byte offset 24 (6 words)
 //
-// CHECK:        [[c_addr_0:%[0-9]+]] = OpIAdd %uint [[s1_addr]] %uint_24
+// CHECK:      [[c_addr_0:%[0-9]+]] = OpIAdd %uint [[s1_addr]] %uint_24
 //
-// CHECK:             [[c_0:%[0-9]+]] = OpCompositeExtract %double [[s1]] 1
-// CHECK:       [[c_index_0:%[0-9]+]] = OpShiftRightLogical %uint [[c_addr_0]] %uint_2
-// CHECK:           [[ptr_28:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[c_index_0]]
-// CHECK:         [[c_u64_0:%[0-9]+]] = OpBitcast %ulong [[c_0]]
-// CHECK:       [[c_word0_0:%[0-9]+]] = OpUConvert %uint [[c_u64_0]]
-// CHECK: [[c_u64_shifted_0:%[0-9]+]] = OpShiftRightLogical %ulong [[c_u64_0]] %uint_32
-// CHECK:       [[c_word1_0:%[0-9]+]] = OpUConvert %uint [[c_u64_shifted_0]]
-// CHECK:                          OpStore [[ptr_28]] [[c_word0_0]]
-// CHECK:   [[c_msb_index_0:%[0-9]+]] = OpIAdd %uint [[c_index_0]] %uint_1
-// CHECK:           [[ptr_29:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[c_msb_index_0]]
-// CHECK:                          OpStore [[ptr_29]] [[c_word1_0]]
+// CHECK:           [[c_0:%[0-9]+]] = OpCompositeExtract %double [[s1]] 1
+// CHECK:         [[merge:%[0-9]+]] = OpBitcast %v2uint [[c_0]]
+// CHECK:     [[c_word0_0:%[0-9]+]] = OpCompositeExtract %uint [[merge]] 0
+// CHECK:     [[c_word1_0:%[0-9]+]] = OpCompositeExtract %uint [[merge]] 1
+
+// CHECK:     [[c_index_0:%[0-9]+]] = OpShiftRightLogical %uint [[c_addr_0]] %uint_2
+// CHECK:        [[ptr_28:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[c_index_0]]
+// CHECK:                             OpStore [[ptr_28]] [[c_word0_0]]
+// CHECK: [[c_msb_index_0:%[0-9]+]] = OpIAdd %uint [[c_index_0]] %uint_1
+// CHECK:        [[ptr_29:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[c_msb_index_0]]
+// CHECK:                             OpStore [[ptr_29]] [[c_word1_0]]
+// CHECK:    [[index_next:%[0-9]+]] = OpIAdd %uint [[c_msb_index_0]] %uint_1
 
 //
 // The third member of S starts at byte offset 32 (8 words)
@@ -728,19 +732,19 @@ void main(uint3 tid : SV_DispatchThreadId) {
 //
 // The fourth member of S starts at byte offset 48 (12 words)
 //
-// CHECK:        [[b_addr_0:%[0-9]+]] = OpIAdd %uint [[s1_addr]] %uint_48
-//
-// CHECK:             [[b_0:%[0-9]+]] = OpCompositeExtract %double [[s1]] 3
-// CHECK:       [[b_index_0:%[0-9]+]] = OpShiftRightLogical %uint [[b_addr_0]] %uint_2
-// CHECK:           [[ptr_33:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[b_index_0]]
-// CHECK:         [[b_u64_0:%[0-9]+]] = OpBitcast %ulong [[b_0]]
-// CHECK:       [[b_word0_0:%[0-9]+]] = OpUConvert %uint [[b_u64_0]]
-// CHECK: [[b_u64_shifted_0:%[0-9]+]] = OpShiftRightLogical %ulong [[b_u64_0]] %uint_32
-// CHECK:       [[b_word1_0:%[0-9]+]] = OpUConvert %uint [[b_u64_shifted_0]]
-// CHECK:                          OpStore [[ptr_33]] [[b_word0_0]]
-// CHECK:   [[b_msb_index_0:%[0-9]+]] = OpIAdd %uint [[b_index_0]] %uint_1
-// CHECK:           [[ptr_34:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[b_msb_index_0]]
-// CHECK:                          OpStore [[ptr_34]] [[b_word1_0]]
+// CHECK:      [[b_addr_0:%[0-9]+]] = OpIAdd %uint [[s1_addr]] %uint_48
+//
+// CHECK:           [[b_0:%[0-9]+]] = OpCompositeExtract %double [[s1]] 3
+// CHECK:         [[merge:%[0-9]+]] = OpBitcast %v2uint [[b_0]]
+// CHECK:     [[b_word0_0:%[0-9]+]] = OpCompositeExtract %uint [[merge]] 0
+// CHECK:     [[b_word1_0:%[0-9]+]] = OpCompositeExtract %uint [[merge]] 1
+// CHECK:     [[b_index_0:%[0-9]+]] = OpShiftRightLogical %uint [[b_addr_0]] %uint_2
+// CHECK:        [[ptr_33:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[b_index_0]]
+// CHECK:                             OpStore [[ptr_33]] [[b_word0_0]]
+// CHECK: [[b_msb_index_0:%[0-9]+]] = OpIAdd %uint [[b_index_0]] %uint_1
+// CHECK:        [[ptr_34:%[0-9]+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[b_msb_index_0]]
+// CHECK:                             OpStore [[ptr_34]] [[b_word1_0]]
+// CHECK:    [[next_index:%[0-9]+]] = OpIAdd %uint [[b_msb_index_0]] %uint_1
 
 //
 // The fifth member of S starts at byte offset 56 (14 words)
diff --git a/tools/clang/test/CodeGenSPIRV/sm6_6.descriptorheap.acbuffer.hlsl b/tools/clang/test/CodeGenSPIRV/sm6_6.descriptorheap.acbuffer.hlsl
index 053a32869a..359ac0cb70 100644
--- a/tools/clang/test/CodeGenSPIRV/sm6_6.descriptorheap.acbuffer.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/sm6_6.descriptorheap.acbuffer.hlsl
@@ -9,6 +9,9 @@
 // CHECK:   OpDecorate [[resource_heap_abuffer]] Binding 0
 // CHECK:   OpDecorate [[resource_heap_abuffer_counter:%[_a-zA-Z0-9]+]] DescriptorSet 0
 // CHECK:   OpDecorate [[resource_heap_abuffer_counter]] Binding 1
+// CHECK-NOT: OpDecorate %_runtimearr_type_ACSBuffer_counter ArrayStride
+// CHECK:   OpDecorate %type_ACSBuffer_counter BufferBlock
+// CHECK-NOT: OpDecorate %_runtimearr_type_ACSBuffer_counter ArrayStride
 
 
 // CHECK-DAG:           [[ra_uint_t:%[_a-zA-Z0-9]+]] = OpTypeRuntimeArray %uint
diff --git a/tools/clang/test/CodeGenSPIRV/texture.calculate.lod.compute.linear.hlsl b/tools/clang/test/CodeGenSPIRV/texture.calculate.lod.compute.linear.hlsl
index cb2e8c5916..31d3b2e2cb 100644
--- a/tools/clang/test/CodeGenSPIRV/texture.calculate.lod.compute.linear.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/texture.calculate.lod.compute.linear.hlsl
@@ -1,4 +1,5 @@
 // RUN: %dxc -T cs_6_6 -E main -fspv-extension=SPV_NV_compute_shader_derivatives -fcgl  %s -spirv  2>&1 | FileCheck %s --check-prefix=CHECK
+// RUN: %dxc -T cs_6_6 -E main -fspv-extension=SPV_NV_compute_shader_derivatives %s -spirv  2>&1 | FileCheck %s --check-prefix=CHECK
 
 // CHECK: OpCapability ComputeDerivativeGroupLinearKHR
 // CHECK: OpExtension "SPV_NV_compute_shader_derivatives"
@@ -19,4 +20,4 @@ void main(uint3 id : SV_GroupThreadID)
     //CHECK-NEXT: [[query1:%[0-9]+]] = OpImageQueryLod %v2float [[si1]] %float_0_5
     //CHECK-NEXT:        {{%[0-9]+}} = OpCompositeExtract %float [[query1]] 0
     o[0] = t1.CalculateLevelOfDetail(ss, 0.5);
-}
\ No newline at end of file
+}
diff --git a/tools/clang/test/CodeGenSPIRV/texture.calculate.lod.compute.quad.hlsl b/tools/clang/test/CodeGenSPIRV/texture.calculate.lod.compute.quad.hlsl
index 84cdcdf22a..8b4e41b768 100644
--- a/tools/clang/test/CodeGenSPIRV/texture.calculate.lod.compute.quad.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/texture.calculate.lod.compute.quad.hlsl
@@ -1,4 +1,5 @@
 // RUN: %dxc -T cs_6_6 -E main -fspv-extension=SPV_NV_compute_shader_derivatives -fcgl  %s -spirv  2>&1 | FileCheck %s
+// RUN: %dxc -T cs_6_6 -E main -fspv-extension=SPV_NV_compute_shader_derivatives %s -spirv  2>&1 | FileCheck %s
 
 // CHECK: OpCapability ComputeDerivativeGroupQuadsKHR
 // CHECK: OpExtension "SPV_NV_compute_shader_derivatives"
@@ -19,4 +20,4 @@ void main(uint3 id : SV_GroupThreadID)
     //CHECK-NEXT: [[query1:%[0-9]+]] = OpImageQueryLod %v2float [[si1]] %float_0_5
     //CHECK-NEXT:        {{%[0-9]+}} = OpCompositeExtract %float [[query1]] 0
     o[0] = t1.CalculateLevelOfDetail(ss, 0.5);
-}
\ No newline at end of file
+}
diff --git a/tools/clang/test/CodeGenSPIRV/texture.calculate.lod.unclamped.compute.linear.hlsl b/tools/clang/test/CodeGenSPIRV/texture.calculate.lod.unclamped.compute.linear.hlsl
index efbb0d82a5..12990fbf2f 100644
--- a/tools/clang/test/CodeGenSPIRV/texture.calculate.lod.unclamped.compute.linear.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/texture.calculate.lod.unclamped.compute.linear.hlsl
@@ -1,4 +1,5 @@
 // RUN: %dxc -T cs_6_6 -E main -fspv-extension=SPV_NV_compute_shader_derivatives -fcgl  %s -spirv  2>&1 | FileCheck %s
+// RUN: %dxc -T cs_6_6 -E main -fspv-extension=SPV_NV_compute_shader_derivatives %s -spirv  2>&1 | FileCheck %s
 
 // CHECK: OpCapability ComputeDerivativeGroupLinearKHR
 // CHECK: OpExtension "SPV_NV_compute_shader_derivatives"
@@ -19,4 +20,4 @@ void main(uint3 id : SV_GroupThreadID)
     //CHECK-NEXT: [[query1:%[0-9]+]] = OpImageQueryLod %v2float [[si1]] %float_0_5
     //CHECK-NEXT:        {{%[0-9]+}} = OpCompositeExtract %float [[query1]] 1
     o[0] = t1.CalculateLevelOfDetailUnclamped(ss, 0.5);
-}
\ No newline at end of file
+}
diff --git a/tools/clang/test/CodeGenSPIRV/texture.calculate.lod.unclamped.compute.quad.hlsl b/tools/clang/test/CodeGenSPIRV/texture.calculate.lod.unclamped.compute.quad.hlsl
index 10de7c2583..13f3060818 100644
--- a/tools/clang/test/CodeGenSPIRV/texture.calculate.lod.unclamped.compute.quad.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/texture.calculate.lod.unclamped.compute.quad.hlsl
@@ -1,4 +1,5 @@
 // RUN: %dxc -T cs_6_6 -E main -fspv-extension=SPV_NV_compute_shader_derivatives -fcgl  %s -spirv  2>&1 | FileCheck %s
+// RUN: %dxc -T cs_6_6 -E main -fspv-extension=SPV_NV_compute_shader_derivatives %s -spirv  2>&1 | FileCheck %s
 
 // CHECK: OpCapability ComputeDerivativeGroupQuadsKHR
 // CHECK: OpExtension "SPV_NV_compute_shader_derivatives"
@@ -19,4 +20,4 @@ void main(uint3 id : SV_GroupThreadID)
     //CHECK-NEXT: [[query1:%[0-9]+]] = OpImageQueryLod %v2float [[si1]] %float_0_5
     //CHECK-NEXT:        {{%[0-9]+}} = OpCompositeExtract %float [[query1]] 1
     o[0] = t1.CalculateLevelOfDetailUnclamped(ss, 0.5);
-}
\ No newline at end of file
+}
diff --git a/tools/clang/test/CodeGenSPIRV/texture.sample.compute.linear.hlsl b/tools/clang/test/CodeGenSPIRV/texture.sample.compute.linear.hlsl
index 0d56a3a904..d15fcda2e9 100644
--- a/tools/clang/test/CodeGenSPIRV/texture.sample.compute.linear.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/texture.sample.compute.linear.hlsl
@@ -1,4 +1,5 @@
 // RUN: %dxc -T cs_6_6 -E main -fspv-extension=SPV_NV_compute_shader_derivatives -fcgl  %s -spirv  2>&1 | FileCheck %s
+// RUN: %dxc -T cs_6_6 -E main -fspv-extension=SPV_NV_compute_shader_derivatives %s -spirv  2>&1 | FileCheck %s
 
 // CHECK: OpCapability ComputeDerivativeGroupLinearKHR
 // CHECK: OpExtension "SPV_NV_compute_shader_derivatives"
@@ -18,4 +19,4 @@ void main(uint3 id : SV_GroupThreadID)
     // CHECK-NEXT: [[sampledImg:%[0-9]+]] = OpSampledImage %type_sampled_image [[t1]] [[ss]]
     // CHECK-NEXT:            {{%[0-9]+}} = OpImageSampleImplicitLod %v4float [[sampledImg]] %float_1 None
     o[0] = t1.Sample(ss, 1);
-}
\ No newline at end of file
+}
diff --git a/tools/clang/test/CodeGenSPIRV/texture.sample.compute.quad.hlsl b/tools/clang/test/CodeGenSPIRV/texture.sample.compute.quad.hlsl
index bbedad9c66..51bcf9c850 100644
--- a/tools/clang/test/CodeGenSPIRV/texture.sample.compute.quad.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/texture.sample.compute.quad.hlsl
@@ -1,4 +1,5 @@
 // RUN: %dxc -T cs_6_6 -E main -fspv-extension=SPV_NV_compute_shader_derivatives -fcgl  %s -spirv  2>&1 | FileCheck %s
+// RUN: %dxc -T cs_6_6 -E main -fspv-extension=SPV_NV_compute_shader_derivatives %s -spirv  2>&1 | FileCheck %s
 
 // CHECK: OpCapability ComputeDerivativeGroupQuadsKHR
 // CHECK: OpExtension "SPV_NV_compute_shader_derivatives"
@@ -18,4 +19,4 @@ void main(uint3 id : SV_GroupThreadID)
     // CHECK-NEXT: [[sampledImg:%[0-9]+]] = OpSampledImage %type_sampled_image [[t1]] [[ss]]
     // CHECK-NEXT:            {{%[0-9]+}} = OpImageSampleImplicitLod %v4float [[sampledImg]] %float_1 None
     o[0] = t1.Sample(ss, 1);
-}
\ No newline at end of file
+}
diff --git a/tools/clang/test/CodeGenSPIRV/texture.samplebias.compute.linear.hlsl b/tools/clang/test/CodeGenSPIRV/texture.samplebias.compute.linear.hlsl
index 2ce02bb9e4..1cd9965a89 100644
--- a/tools/clang/test/CodeGenSPIRV/texture.samplebias.compute.linear.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/texture.samplebias.compute.linear.hlsl
@@ -1,4 +1,4 @@
-// RUN: %dxc -T cs_6_6 -E main -fspv-extension=SPV_NV_compute_shader_derivatives -fcgl  %s -spirv  2>&1 | FileCheck %s
+// RUN: %dxc -T cs_6_6 -E main -fspv-extension=SPV_NV_compute_shader_derivatives %s -spirv  2>&1 | FileCheck %s
 
 // CHECK: OpCapability ComputeDerivativeGroupLinearKHR
 // CHECK: OpExtension "SPV_NV_compute_shader_derivatives"
@@ -13,9 +13,10 @@ Texture1D        <float>  t1;
 [numthreads(8,1,1)]
 void main(uint3 id : SV_GroupThreadID)
 {
+    Texture1D<float> local_texture = t1;
     // CHECK:              [[t1:%[0-9]+]] = OpLoad %type_1d_image %t1
     // CHECK-NEXT:   [[ss:%[0-9]+]] = OpLoad %type_sampler %ss
     // CHECK-NEXT: [[sampledImg:%[0-9]+]] = OpSampledImage %type_sampled_image [[t1]] [[ss]]
     // CHECK-NEXT:            {{%[0-9]+}} = OpImageSampleImplicitLod %v4float [[sampledImg]] %float_1 Bias %float_0_5
-    o[0] = t1.SampleBias(ss, 1, 0.5);
-}
\ No newline at end of file
+    o[0] = local_texture.SampleBias(ss, 1, 0.5);
+}
diff --git a/tools/clang/test/CodeGenSPIRV/texture.samplebias.compute.quad.hlsl b/tools/clang/test/CodeGenSPIRV/texture.samplebias.compute.quad.hlsl
index a64e6f84db..aaa8d281ea 100644
--- a/tools/clang/test/CodeGenSPIRV/texture.samplebias.compute.quad.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/texture.samplebias.compute.quad.hlsl
@@ -1,4 +1,5 @@
 // RUN: %dxc -T cs_6_6 -E main -fspv-extension=SPV_NV_compute_shader_derivatives -fcgl  %s -spirv  2>&1 | FileCheck %s
+// RUN: %dxc -T cs_6_6 -E main -fspv-extension=SPV_NV_compute_shader_derivatives %s -spirv  2>&1 | FileCheck %s
 
 // CHECK: OpCapability ComputeDerivativeGroupQuadsKHR
 // CHECK: OpExtension "SPV_NV_compute_shader_derivatives"
@@ -18,4 +19,4 @@ void main(uint3 id : SV_GroupThreadID)
     // CHECK-NEXT: [[sampledImg:%[0-9]+]] = OpSampledImage %type_sampled_image [[t1]] [[ss]]
     // CHECK-NEXT:            {{%[0-9]+}} = OpImageSampleImplicitLod %v4float [[sampledImg]] %float_1 Bias %float_0_5
     o[0] = t1.SampleBias(ss, 1, 0.5);
-}
\ No newline at end of file
+}
diff --git a/tools/clang/test/CodeGenSPIRV/texture.samplecmp.compute.linear.hlsl b/tools/clang/test/CodeGenSPIRV/texture.samplecmp.compute.linear.hlsl
index a5fca1ada7..367a84db45 100644
--- a/tools/clang/test/CodeGenSPIRV/texture.samplecmp.compute.linear.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/texture.samplecmp.compute.linear.hlsl
@@ -1,4 +1,5 @@
 // RUN: %dxc -T cs_6_6 -E main -fspv-extension=SPV_NV_compute_shader_derivatives -fcgl  %s -spirv  2>&1 | FileCheck %s
+// RUN: %dxc -T cs_6_6 -E main -fspv-extension=SPV_NV_compute_shader_derivatives %s -spirv  2>&1 | FileCheck %s
 
 // CHECK: OpCapability ComputeDerivativeGroupLinearKHR
 // CHECK: OpExtension "SPV_NV_compute_shader_derivatives"
@@ -18,4 +19,4 @@ void main(uint3 id : SV_GroupThreadID)
     // CHECK-NEXT: [[sampledImg:%[0-9]+]] = OpSampledImage %type_sampled_image [[t1]] [[scs]]
     // CHECK-NEXT:            {{%[0-9]+}} = OpImageSampleDrefImplicitLod %float [[sampledImg]] %float_1 %float_0_5
     o[0] = t1.SampleCmp(scs, 1, 0.5);
-}
\ No newline at end of file
+}
diff --git a/tools/clang/test/CodeGenSPIRV/texture.samplecmp.compute.quad.hlsl b/tools/clang/test/CodeGenSPIRV/texture.samplecmp.compute.quad.hlsl
index 550c80f413..96b0c356e5 100644
--- a/tools/clang/test/CodeGenSPIRV/texture.samplecmp.compute.quad.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/texture.samplecmp.compute.quad.hlsl
@@ -1,4 +1,5 @@
 // RUN: %dxc -T cs_6_6 -E main -fspv-extension=SPV_NV_compute_shader_derivatives -fcgl  %s -spirv  2>&1 | FileCheck %s
+// RUN: %dxc -T cs_6_6 -E main -fspv-extension=SPV_NV_compute_shader_derivatives %s -spirv  2>&1 | FileCheck %s
 
 // CHECK: OpCapability ComputeDerivativeGroupQuadsKHR
 // CHECK: OpExtension "SPV_NV_compute_shader_derivatives"
@@ -18,4 +19,4 @@ void main(uint3 id : SV_GroupThreadID)
     // CHECK-NEXT: [[sampledImg:%[0-9]+]] = OpSampledImage %type_sampled_image [[t1]] [[scs]]
     // CHECK-NEXT:            {{%[0-9]+}} = OpImageSampleDrefImplicitLod %float [[sampledImg]] %float_1 %float_0_5
     o[0] = t1.SampleCmp(scs, 1, 0.5);
-}
\ No newline at end of file
+}
diff --git a/tools/clang/test/CodeGenSPIRV/type.rwstructured-buffer.array.counter.hlsl b/tools/clang/test/CodeGenSPIRV/type.rwstructured-buffer.array.counter.hlsl
index cdf7991a12..4e56f62c1c 100644
--- a/tools/clang/test/CodeGenSPIRV/type.rwstructured-buffer.array.counter.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/type.rwstructured-buffer.array.counter.hlsl
@@ -7,8 +7,12 @@ struct PSInput
 
 // CHECK: OpDecorate %g_rwbuffer DescriptorSet 2
 // CHECK: OpDecorate %g_rwbuffer Binding 0
+// CHECK-NOT: OpDecorate %_arr_type_ACSBuffer_counter_uint_5 ArrayStride
 // CHECK: OpDecorate %counter_var_g_rwbuffer DescriptorSet 2
 // CHECK: OpDecorate %counter_var_g_rwbuffer Binding 1
+// CHECK-NOT: OpDecorate %_arr_type_ACSBuffer_counter_uint_5 ArrayStride
+// CHECK:   OpDecorate %type_ACSBuffer_counter BufferBlock
+// CHECK-NOT: OpDecorate %_runtimearr_type_ACSBuffer_counter ArrayStride
 
 // CHECK: %g_rwbuffer = OpVariable %_ptr_Uniform__arr_type_RWStructuredBuffer_uint_uint_5 Uniform
 // CHECK: %counter_var_g_rwbuffer = OpVariable %_ptr_Uniform__arr_type_ACSBuffer_counter_uint_5 Uniform
diff --git a/tools/clang/test/CodeGenSPIRV/type.rwstructured-buffer.array.unbounded.counter.hlsl b/tools/clang/test/CodeGenSPIRV/type.rwstructured-buffer.array.unbounded.counter.hlsl
index 535d79053d..1c0ea8520f 100644
--- a/tools/clang/test/CodeGenSPIRV/type.rwstructured-buffer.array.unbounded.counter.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/type.rwstructured-buffer.array.unbounded.counter.hlsl
@@ -7,8 +7,12 @@ struct PSInput
 
 // CHECK: OpDecorate %g_rwbuffer DescriptorSet 2
 // CHECK: OpDecorate %g_rwbuffer Binding 0
+// CHECK-NOT: OpDecorate %_runtimearr_type_ACSBuffer_counter ArrayStride
 // CHECK: OpDecorate %counter_var_g_rwbuffer DescriptorSet 2
 // CHECK: OpDecorate %counter_var_g_rwbuffer Binding 1
+// CHECK-NOT: OpDecorate %_runtimearr_type_ACSBuffer_counter ArrayStride
+// CHECK:   OpDecorate %type_ACSBuffer_counter BufferBlock
+// CHECK-NOT: OpDecorate %_runtimearr_type_ACSBuffer_counter ArrayStride
 
 // CHECK: %g_rwbuffer = OpVariable %_ptr_Uniform__runtimearr_type_RWStructuredBuffer_uint Uniform
 // CHECK: %counter_var_g_rwbuffer = OpVariable %_ptr_Uniform__runtimearr_type_ACSBuffer_counter Uniform
diff --git a/tools/clang/test/CodeGenSPIRV/vk.attribute.image-format.hlsl b/tools/clang/test/CodeGenSPIRV/vk.attribute.image-format.hlsl
index 4d10dc446b..12b03fffda 100644
--- a/tools/clang/test/CodeGenSPIRV/vk.attribute.image-format.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/vk.attribute.image-format.hlsl
@@ -60,6 +60,12 @@ RWBuffer<int64_t> Buf_r64i;
 [[vk::image_format("r64ui")]]
 RWBuffer<uint64_t> Buf_r64ui;
 
+[[vk::image_format("r16f")]]
+// CHECK: [[ImgType:%[0-9a-zA-Z_]+]] = OpTypeImage %float 2D 2 0 0 2 R16f
+// CHECK: [[ArrayType:%[0-9a-zA-Z_]+]] = OpTypeRuntimeArray [[ImgType]]
+// CHECK: [[PtrType:%[0-9a-zA-Z_]+]] = OpTypePointer UniformConstant [[ArrayType]]
+RWTexture2D<float> Buf_r16f_bindless[];
+
 struct S {
     RWBuffer<float4> b;
 };
diff --git a/tools/clang/test/HLSLFileCheck/samples/d3d11/FluidCS11_BuildGridCS.hlsl b/tools/clang/test/HLSLFileCheck/samples/d3d11/FluidCS11_BuildGridCS.hlsl
index ef950c28fe..81809fb948 100644
--- a/tools/clang/test/HLSLFileCheck/samples/d3d11/FluidCS11_BuildGridCS.hlsl
+++ b/tools/clang/test/HLSLFileCheck/samples/d3d11/FluidCS11_BuildGridCS.hlsl
@@ -4,7 +4,7 @@
 // CHECK: bufferLoad
 // CHECK: FMax
 // CHECK: FMin
-// CHECK: IMad
+// CHECK: UMad
 // CHECK: bufferStore
 
 //--------------------------------------------------------------------------------------
diff --git a/tools/clang/tools/dxclib/dxc.cpp b/tools/clang/tools/dxclib/dxc.cpp
index 1bcf5d8e3f..cdcfe2b3f6 100644
--- a/tools/clang/tools/dxclib/dxc.cpp
+++ b/tools/clang/tools/dxclib/dxc.cpp
@@ -644,7 +644,7 @@ int DxcContext::VerifyRootSignature() {
   IFT(pContainerBuilder->AddPart(hlsl::DxilFourCC::DFCC_RootSignature,
                                  pRootSignature));
   CComPtr<IDxcOperationResult> pOperationResult;
-  pContainerBuilder->SerializeContainer(&pOperationResult);
+  IFT(pContainerBuilder->SerializeContainer(&pOperationResult));
   HRESULT status = E_FAIL;
   CComPtr<IDxcBlob> pResult;
   IFT(pOperationResult->GetStatus(&status));
diff --git a/tools/clang/tools/dxcvalidator/dxcvalidator.cpp b/tools/clang/tools/dxcvalidator/dxcvalidator.cpp
index b8b71ece62..60ad35036f 100644
--- a/tools/clang/tools/dxcvalidator/dxcvalidator.cpp
+++ b/tools/clang/tools/dxcvalidator/dxcvalidator.cpp
@@ -20,6 +20,7 @@
 #include "dxc/dxcapi.h"
 #include "dxcvalidator.h"
 
+#include "dxc/DXIL/DxilShaderModel.h"
 #include "dxc/DxilRootSignature/DxilRootSignature.h"
 #include "dxc/Support/FileIOHelper.h"
 #include "dxc/Support/Global.h"
@@ -32,7 +33,13 @@
 using namespace llvm;
 using namespace hlsl;
 
-static void HashAndUpdate(DxilContainerHeader *Container) {
+static void HashAndUpdate(DxilContainerHeader *Container, bool isPreRelease) {
+  if (isPreRelease) {
+    // Pre-release shader model: store the fixed preview bypass hash instead.
+    memcpy(Container->Hash.Digest, PreviewByPassHash.Digest,
+           sizeof(PreviewByPassHash.Digest));
+    return;
+  }
   // Compute hash and update stored hash.
   // Hash the container from this offset to the end.
   static const uint32_t DXBCHashStartOffset =
@@ -45,8 +52,26 @@ static void HashAndUpdate(DxilContainerHeader *Container) {
 
 static void HashAndUpdateOrCopy(uint32_t Flags, IDxcBlob *Shader,
                                 IDxcBlob **Hashed) {
+  bool isPreRelease = false;
+  const DxilContainerHeader *DxilContainer =
+      IsDxilContainerLike(Shader->GetBufferPointer(), Shader->GetBufferSize());
+  if (!DxilContainer)
+    return;
+
+  const DxilProgramHeader *ProgramHeader =
+      GetDxilProgramHeader(DxilContainer, DFCC_DXIL);
+
+  // ProgramHeader may be null here, when hashing a root signature container
+  if (ProgramHeader) {
+    int PV = ProgramHeader->ProgramVersion;
+    int major = (PV >> 4) & 0xF; // Extract the major version (next 4 bits)
+    int minor = PV & 0xF;        // Extract the minor version (lowest 4 bits)
+    isPreRelease = ShaderModel::IsPreReleaseShaderModel(major, minor);
+  }
+
   if (Flags & DxcValidatorFlags_InPlaceEdit) {
-    HashAndUpdate((DxilContainerHeader *)Shader->GetBufferPointer());
+    HashAndUpdate((DxilContainerHeader *)Shader->GetBufferPointer(),
+                  isPreRelease);
     *Hashed = Shader;
     Shader->AddRef();
   } else {
@@ -55,7 +80,8 @@ static void HashAndUpdateOrCopy(uint32_t Flags, IDxcBlob *Shader,
     unsigned long CB;
     IFT(HashedBlobStream->Write(Shader->GetBufferPointer(),
                                 Shader->GetBufferSize(), &CB));
-    HashAndUpdate((DxilContainerHeader *)HashedBlobStream->GetPtr());
+    HashAndUpdate((DxilContainerHeader *)HashedBlobStream->GetPtr(),
+                  isPreRelease);
     IFT(HashedBlobStream.QueryInterface(Hashed));
   }
 }
diff --git a/tools/clang/unittests/HLSL/ValidationTest.cpp b/tools/clang/unittests/HLSL/ValidationTest.cpp
index 08f67f35d0..19696de022 100644
--- a/tools/clang/unittests/HLSL/ValidationTest.cpp
+++ b/tools/clang/unittests/HLSL/ValidationTest.cpp
@@ -31,6 +31,7 @@
 #include "dxc/Support/FileIOHelper.h"
 #include "dxc/Support/Global.h"
 
+#include "dxc/DXIL/DxilShaderModel.h"
 #include "dxc/Test/DxcTestUtils.h"
 #include "dxc/Test/HlslTestUtils.h"
 
@@ -205,6 +206,7 @@ class ValidationTest : public ::testing::Test {
   TEST_METHOD(SimpleGs1Fail)
   TEST_METHOD(UavBarrierFail)
   TEST_METHOD(UndefValueFail)
+  TEST_METHOD(ValidationFailNoHash)
   TEST_METHOD(UpdateCounterFail)
   TEST_METHOD(LocalResCopy)
   TEST_METHOD(ResCounter)
@@ -300,6 +302,8 @@ class ValidationTest : public ::testing::Test {
 
   TEST_METHOD(ValidateWithHash)
   TEST_METHOD(ValidateVersionNotAllowed)
+  TEST_METHOD(ValidatePreviewBypassHash)
+  TEST_METHOD(ValidateProgramVersionAgainstDxilModule)
   TEST_METHOD(CreateHandleNotAllowedSM66)
 
   TEST_METHOD(AtomicsConsts)
@@ -537,18 +541,10 @@ class ValidationTest : public ::testing::Test {
                             pLookFors, pReplacements, pErrorMsgs, bRegex);
   }
 
-  bool RewriteAssemblyToText(IDxcBlobEncoding *pSource, LPCSTR pShaderModel,
-                             LPCWSTR *pArguments, UINT32 argCount,
-                             const DxcDefine *pDefines, UINT32 defineCount,
-                             llvm::ArrayRef<LPCSTR> pLookFors,
-                             llvm::ArrayRef<LPCSTR> pReplacements,
-                             IDxcBlob **pBlob, bool bRegex = false) {
-    CComPtr<IDxcBlob> pProgram;
-    std::string disassembly;
-    if (!CompileSource(pSource, pShaderModel, pArguments, argCount, pDefines,
-                       defineCount, &pProgram))
-      return false;
-    DisassembleProgram(pProgram, &disassembly);
+  void PerformReplacementOnDisassembly(std::string disassembly,
+                                       llvm::ArrayRef<LPCSTR> pLookFors,
+                                       llvm::ArrayRef<LPCSTR> pReplacements,
+                                       IDxcBlob **pBlob, bool bRegex = false) {
     for (unsigned i = 0; i < pLookFors.size(); ++i) {
       LPCSTR pLookFor = pLookFors[i];
       bool bOptional = false;
@@ -605,6 +601,22 @@ class ValidationTest : public ::testing::Test {
       }
     }
     Utf8ToBlob(m_dllSupport, disassembly.c_str(), pBlob);
+  }
+
+  bool RewriteAssemblyToText(IDxcBlobEncoding *pSource, LPCSTR pShaderModel,
+                             LPCWSTR *pArguments, UINT32 argCount,
+                             const DxcDefine *pDefines, UINT32 defineCount,
+                             llvm::ArrayRef<LPCSTR> pLookFors,
+                             llvm::ArrayRef<LPCSTR> pReplacements,
+                             IDxcBlob **pBlob, bool bRegex = false) {
+    CComPtr<IDxcBlob> pProgram;
+    std::string disassembly;
+    if (!CompileSource(pSource, pShaderModel, pArguments, argCount, pDefines,
+                       defineCount, &pProgram))
+      return false;
+    DisassembleProgram(pProgram, &disassembly);
+    PerformReplacementOnDisassembly(disassembly, pLookFors, pReplacements,
+                                    pBlob, bRegex);
     return true;
   }
 
@@ -1178,6 +1190,60 @@ TEST_F(ValidationTest, UavBarrierFail) {
 TEST_F(ValidationTest, UndefValueFail) {
   TestCheck(L"..\\CodeGenHLSL\\UndefValue.hlsl");
 }
+// Verify that a container which fails DXIL validation keeps an all-zero
+// header digest, i.e. no hash is written for it.
+TEST_F(ValidationTest, ValidationFailNoHash) {
+  if (m_ver.SkipDxilVersion(1, 8))
+    return;
+  CComPtr<IDxcBlob> pProgram;
+
+  // We need any shader that will pass compilation but fail validation.
+  // This shader reads from uninitialized 'float a', which works for now.
+  LPCSTR pSource = R"(
+    float main(snorm float b : B) : SV_DEPTH
+    {
+        float a;
+        return b + a;
+    }
+)";
+
+  CComPtr<IDxcBlobEncoding> pSourceBlob;
+  Utf8ToBlob(m_dllSupport, pSource, &pSourceBlob);
+  std::vector<LPCWSTR> pArguments = {L"-Vd"};
+  LPCSTR pShaderModel = "ps_6_0";
+  bool result = CompileSource(pSourceBlob, pShaderModel, pArguments.data(), 1,
+                              nullptr, 0, &pProgram);
+
+  VERIFY_IS_TRUE(result);
+
+  CComPtr<IDxcValidator> pValidator;
+  CComPtr<IDxcOperationResult> pResult;
+  unsigned Flags = 0;
+  VERIFY_SUCCEEDED(
+      m_dllSupport.CreateInstance(CLSID_DxcValidator, &pValidator));
+
+  VERIFY_SUCCEEDED(pValidator->Validate(pProgram, Flags, &pResult));
+  HRESULT status;
+  VERIFY_IS_NOT_NULL(pResult);
+  CComPtr<IDxcBlob> pValidationOutput;
+  VERIFY_SUCCEEDED(pResult->GetStatus(&status));
+
+  // Expect validation to fail.
+  VERIFY_FAILED(status);
+  pResult->GetResult(&pValidationOutput);
+  // Make sure the validation output is not null even when validation fails
+  VERIFY_IS_NOT_NULL(pValidationOutput);
+
+  hlsl::DxilContainerHeader *pHeader = IsDxilContainerLike(
+      pProgram->GetBufferPointer(), pProgram->GetBufferSize());
+  VERIFY_IS_NOT_NULL(pHeader);
+
+  BYTE ZeroHash[DxilContainerHashSize] = {0, 0, 0, 0, 0, 0, 0, 0,
+                                          0, 0, 0, 0, 0, 0, 0, 0};
+
+  // The digest must still be all zeros: no hash is written on failure.
+  VERIFY_ARE_EQUAL(memcmp(ZeroHash, pHeader->Hash.Digest, sizeof(ZeroHash)), 0);
+}
 TEST_F(ValidationTest, UpdateCounterFail) {
   if (m_ver.SkipIRSensitiveTest())
     return;
@@ -4114,7 +4180,7 @@ TEST_F(ValidationTest, ValidatePrintfNotAllowed) {
 }
 
 TEST_F(ValidationTest, ValidateWithHash) {
-  if (m_ver.SkipDxilVersion(1, 8))
+  if (m_ver.SkipDxilVersion(1, ShaderModel::kHighestReleasedMinor))
     return;
   CComPtr<IDxcBlob> pProgram;
   CompileSource("float4 main(float a:A, float b:B) : SV_Target { return 1; }",
@@ -4149,6 +4215,113 @@ TEST_F(ValidationTest, ValidateWithHash) {
   VERIFY_ARE_EQUAL(memcmp(Result, pHeader->Hash.Digest, sizeof(Result)), 0);
 }
 
+TEST_F(ValidationTest, ValidatePreviewBypassHash) {
+  if (m_ver.SkipDxilVersion(1, ShaderModel::kHighestMinor))
+    return;
+  // Nothing to test unless the highest shader model is newer than the released
+  if (DXIL::CompareVersions(ShaderModel::kHighestMajor,
+                            ShaderModel::kHighestMinor,
+                            ShaderModel::kHighestReleasedMajor,
+                            ShaderModel::kHighestReleasedMinor) <= 0) {
+    return;
+  }
+
+  // Compile a trivial pixel shader against the highest (pre-release) model.
+  CComPtr<IDxcBlob> pProgram;
+  LPCSTR pSource =
+      R"(float4 main(float a:A, float b:B) : SV_Target { return 1; })";
+
+  CComPtr<IDxcBlobEncoding> pSourceBlob;
+  Utf8ToBlob(m_dllSupport, pSource, &pSourceBlob);
+
+  LPCSTR pShaderModel =
+      ShaderModel::Get(ShaderModel::Kind::Pixel, ShaderModel::kHighestMajor,
+                       ShaderModel::kHighestMinor)
+          ->GetName();
+
+  bool result = CompileSource(pSourceBlob, pShaderModel, nullptr, 0, nullptr, 0,
+                              &pProgram);
+  VERIFY_IS_TRUE(result);
+
+  hlsl::DxilContainerHeader *pHeader = IsDxilContainerLike(
+      pProgram->GetBufferPointer(), pProgram->GetBufferSize());
+  VERIFY_IS_NOT_NULL(pHeader);
+  // Should be equal, this proves the hash is set to the preview bypass hash
+  // when a prerelease version is used
+  VERIFY_ARE_EQUAL(memcmp(&hlsl::PreviewByPassHash, pHeader->Hash.Digest,
+                          sizeof(hlsl::PreviewByPassHash)),
+                   0);
+}
+
+TEST_F(ValidationTest, ValidateProgramVersionAgainstDxilModule) {
+  if (m_ver.SkipDxilVersion(1, 8))
+    return;
+
+  CComPtr<IDxcBlob> pProgram;
+  LPCSTR pSource =
+      R"(float4 main(float a:A, float b:B) : SV_Target { return 1; })";
+
+  CComPtr<IDxcBlobEncoding> pSourceBlob;
+  Utf8ToBlob(m_dllSupport, pSource, &pSourceBlob);
+
+  LPCSTR pShaderModel =
+      ShaderModel::Get(ShaderModel::Kind::Pixel, 6, 0)->GetName();
+
+  bool result = CompileSource(pSourceBlob, pShaderModel, nullptr, 0, nullptr, 0,
+                              &pProgram);
+  VERIFY_IS_TRUE(result);
+
+  hlsl::DxilContainerHeader *pHeader =
+      (hlsl::DxilContainerHeader *)pProgram->GetBufferPointer();
+  // test that when the program version differs from the dxil module shader
+  // model version, the validator fails
+  DxilPartHeader *pPart = GetDxilPartByType(pHeader, DxilFourCC::DFCC_DXIL);
+  VERIFY_IS_NOT_NULL(pPart); // check the part before deriving pointers from it
+  DxilProgramHeader *pMutableProgramHeader =
+      reinterpret_cast<DxilProgramHeader *>(GetDxilPartData(pPart));
+  int oldMajor = 0;
+  int oldMinor = 0;
+  int newMajor = 0;
+  int newMinor = 0;
+  VERIFY_IS_NOT_NULL(pMutableProgramHeader);
+  uint32_t &PV = pMutableProgramHeader->ProgramVersion;
+  oldMajor = (PV >> 4) & 0xF; // Extract the major version (bits 4-7)
+  oldMinor = PV & 0xF;        // Extract the minor version (bits 0-3)
+
+  // Bump the minor version nibble, which is 0 because the shader was compiled
+  // as shader model 6.0. The container then claims 6.1 while the DXIL module
+  // still says 6.0, and validation is expected to reject the mismatch.
+  PV += 1;
+
+  newMajor = (PV >> 4) & 0xF; // Extract the new major version (bits 4-7)
+  newMinor = PV & 0xF;        // Extract the new minor version (bits 0-3)
+
+  // now test that the validation fails
+  CComPtr<IDxcValidator> pValidator;
+  CComPtr<IDxcOperationResult> pResult;
+  unsigned Flags = 0;
+  VERIFY_SUCCEEDED(
+      m_dllSupport.CreateInstance(CLSID_DxcValidator, &pValidator));
+
+  HRESULT status;
+  VERIFY_SUCCEEDED(pValidator->Validate(pProgram, Flags, &pResult));
+  VERIFY_IS_NOT_NULL(pResult);
+  pResult->GetStatus(&status);
+
+  // expect validation to fail
+  VERIFY_FAILED(status);
+  // validation succeeded prior, so by inference we know that oldMajor /
+  // oldMinor were the old dxil module shader model versions
+  char buffer[100];
+  std::snprintf(buffer, sizeof(buffer),
+                "error: Program Version is %d.%d but Dxil Module shader model "
+                "version is %d.%d.\nValidation failed.\n",
+                newMajor, newMinor, oldMajor, oldMinor);
+  // the validator is expected to report this version-mismatch message
+
+  CheckOperationResultMsgs(pResult, {buffer}, false, false);
+}
+
 TEST_F(ValidationTest, ValidateVersionNotAllowed) {
   if (m_ver.SkipDxilVersion(1, 6))
     return;
diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index f22e99e467..7066247883 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -819,6 +819,15 @@ class ExecutionTest {
 
         return false;
       }
+
+      if (GetModuleHandle("d3d10warp.dll") != NULL) {
+        CHAR szFullModuleFilePath[MAX_PATH] = "";
+        GetModuleFileName(GetModuleHandle("d3d10warp.dll"),
+                          szFullModuleFilePath, sizeof(szFullModuleFilePath));
+        WEX::Logging::Log::Comment(WEX::Common::String().Format(
+            L"WARP driver loaded from: %S", szFullModuleFilePath));
+      }
+
     } else {
       CComPtr<IDXGIAdapter1> hardwareAdapter;
       WEX::Common::String AdapterValue;
@@ -6871,22 +6880,6 @@ ToleranceType ToleranceStringToEnum(LPCWSTR toleranceType) {
   }
 }
 
-static bool CompareOutputWithExpectedValueFloat(
-    float output, float ref, ToleranceType type, double tolerance,
-    hlsl::DXIL::Float32DenormMode mode = hlsl::DXIL::Float32DenormMode::Any) {
-  if (type == ToleranceType::RELATIVE_EPSILON) {
-    return CompareFloatRelativeEpsilon(output, ref, (int)tolerance, mode);
-  } else if (type == ToleranceType::EPSILON) {
-    return CompareFloatEpsilon(output, ref, (float)tolerance, mode);
-  } else if (type == ToleranceType::ULP) {
-    return CompareFloatULP(output, ref, (int)tolerance, mode);
-  } else {
-    LogErrorFmt(L"Failed to read comparison type %S", type);
-  }
-
-  return false;
-}
-
 static bool CompareOutputWithExpectedValueFloat(
     float output, float ref, LPCWSTR type, double tolerance,
     hlsl::DXIL::Float32DenormMode mode = hlsl::DXIL::Float32DenormMode::Any) {
@@ -6910,21 +6903,6 @@ static bool VerifyOutputWithExpectedValueFloat(
       CompareOutputWithExpectedValueFloat(output, ref, type, tolerance, mode));
 }
 
-static bool CompareOutputWithExpectedValueHalf(uint16_t output, uint16_t ref,
-                                               ToleranceType type,
-                                               double tolerance) {
-  if (type == ToleranceType::RELATIVE_EPSILON) {
-    return CompareHalfRelativeEpsilon(output, ref, (int)tolerance);
-  } else if (type == ToleranceType::EPSILON) {
-    return CompareHalfEpsilon(output, ref, (float)tolerance);
-  } else if (type == ToleranceType::ULP) {
-    return CompareHalfULP(output, ref, (float)tolerance);
-  } else {
-    LogErrorFmt(L"Failed to read comparison type %S", type);
-    return false;
-  }
-}
-
 static bool CompareOutputWithExpectedValueHalf(uint16_t output, uint16_t ref,
                                                LPCWSTR type, double tolerance) {
   if (_wcsicmp(type, L"Relative") == 0) {
@@ -6945,29 +6923,6 @@ static bool VerifyOutputWithExpectedValueHalf(uint16_t output, uint16_t ref,
       CompareOutputWithExpectedValueHalf(output, ref, type, tolerance));
 }
 
-template <typename T>
-static bool CompareOutputWithExpectedValue(T output, T ref,
-                                           ToleranceType toleranceType,
-                                           double tolerance) {
-  if (std::is_same<T, DirectX::PackedVector::HALF>::value) { // uint16 treated
-                                                             // as half
-    return CompareOutputWithExpectedValueHalf((uint16_t)output, (uint16_t)ref,
-                                              toleranceType, tolerance);
-  } else if (std::is_integral<T>::value &&
-             std::is_signed<T>::value) { // signed ints
-    return CompareOutputWithExpectedValueInt((int)output, (int)ref,
-                                             (int)tolerance);
-  } else if (std::is_integral<T>::value) { // unsigned ints
-    return CompareOutputWithExpectedValueUInt((uint32_t)output, (uint32_t)ref,
-                                              (uint32_t)tolerance);
-  } else if (std::is_floating_point<T>::value) { // floating point
-    return CompareOutputWithExpectedValueFloat((float)output, (float)ref,
-                                               toleranceType, tolerance);
-  }
-
-  DXASSERT_NOMSG("Invalid Parameter Type");
-}
-
 template <typename T>
 static bool CompareOutputWithExpectedValue(T output, T ref,
                                            LPCWSTR toleranceType,
diff --git a/utils/git/requirements_formatting.txt b/utils/git/requirements_formatting.txt
index 546662aa33..06db8176c9 100644
--- a/utils/git/requirements_formatting.txt
+++ b/utils/git/requirements_formatting.txt
@@ -18,7 +18,7 @@ charset-normalizer==3.2.0
     # via requests
 click==8.1.7
     # via black
-cryptography==42.0.4
+cryptography==43.0.1
     # via pyjwt
 darker==1.7.2
     # via -r llvm/utils/git/requirements_formatting.txt.in
diff --git a/utils/hct/gen_intrin_main.txt b/utils/hct/gen_intrin_main.txt
index 40b5f6d96a..7f7637b230 100644
--- a/utils/hct/gen_intrin_main.txt
+++ b/utils/hct/gen_intrin_main.txt
@@ -124,7 +124,7 @@ $match<0, 1> float_like [[rn]] determinant(in float_like<r, r> x);
 void [[]] DeviceMemoryBarrier() : syncdevicememory_ug;
 void [[]] DeviceMemoryBarrierWithGroupSync() : syncgroupanddevicememory_ug;
 $match<0, 1> float_like [[rn]] distance(in float_like<c> a, in $type1 b);
-$match<0, 1> numeric [[rn]] dot(in numeric<c> a, in $type1 b);
+$match<0, 1> numeric [[rn,unsigned_op=udot]] dot(in numeric<c> a, in $type1 b);
 $type1 [[rn]] dst(in numeric<4> a, in $type1 b);
 // void errorf(in string Format, ...);
 $type1 [[rn]] EvaluateAttributeAtSample(in numeric<> value, in uint index);
@@ -198,13 +198,13 @@ $type1 [[rn,unsigned_op=umax]] max(in numeric<> a, in $type1 b);
 $type1 [[rn,unsigned_op=umin]] min(in numeric<> a, in $type1 b);
 $type1 [[]] modf(in float_like<> x, out $type1 ip);
 uint<4> [[rn]] msad4(in uint reference, in uint<2> source, in uint<4> accum);
-numeric [[rn]] mul(in $match<1, 0> numeric a, in $match<2, 0> numeric b) : mul_ss;
-numeric<c2> [[rn]] mul(in $match<1, 0> numeric a, in $match<2, 0> numeric<c2> b) : mul_sv;
-numeric<r2, c2> [[rn]] mul(in $match<1, 0> numeric a, in $match<2, 0> numeric<r2, c2> b) : mul_sm;
-numeric<c> [[rn]] mul(in $match<1, 0> numeric<c> a, in $match<2, 0> numeric b) : mul_vs;
-numeric [[rn]] mul(in $match<1, 0> numeric<c> a, in $match<2, 0> numeric<c> b) : mul_vv;
+numeric [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric a, in $match<2, 0> numeric b) : mul_ss;
+numeric<c2> [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric a, in $match<2, 0> numeric<c2> b) : mul_sv;
+numeric<r2, c2> [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric a, in $match<2, 0> numeric<r2, c2> b) : mul_sm;
+numeric<c> [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric<c> a, in $match<2, 0> numeric b) : mul_vs;
+numeric [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric<c> a, in $match<2, 0> numeric<c> b) : mul_vv;
 numeric<c2> [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric<c> a, in col_major $match<2, 0> numeric<c, c2> b) : mul_vm;
-numeric<r, c> [[rn]] mul(in $match<1, 0> numeric<r, c> a, in $match<2, 0> numeric b) : mul_ms;
+numeric<r, c> [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric<r, c> a, in $match<2, 0> numeric b) : mul_ms;
 numeric<r> [[rn,unsigned_op=umul]] mul(in row_major $match<1, 0> numeric<r, c> a, in $match<2, 0> numeric<c> b) : mul_mv;
 numeric<r, c2> [[rn,unsigned_op=umul]] mul(in row_major $match<1, 0> numeric<r, c> a, in col_major $match<2, 0> numeric<c, c2> b) : mul_mm;
 $type1 [[rn]] normalize(in float_like<c> x);
@@ -369,8 +369,8 @@ resource [[hidden]] CreateResourceFromHeap(in uint index);
 // Replacement for vector logical &&, ||, and ternary conditional operators,
 // For use when HLSL changes to support short-circuiting and only scalar
 // conditions to maintain clarity.
-$match<1, 0> bool<> [[rn]] and(in any<> x, in $type1 y);
-$match<1, 0> bool<> [[rn]] or(in any<> x, in $type1 y);
+$match<1, 0> bool<> [[rn]] and(in bool<> x, in $type1 y);
+$match<1, 0> bool<> [[rn]] or(in bool<> x, in $type1 y);
 $type2 [[rn]] select(in bool<> cond, in $match<1, 2> any<> t, in $type2 f);
 $type2 [[rn]] select(in bool cond, in any_sampler t, in $type2 f);
 
diff --git a/utils/hct/hctbuild.cmd b/utils/hct/hctbuild.cmd
index 19e32c4731..ff6dbfa22a 100644
--- a/utils/hct/hctbuild.cmd
+++ b/utils/hct/hctbuild.cmd
@@ -46,6 +46,7 @@ set WINSDK_MIN_VERSION=10.0.17763.0
 set INSTALL_DIR=
 set DEFAULT_EXEC_ADAPTER=-DTAEF_EXEC_ADAPTER=
 set LIT_ARGS=
+set FRESH=
 
 :parse_args
 if "%1"=="" (
@@ -208,6 +209,10 @@ if "%1"=="-sanitizer" (
   set CMAKE_OPTS=%CMAKE_OPTS% -DLLVM_USE_SANITIZER:STRING=Address
   shift /1 & goto :parse_args
 )
+if "%1"=="-fresh" (
+  set FRESH="--fresh"
+  shift /1 & goto :parse_args
+)
 
 
 rem Begin SPIRV change
@@ -468,13 +473,13 @@ cd /d %3
 if "%DO_SETUP%"=="1" (
   echo Creating solution files for %2, logging to %3\cmake-log.txt
   if "%BUILD_GENERATOR%"=="Ninja" (
-    echo Running "%CMAKE_PATH%" -DCMAKE_BUILD_TYPE:STRING=%1 %CMAKE_OPTS% -G %4 %HLSL_SRC_DIR% > %3\cmake-log.txt
-    "%CMAKE_PATH%" -DCMAKE_BUILD_TYPE:STRING=%1 %CMAKE_OPTS% -G %4 %HLSL_SRC_DIR% >> %3\cmake-log.txt 2>&1
+    echo Running "%CMAKE_PATH%" %FRESH% -DCMAKE_BUILD_TYPE:STRING=%1 %CMAKE_OPTS% -G %4 %HLSL_SRC_DIR% > %3\cmake-log.txt
+    "%CMAKE_PATH%" %FRESH% -DCMAKE_BUILD_TYPE:STRING=%1 %CMAKE_OPTS% -G %4 %HLSL_SRC_DIR% >> %3\cmake-log.txt 2>&1
   ) else (
     rem BUILD_TYPE is mostly ignored in this path as VS generates multiple targets
     rem it is still needed to satisfy cmake file expectations
-    echo Running "%CMAKE_PATH%" -DCMAKE_BUILD_TYPE:STRING=%1  %CMAKE_OPTS% -G %4 %5 %HLSL_SRC_DIR% > %3\cmake-log.txt
-    "%CMAKE_PATH%" -DCMAKE_BUILD_TYPE:STRING=%1 %CMAKE_OPTS% -G %4 %5 %HLSL_SRC_DIR% >> %3\cmake-log.txt 2>&1
+    echo Running "%CMAKE_PATH%" %FRESH% -DCMAKE_BUILD_TYPE:STRING=%1  %CMAKE_OPTS% -G %4 %5 %HLSL_SRC_DIR% > %3\cmake-log.txt
+    "%CMAKE_PATH%" %FRESH% -DCMAKE_BUILD_TYPE:STRING=%1 %CMAKE_OPTS% -G %4 %5 %HLSL_SRC_DIR% >> %3\cmake-log.txt 2>&1
   )
   if %SHOW_CMAKE_LOG%==1 (
     echo ------- Start of %3\cmake-log.txt -------
diff --git a/utils/hct/hctdb.py b/utils/hct/hctdb.py
index 19220d6d1a..2f632aceee 100644
--- a/utils/hct/hctdb.py
+++ b/utils/hct/hctdb.py
@@ -7500,6 +7500,11 @@ def build_valrules(self):
             "Target shader model requires specific Dxil Version",
             "Shader model requires Dxil Version %0.%1.",
         )
+        self.add_valrule_msg(
+            "Sm.ProgramVersion",
+            "Program Version in Dxil Container does not match Dxil Module shader model version",
+            "Program Version is %0.%1 but Dxil Module shader model version is %2.%3.",
+        )
         self.add_valrule_msg(
             "Sm.Opcode",
             "Opcode must be defined in target shader model",
diff --git a/utils/hct/hctdb_instrhelp.py b/utils/hct/hctdb_instrhelp.py
index 919610e420..17eefd4918 100644
--- a/utils/hct/hctdb_instrhelp.py
+++ b/utils/hct/hctdb_instrhelp.py
@@ -4,6 +4,8 @@
 import functools
 import collections
 from hctdb import *
+import json
+import os
 
 # get db singletons
 g_db_dxil = None
@@ -1536,10 +1538,20 @@ def get_interpretation_table():
     return run_with_stdout(lambda: gen.print_interpretation_table())
 
 
+# highest minor is different than highest released minor,
+# since there can be pre-release versions that are higher
+# than the last released version
 highest_major = 6
-highest_minor = 8
+highest_minor = 9
 highest_shader_models = {4: 1, 5: 1, 6: highest_minor}
 
+# fetch the last released version from utils/version/latest-release.json
+json_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+json_path = os.path.join(json_path, "version", "latest-release.json")
+with open(json_path, "r") as file:
+    json_data = json.load(file)
+highest_released_minor = int(json_data["version"]["minor"])
+
 
 def getShaderModels():
     shader_models = []
@@ -1550,6 +1562,14 @@ def getShaderModels():
     return shader_models
 
 
+def get_highest_released_shader_model():
+    """Return C++ declarations of the highest released shader model version."""
+    return (
+        "static const unsigned kHighestReleasedMajor = %d;\n"
+        "static const unsigned kHighestReleasedMinor = %d;"
+        % (highest_major, highest_released_minor)
+    )
+
 def get_highest_shader_model():
     result = """static const unsigned kHighestMajor = %d;
 static const unsigned kHighestMinor = %d;""" % (
@@ -1558,7 +1578,6 @@ def get_highest_shader_model():
     )
     return result
 
-
 def get_dxil_version_minor():
     return "const unsigned kDxilMinor = %d;" % highest_minor
 
diff --git a/utils/hct/hcttest.cmd b/utils/hct/hcttest.cmd
index 1810efcee4..3e536d199f 100644
--- a/utils/hct/hcttest.cmd
+++ b/utils/hct/hcttest.cmd
@@ -37,6 +37,8 @@ set TEST_MANUAL_FILE_CHECK=0
 set SINGLE_FILE_CHECK_NAME=0
 set CUSTOM_BIN_SET=
 set USE_AGILITY_SDK=
+set USE_WARP_FROM_NUGET=
+set EXEC_TEST_TARGET="check-clang-taef-exec"
 
 rem Begin SPIRV change
 set TEST_SPIRV=0
@@ -132,6 +134,22 @@ if "%1"=="-clean" (
   set TEST_ALL=0
   set TEST_EXEC=1
   set TEST_EXEC_REQUIRED=1
+) else if "%1"=="exec-warp" (
+  rem If exec-warp is explicitly supplied, hcttest will fail if machine is not configured
+  rem to run execution tests, otherwise, execution tests would be skipped.
+  set TEST_ALL=0
+  set TEST_EXEC=1
+  set USE_WARP_FROM_NUGET=LATEST_RELEASE
+  set TEST_EXEC_REQUIRED=1
+  set EXEC_TEST_TARGET="check-clang-taef-exec-warp"
+) else if "%1"=="exec-warp-preview" (
+  rem If exec-warp-preview is explicitly supplied, hcttest will fail if machine is not configured
+  rem to run execution tests, otherwise, execution tests would be skipped.
+  set TEST_ALL=0
+  set TEST_EXEC=1
+  set USE_WARP_FROM_NUGET=LATEST_PREVIEW
+  set TEST_EXEC_REQUIRED=1
+  set EXEC_TEST_TARGET="check-clang-taef-exec-warp"
 ) else if "%1"=="exec-filter" (
   set TEST_ALL=0
   set TEST_EXEC=1
@@ -333,21 +351,22 @@ if "%TEST_USE_LIT%"=="1" (
       if defined EXEC_ADAPTER (
         py %HLSL_SRC_DIR%/utils/lit/lit.py -v --no-progress-bar --param build_mode=%BUILD_CONFIG% --param clang_site_config=%HLSL_BLD_DIR%/tools/clang/test/lit.site.cfg --param clang_taef_exec_site_config=%HLSL_BLD_DIR%/tools/clang/test/taef_exec/lit.site.cfg %EXEC_ADAPTER% %HLSL_SRC_DIR%/tools/clang/test/taef_exec
       ) else (
-        cmake --build %HLSL_BLD_DIR% --config %BUILD_CONFIG% --target check-clang-taef-exec
+        cmake --build %HLSL_BLD_DIR% --config %BUILD_CONFIG% --target %EXEC_TEST_TARGET%
 	  )
       set RES_EXEC=!ERRORLEVEL!
     )
   )
-  set TEST_CLANG=0
-  set TEST_DXILCONV=0
-  set TEST_SPIRV=0
-  set TEST_EXEC=0
-  set TEST_CMD=0
 
   rem No other tests to run - skip copying and move on to report the results
   if not exist "%HCT_EXTRAS%\hcttest-extras.cmd" (
     goto :report_results
   )
+
+  set TEST_CLANG=0
+  set TEST_DXILCONV=0
+  set TEST_SPIRV=0
+  set TEST_EXEC=0
+  set TEST_CMD=0
 )
 
 if not exist %TEST_DIR% (mkdir %TEST_DIR%)
diff --git a/utils/version/latest-release.json b/utils/version/latest-release.json
index 04a3881343..3138ccd2b1 100644
--- a/utils/version/latest-release.json
+++ b/utils/version/latest-release.json
@@ -1,8 +1,8 @@
 {
-    "version" : {
-        "major" : "1",
-        "minor" : "8",
-        "rev"   : "2407"
+    "version": {
+        "major": "1",
+        "minor": "8",
+        "rev": "2502"
     },
-    "sha" : "737a12a663f1697d3755a522d8fbf30481ecd2f6"
+    "sha": "070d0d5a2beacef9eeb51037a9b04665716fd6f3"
 }
diff --git a/utils/version/version.inc b/utils/version/version.inc
index 4e5d9f7313..2577daa529 100644
--- a/utils/version/version.inc
+++ b/utils/version/version.inc
@@ -18,7 +18,7 @@
 #ifdef RC_VERSION_FIELD_3
 #undef RC_VERSION_FIELD_3
 #endif
-#define RC_VERSION_FIELD_3 2407
+#define RC_VERSION_FIELD_3 2502
 
 #ifdef RC_VERSION_FIELD_4
 #undef RC_VERSION_FIELD_4
@@ -28,7 +28,7 @@
 #ifdef RC_FILE_VERSION
 #undef RC_FILE_VERSION
 #endif
-#define RC_FILE_VERSION "1.8.2407.0"
+#define RC_FILE_VERSION "1.8.2502.0"
 
 #ifdef RC_FILE_DESCRIPTION
 #undef RC_FILE_DESCRIPTION
@@ -49,7 +49,7 @@
 #ifdef RC_PRODUCT_VERSION
 #undef RC_PRODUCT_VERSION
 #endif
-#define RC_PRODUCT_VERSION "1.8.2407.0"
+#define RC_PRODUCT_VERSION "1.8.2502.0"
 
 #ifdef HLSL_TOOL_NAME
 #undef HLSL_TOOL_NAME