diff --git a/llvm/include/llvm/MC/MCSubtargetInfo.h b/llvm/include/llvm/MC/MCSubtargetInfo.h
--- a/llvm/include/llvm/MC/MCSubtargetInfo.h
+++ b/llvm/include/llvm/MC/MCSubtargetInfo.h
@@ -230,6 +230,10 @@
     return Found != ProcDesc.end() && StringRef(Found->Key) == CPU;
   }
 
+  ArrayRef<SubtargetSubTypeKV> getAllProcessorDescriptions() const {
+    return ProcDesc;
+  }
+
   virtual unsigned getHwMode() const { return 0; }
 
   /// Return the cache size in bytes for the given level of cache.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -47,6 +47,7 @@
 FunctionPass *createSIPostRABundlerPass();
 FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetMachine *);
 FunctionPass *createAMDGPUUseNativeCallsPass();
+FunctionPass *createAMDGPUClearIncompatibleFunctionsPass(const TargetMachine *);
 FunctionPass *createAMDGPUCodeGenPreparePass();
 FunctionPass *createAMDGPULateCodeGenPreparePass();
 FunctionPass *createAMDGPUMachineCFGStructurizerPass();
@@ -287,6 +288,9 @@
 void initializeAMDGPUCodeGenPreparePass(PassRegistry&);
 extern char &AMDGPUCodeGenPrepareID;
 
+void initializeAMDGPUClearIncompatibleFunctionsPass(PassRegistry &);
+extern char &AMDGPUClearIncompatibleFunctionsID;
+
 void initializeAMDGPULateCodeGenPreparePass(PassRegistry &);
 extern char &AMDGPULateCodeGenPrepareID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUClearIncompatibleFunctions.cpp b/llvm/lib/Target/AMDGPU/AMDGPUClearIncompatibleFunctions.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUClearIncompatibleFunctions.cpp
@@ -0,0 +1,181 @@
+//===-- AMDGPUClearIncompatibleFunctions.cpp ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass replaces the bodies of functions that use GPU features
+/// incompatible with the current GPU with trap/unreachable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "amdgpu-clear-incompatible-functions"
+
+using namespace llvm;
+
+namespace llvm {
+extern const SubtargetFeatureKV
+    AMDGPUFeatureKV[AMDGPU::NumSubtargetFeatures - 1];
+}
+
+namespace {
+
+using Generation = AMDGPUSubtarget::Generation;
+
+/// Function pass that replaces the body of any function requesting a checked
+/// subtarget feature the selected GPU does not provide with a trap followed
+/// by unreachable, emitting a warning for each offending feature.
+class AMDGPUClearIncompatibleFunctions : public FunctionPass {
+public:
+  static char ID;
+
+  AMDGPUClearIncompatibleFunctions(const TargetMachine *TM = nullptr)
+      : FunctionPass(ID), TM(TM) {
+    assert(TM && "No TargetMachine!");
+  }
+
+  StringRef getPassName() const override {
+    return "AMDGPU Clear Incompatible Functions Bodies";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // If changes are made, no analyses are preserved.
+  }
+
+  bool runOnFunction(Function &F) override;
+
+private:
+  const TargetMachine *TM = nullptr;
+};
+
+/// Returns the name of the subtarget feature whose enum value is \p Feature.
+/// \p Feature must be present in AMDGPUFeatureKV.
+StringRef GetFeatureName(unsigned Feature) {
+  for (const SubtargetFeatureKV &KV : AMDGPUFeatureKV)
+    if (Feature == KV.Value)
+      return KV.Key;
+
+  llvm_unreachable("Unknown Target feature");
+}
+
+/// Returns the processor description whose key equals \p GPUName, or nullptr
+/// if \p ST has no such entry.
+const SubtargetSubTypeKV *GetGPUInfo(const GCNSubtarget &ST,
+                                     StringRef GPUName) {
+  for (const SubtargetSubTypeKV &KV : ST.getAllProcessorDescriptions())
+    if (StringRef(KV.Key) == GPUName)
+      return &KV;
+
+  return nullptr;
+}
+
+/// Features that are compared against the selected GPU's feature set. Any
+/// feature not listed here is never diagnosed by this pass.
+constexpr unsigned FeaturesToCheck[] = {
+    AMDGPU::FeatureGFX11Insts, AMDGPU::FeatureGFX10Insts,
+    AMDGPU::FeatureGFX9Insts,  AMDGPU::FeatureGFX8Insts,
+    AMDGPU::FeatureDPP,        AMDGPU::Feature16BitInsts,
+    AMDGPU::FeatureDot1Insts,  AMDGPU::FeatureDot2Insts,
+    AMDGPU::FeatureDot3Insts,  AMDGPU::FeatureDot4Insts,
+    AMDGPU::FeatureDot5Insts,  AMDGPU::FeatureDot6Insts,
+    AMDGPU::FeatureDot7Insts,  AMDGPU::FeatureDot8Insts,
+};
+
+/// Returns \p Features plus every feature transitively implied by a feature
+/// set in it, following the Implies entries of AMDGPUFeatureKV.
+FeatureBitset ExpandImpliedFeatures(const FeatureBitset &Features) {
+  FeatureBitset Result = Features;
+  for (const SubtargetFeatureKV &FE : AMDGPUFeatureKV)
+    if (Features.test(FE.Value) && FE.Implies.any())
+      Result |= ExpandImpliedFeatures(FE.Implies.getAsBitset());
+  return Result;
+}
+
+} // end anonymous namespace
+
+bool AMDGPUClearIncompatibleFunctions::runOnFunction(Function &F) {
+  if (skipFunction(F) || F.isDeclaration())
+    return false;
+
+  // This pass is primarily intended for GCN, so check we have a GCN GPU.
+  if (!TM->getTargetTriple().isAMDGCN())
+    return false;
+
+  const GCNSubtarget *ST =
+      static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F));
+
+  // Additionally check our GPU isn't the generic one. The generic one is used
+  // for testing only and we don't want this pass to interfere with it.
+  StringRef GPUName = ST->getCPU();
+  if (GPUName.empty() || GPUName.contains("generic"))
+    return false;
+
+  // Try to fetch the GPU's info. If we can't, it's likely an unknown processor
+  // so just bail out.
+  const SubtargetSubTypeKV *GPUInfo = GetGPUInfo(*ST, GPUName);
+  if (!GPUInfo)
+    return false;
+
+  LLVMContext &Ctx = F.getContext();
+
+  // Get all the features implied by the current GPU, and recursively expand
+  // the features that imply other features.
+  //
+  // e.g. GFX90A implies FeatureGFX9, and FeatureGFX9 implies a whole set of
+  // other features.
+  const FeatureBitset GPUFeatureBits =
+      ExpandImpliedFeatures(GPUInfo->Implies.getAsBitset());
+
+  // Now that we have a FeatureBitset containing all possible features for
+  // the chosen GPU, check our list of "suspicious" features.
+
+  // Check that the user didn't enable any features that aren't part of that
+  // GPU's feature set. We only check a predetermined set of features.
+  bool Remove = false;
+  for (unsigned Feature : FeaturesToCheck) {
+    if (ST->hasFeature(Feature) && !GPUFeatureBits.test(Feature)) {
+      Remove = true;
+      std::string Msg =
+          "+" + GetFeatureName(Feature).str() +
+          " is not supported on the current target. Deleting function body.";
+      DiagnosticInfoUnsupported DiagInfo(F, Msg, DiagnosticLocation(),
+                                         DS_Warning);
+      Ctx.diagnose(DiagInfo);
+    }
+  }
+
+  if (!Remove)
+    return false;
+
+  // Drop the whole body and replace it with a single trapping block.
+  F.dropAllReferences();
+  assert(F.empty());
+
+  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", &F);
+  IRBuilder<> Builder(Entry);
+  Builder.CreateIntrinsic(Intrinsic::trap, {}, {});
+  Builder.CreateUnreachable();
+  return true;
+}
+
+INITIALIZE_PASS(AMDGPUClearIncompatibleFunctions, DEBUG_TYPE,
+                "AMDGPU Clear Incompatible Functions Bodies", false, false)
+
+char AMDGPUClearIncompatibleFunctions::ID = 0;
+
+FunctionPass *
+llvm::createAMDGPUClearIncompatibleFunctionsPass(const TargetMachine *TM) {
+  return new AMDGPUClearIncompatibleFunctions(TM);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -213,6 +213,12 @@
     cl::init(false),
     cl::Hidden);
 
+static cl::opt<bool> ClearIncompatibleFunctionsBodies(
+    "amdgpu-incompatible-features-clear-fns", cl::Hidden,
+    cl::desc("Enable deletion of function bodies when they "
+             "use features not supported by the target GPU"),
+    cl::init(true));
+
 static cl::opt<bool> EnableSDWAPeephole(
   "amdgpu-sdwa-peephole",
   cl::desc("Enable SDWA peepholer"),
@@ -376,6 +382,7 @@
   initializeAMDGPULateCodeGenPreparePass(*PR);
   initializeAMDGPUPropagateAttributesEarlyPass(*PR);
   initializeAMDGPUPropagateAttributesLatePass(*PR);
+  initializeAMDGPUClearIncompatibleFunctionsPass(*PR);
   initializeAMDGPUReplaceLDSUseWithPointerPass(*PR);
   initializeAMDGPULowerModuleLDSPass(*PR);
   initializeAMDGPURewriteOutArgumentsPass(*PR);
@@ -1058,6 +1065,11 @@
 bool AMDGPUPassConfig::addPreISel() {
   if (TM->getOptLevel() > CodeGenOpt::None)
     addPass(createFlattenCFGPass());
+
+  if (ClearIncompatibleFunctionsBodies)
+    addPass(
+        createAMDGPUClearIncompatibleFunctionsPass(&getAMDGPUTargetMachine()));
+
   return false;
 }
 
diff --git 
a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -50,6 +50,7 @@ AMDGPUAtomicOptimizer.cpp AMDGPUAttributor.cpp AMDGPUCallLowering.cpp + AMDGPUClearIncompatibleFunctions.cpp AMDGPUCodeGenPrepare.cpp AMDGPUCombinerHelper.cpp AMDGPUCtorDtorLowering.cpp diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -stop-after=legalizer -o - %s | FileCheck %s +; RUN: llc -global-isel -amdgpu-incompatible-features-clear-fns=0 -mtriple=amdgcn-amd-amdhsa -stop-after=legalizer -o - %s | FileCheck %s ; Make sure legalizer info doesn't assert on dummy targets diff --git a/llvm/test/CodeGen/AMDGPU/clear-incompatible-functions.ll b/llvm/test/CodeGen/AMDGPU/clear-incompatible-functions.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/clear-incompatible-functions.ll @@ -0,0 +1,1025 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s 2>%t | FileCheck -check-prefix=GFX7 %s +; RUN: FileCheck --check-prefix=WARN-GFX7 %s < %t + +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s 2>%t | FileCheck -check-prefix=GFX8 %s +; RUN: FileCheck --check-prefix=WARN-GFX8 %s < %t + +; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s 2>%t | FileCheck -check-prefixes=GFX9,GFX906 %s +; RUN: FileCheck --check-prefix=WARN-GFX906 %s < %t + +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s 2>%t | FileCheck -check-prefixes=GFX9,GFX90A %s +; RUN: FileCheck --check-prefix=WARN-GFX90A %s < %t + +; RUN: llc -march=amdgcn 
-mcpu=gfx1011 -verify-machineinstrs < %s 2>%t | FileCheck -check-prefix=GFX10 %s +; RUN: FileCheck --check-prefix=WARN-GFX10 %s < %t + +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s 2>%t | FileCheck -check-prefix=GFX11 %s +; RUN: FileCheck --check-prefix=WARN-GFX11 %s < %t + +; WARN-GFX7: needs_dpp {{.*}} +dpp is not supported on the current target. Deleting function body. +; WARN-GFX7: needs_16bit_insts {{.*}} +16-bit-insts is not supported on the current target. Deleting function body. +; WARN-GFX7: needs_gfx8_insts {{.*}} +gfx8-insts is not supported on the current target. Deleting function body. +; WARN-GFX7: needs_gfx9_insts {{.*}} +gfx9-insts is not supported on the current target. Deleting function body. +; WARN-GFX7: needs_gfx10_insts {{.*}} +gfx10-insts is not supported on the current target. Deleting function body. +; WARN-GFX7: needs_gfx11_insts {{.*}} +gfx11-insts is not supported on the current target. Deleting function body. +; WARN-GFX7: needs_dot1_insts {{.*}} +dot1-insts is not supported on the current target. Deleting function body. +; WARN-GFX7: needs_dot2_insts {{.*}} +dot2-insts is not supported on the current target. Deleting function body. +; WARN-GFX7: needs_dot3_insts {{.*}} +dot3-insts is not supported on the current target. Deleting function body. +; WARN-GFX7: needs_dot4_insts {{.*}} +dot4-insts is not supported on the current target. Deleting function body. +; WARN-GFX7: needs_dot5_insts {{.*}} +dot5-insts is not supported on the current target. Deleting function body. +; WARN-GFX7: needs_dot6_insts {{.*}} +dot6-insts is not supported on the current target. Deleting function body. +; WARN-GFX7: needs_dot7_insts {{.*}} +dot7-insts is not supported on the current target. Deleting function body. +; WARN-GFX7: needs_dot8_insts {{.*}} +dot8-insts is not supported on the current target. Deleting function body. 
+; WARN-GFX7-NOT: not supported + +; WARN-GFX8: needs_gfx9_insts {{.*}} +gfx9-insts is not supported on the current target. Deleting function body. +; WARN-GFX8: needs_gfx10_insts {{.*}} +gfx10-insts is not supported on the current target. Deleting function body. +; WARN-GFX8: needs_gfx11_insts {{.*}} +gfx11-insts is not supported on the current target. Deleting function body. +; WARN-GFX8: needs_dot1_insts {{.*}} +dot1-insts is not supported on the current target. Deleting function body. +; WARN-GFX8: needs_dot2_insts {{.*}} +dot2-insts is not supported on the current target. Deleting function body. +; WARN-GFX8: needs_dot3_insts {{.*}} +dot3-insts is not supported on the current target. Deleting function body. +; WARN-GFX8: needs_dot4_insts {{.*}} +dot4-insts is not supported on the current target. Deleting function body. +; WARN-GFX8: needs_dot5_insts {{.*}} +dot5-insts is not supported on the current target. Deleting function body. +; WARN-GFX8: needs_dot6_insts {{.*}} +dot6-insts is not supported on the current target. Deleting function body. +; WARN-GFX8: needs_dot7_insts {{.*}} +dot7-insts is not supported on the current target. Deleting function body. +; WARN-GFX8: needs_dot8_insts {{.*}} +dot8-insts is not supported on the current target. Deleting function body. +; WARN-GFX8-NOT: not supported + +; WARN-GFX906: needs_gfx10_insts {{.*}} +gfx10-insts is not supported on the current target. Deleting function body. +; WARN-GFX906: needs_gfx11_insts {{.*}} +gfx11-insts is not supported on the current target. Deleting function body. +; WARN-GFX906: needs_dot3_insts {{.*}} +dot3-insts is not supported on the current target. Deleting function body. +; WARN-GFX906: needs_dot4_insts {{.*}} +dot4-insts is not supported on the current target. Deleting function body. +; WARN-GFX906: needs_dot5_insts {{.*}} +dot5-insts is not supported on the current target. Deleting function body. 
+; WARN-GFX906: needs_dot6_insts {{.*}} +dot6-insts is not supported on the current target. Deleting function body. +; WARN-GFX906: needs_dot8_insts {{.*}} +dot8-insts is not supported on the current target. Deleting function body. +; WARN-GFX906-NOT: not supported + +; WARN-GFX90A: needs_gfx10_insts {{.*}} +gfx10-insts is not supported on the current target. Deleting function body. +; WARN-GFX90A: needs_gfx11_insts {{.*}} +gfx11-insts is not supported on the current target. Deleting function body. +; WARN-GFX90A: needs_dot8_insts {{.*}} +dot8-insts is not supported on the current target. Deleting function body. +; WARN-GFX90A-NOT: not supported + +; WARN-GFX10: needs_gfx11_insts {{.*}} +gfx11-insts is not supported on the current target. Deleting function body. +; WARN-GFX10: needs_dot3_insts {{.*}} +dot3-insts is not supported on the current target. Deleting function body. +; WARN-GFX10: needs_dot4_insts {{.*}} +dot4-insts is not supported on the current target. Deleting function body. +; WARN-GFX10: needs_dot8_insts {{.*}} +dot8-insts is not supported on the current target. Deleting function body. +; WARN-GFX10-NOT: not supported + +; WARN-GFX11: needs_dot1_insts {{.*}} +dot1-insts is not supported on the current target. Deleting function body. +; WARN-GFX11: needs_dot2_insts {{.*}} +dot2-insts is not supported on the current target. Deleting function body. +; WARN-GFX11: needs_dot3_insts {{.*}} +dot3-insts is not supported on the current target. Deleting function body. +; WARN-GFX11: needs_dot4_insts {{.*}} +dot4-insts is not supported on the current target. Deleting function body. +; WARN-GFX11: needs_dot6_insts {{.*}} +dot6-insts is not supported on the current target. Deleting function body. 
+; WARN-GFX11-NOT: not supported + +define void @needs_dpp(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #0 { +; GFX7-LABEL: needs_dpp: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: needs_dpp: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: ; %bb.1: ; %else +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v6 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v5, v7, vcc +; GFX8-NEXT: ; implicit-def: $vgpr2 +; GFX8-NEXT: ; %bb.2: ; %Flow +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB0_4 +; GFX8-NEXT: ; %bb.3: ; %if +; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[2:3] +; GFX8-NEXT: .LBB0_4: ; %endif +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[8:9] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: needs_dpp: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: ; %bb.1: ; %else +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v5, v7, vcc +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; %bb.2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB0_4 +; GFX9-NEXT: ; %bb.3: ; %if +; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off +; GFX9-NEXT: .LBB0_4: ; %endif +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[8:9], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: needs_dpp: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: ; %bb.1: ; %else +; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo +; GFX10-NEXT: ; implicit-def: $vgpr2 +; GFX10-NEXT: ; %bb.2: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB0_4 +; GFX10-NEXT: ; %bb.3: ; %if +; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off +; GFX10-NEXT: .LBB0_4: ; %endif +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[8:9], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: needs_dpp: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX11-NEXT: v_cmpx_ne_u64_e32 0, v[4:5] +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: ; %bb.1: ; %else +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v4, v6 +; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; %bb.2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB0_4 +; GFX11-NEXT: ; %bb.3: ; %if +; GFX11-NEXT: global_load_b64 v[8:9], v[2:3], off +; GFX11-NEXT: .LBB0_4: ; %endif +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b64 v[0:1], v[8:9], off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = icmp eq i64 %a, 0 + 
br i1 %0, label %if, label %else + +if: + %1 = load i64, i64 addrspace(1)* %in + br label %endif + +else: + %2 = add i64 %a, %b + br label %endif + +endif: + %3 = phi i64 [%1, %if], [%2, %else] + store i64 %3, i64 addrspace(1)* %out + ret void +} + +define void @needs_16bit_insts(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #1 { +; GFX7-LABEL: needs_16bit_insts: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: needs_16bit_insts: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: ; %bb.1: ; %else +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v6 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v5, v7, vcc +; GFX8-NEXT: ; implicit-def: $vgpr2 +; GFX8-NEXT: ; %bb.2: ; %Flow +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB1_4 +; GFX8-NEXT: ; %bb.3: ; %if +; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[2:3] +; GFX8-NEXT: .LBB1_4: ; %endif +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[8:9] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: needs_16bit_insts: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: ; %bb.1: ; %else +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v5, v7, vcc +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; %bb.2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB1_4 +; GFX9-NEXT: ; %bb.3: ; %if +; GFX9-NEXT: 
global_load_dwordx2 v[8:9], v[2:3], off +; GFX9-NEXT: .LBB1_4: ; %endif +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[8:9], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: needs_16bit_insts: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: ; %bb.1: ; %else +; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo +; GFX10-NEXT: ; implicit-def: $vgpr2 +; GFX10-NEXT: ; %bb.2: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB1_4 +; GFX10-NEXT: ; %bb.3: ; %if +; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off +; GFX10-NEXT: .LBB1_4: ; %endif +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[8:9], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: needs_16bit_insts: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX11-NEXT: v_cmpx_ne_u64_e32 0, v[4:5] +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: ; %bb.1: ; %else +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v4, v6 +; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; %bb.2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB1_4 +; GFX11-NEXT: ; %bb.3: ; %if +; GFX11-NEXT: global_load_b64 v[8:9], v[2:3], off +; GFX11-NEXT: 
.LBB1_4: ; %endif +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b64 v[0:1], v[8:9], off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = icmp eq i64 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = load i64, i64 addrspace(1)* %in + br label %endif + +else: + %2 = add i64 %a, %b + br label %endif + +endif: + %3 = phi i64 [%1, %if], [%2, %else] + store i64 %3, i64 addrspace(1)* %out + ret void +} + +define void @needs_gfx8_insts(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #2 { +; GFX7-LABEL: needs_gfx8_insts: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: needs_gfx8_insts: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: ; %bb.1: ; %else +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v6 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v5, v7, vcc +; GFX8-NEXT: ; implicit-def: $vgpr2 +; GFX8-NEXT: ; %bb.2: ; %Flow +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: ; %bb.3: ; %if +; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[2:3] +; GFX8-NEXT: .LBB2_4: ; %endif +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[8:9] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: needs_gfx8_insts: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: ; %bb.1: ; %else +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 
v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v5, v7, vcc +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; %bb.2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: ; %bb.3: ; %if +; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off +; GFX9-NEXT: .LBB2_4: ; %endif +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[8:9], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: needs_gfx8_insts: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: ; %bb.1: ; %else +; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo +; GFX10-NEXT: ; implicit-def: $vgpr2 +; GFX10-NEXT: ; %bb.2: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB2_4 +; GFX10-NEXT: ; %bb.3: ; %if +; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off +; GFX10-NEXT: .LBB2_4: ; %endif +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[8:9], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: needs_gfx8_insts: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX11-NEXT: v_cmpx_ne_u64_e32 0, v[4:5] +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: ; %bb.1: ; %else +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v4, v6 +; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo 
+; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; %bb.2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB2_4 +; GFX11-NEXT: ; %bb.3: ; %if +; GFX11-NEXT: global_load_b64 v[8:9], v[2:3], off +; GFX11-NEXT: .LBB2_4: ; %endif +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b64 v[0:1], v[8:9], off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = icmp eq i64 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = load i64, i64 addrspace(1)* %in + br label %endif + +else: + %2 = add i64 %a, %b + br label %endif + +endif: + %3 = phi i64 [%1, %if], [%2, %else] + store i64 %3, i64 addrspace(1)* %out + ret void +} + +define void @needs_gfx9_insts(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #3 { +; GFX7-LABEL: needs_gfx9_insts: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: needs_gfx9_insts: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: needs_gfx9_insts: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: ; %bb.1: ; %else +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v5, v7, vcc +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; %bb.2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB3_4 +; GFX9-NEXT: ; %bb.3: ; %if +; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off +; GFX9-NEXT: .LBB3_4: ; %endif +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[8:9], off +; GFX9-NEXT: s_waitcnt 
vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: needs_gfx9_insts: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: ; %bb.1: ; %else +; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo +; GFX10-NEXT: ; implicit-def: $vgpr2 +; GFX10-NEXT: ; %bb.2: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB3_4 +; GFX10-NEXT: ; %bb.3: ; %if +; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off +; GFX10-NEXT: .LBB3_4: ; %endif +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[8:9], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: needs_gfx9_insts: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX11-NEXT: v_cmpx_ne_u64_e32 0, v[4:5] +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: ; %bb.1: ; %else +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v4, v6 +; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; %bb.2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB3_4 +; GFX11-NEXT: ; %bb.3: ; %if +; GFX11-NEXT: global_load_b64 v[8:9], v[2:3], off +; GFX11-NEXT: .LBB3_4: ; %endif +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b64 v[0:1], v[8:9], off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: 
+ %0 = icmp eq i64 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = load i64, i64 addrspace(1)* %in + br label %endif + +else: + %2 = add i64 %a, %b + br label %endif + +endif: + %3 = phi i64 [%1, %if], [%2, %else] + store i64 %3, i64 addrspace(1)* %out + ret void +} + +define void @needs_gfx10_insts(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #4 { +; GFX7-LABEL: needs_gfx10_insts: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: needs_gfx10_insts: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: needs_gfx10_insts: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: needs_gfx10_insts: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: ; %bb.1: ; %else +; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo +; GFX10-NEXT: ; implicit-def: $vgpr2 +; GFX10-NEXT: ; %bb.2: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB4_4 +; GFX10-NEXT: ; %bb.3: ; %if +; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off +; GFX10-NEXT: .LBB4_4: ; %endif +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[8:9], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: needs_gfx10_insts: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s0, 
exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX11-NEXT: v_cmpx_ne_u64_e32 0, v[4:5] +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: ; %bb.1: ; %else +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v4, v6 +; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; %bb.2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB4_4 +; GFX11-NEXT: ; %bb.3: ; %if +; GFX11-NEXT: global_load_b64 v[8:9], v[2:3], off +; GFX11-NEXT: .LBB4_4: ; %endif +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b64 v[0:1], v[8:9], off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = icmp eq i64 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = load i64, i64 addrspace(1)* %in + br label %endif + +else: + %2 = add i64 %a, %b + br label %endif + +endif: + %3 = phi i64 [%1, %if], [%2, %else] + store i64 %3, i64 addrspace(1)* %out + ret void +} + +define void @needs_gfx11_insts(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #5 { +; GFX7-LABEL: needs_gfx11_insts: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: needs_gfx11_insts: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: needs_gfx11_insts: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: needs_gfx11_insts: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: needs_gfx11_insts: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr8_vgpr9 +; 
GFX11-NEXT: v_cmpx_ne_u64_e32 0, v[4:5] +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: ; %bb.1: ; %else +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v4, v6 +; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; %bb.2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB5_4 +; GFX11-NEXT: ; %bb.3: ; %if +; GFX11-NEXT: global_load_b64 v[8:9], v[2:3], off +; GFX11-NEXT: .LBB5_4: ; %endif +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b64 v[0:1], v[8:9], off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = icmp eq i64 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = load i64, i64 addrspace(1)* %in + br label %endif + +else: + %2 = add i64 %a, %b + br label %endif + +endif: + %3 = phi i64 [%1, %if], [%2, %else] + store i64 %3, i64 addrspace(1)* %out + ret void +} + +define void @needs_dot1_insts(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #6 { +; GFX7-LABEL: needs_dot1_insts: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: needs_dot1_insts: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: needs_dot1_insts: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v7, vcc +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: needs_dot1_insts: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v4, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v7, vcc_lo +; GFX10-NEXT: global_store_dwordx2 
v[0:1], v[2:3], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: needs_dot1_insts: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm + %add = add i64 %a, %b + store i64 %add, i64 addrspace(1)* %out + ret void +} + +define void @needs_dot2_insts(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #7 { +; GFX7-LABEL: needs_dot2_insts: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: needs_dot2_insts: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: needs_dot2_insts: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v7, vcc +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: needs_dot2_insts: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v4, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v7, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: needs_dot2_insts: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm + %add = add i64 %a, %b + store i64 %add, i64 addrspace(1)* %out + ret void +} + +define void @needs_dot3_insts(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #8 { +; GFX7-LABEL: needs_dot3_insts: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: 
needs_dot3_insts: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX906-LABEL: needs_dot3_insts: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: s_endpgm +; +; GFX90A-LABEL: needs_dot3_insts: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v4, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v7, vcc +; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: needs_dot3_insts: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: needs_dot3_insts: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm + %add = add i64 %a, %b + store i64 %add, i64 addrspace(1)* %out + ret void +} + + +define void @needs_dot4_insts(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #9 { +; GFX7-LABEL: needs_dot4_insts: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: needs_dot4_insts: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX906-LABEL: needs_dot4_insts: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: s_endpgm +; +; GFX90A-LABEL: needs_dot4_insts: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v4, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v7, vcc +; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: needs_dot4_insts: +; GFX10: ; 
%bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: needs_dot4_insts: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm + %add = add i64 %a, %b + store i64 %add, i64 addrspace(1)* %out + ret void +} + +define void @needs_dot5_insts(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #10 { +; GFX7-LABEL: needs_dot5_insts: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: needs_dot5_insts: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX906-LABEL: needs_dot5_insts: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: s_endpgm +; +; GFX90A-LABEL: needs_dot5_insts: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v4, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v7, vcc +; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: needs_dot5_insts: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v4, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v7, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: needs_dot5_insts: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v4, v6 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v7, vcc_lo +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; 
GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %add = add i64 %a, %b + store i64 %add, i64 addrspace(1)* %out + ret void +} + +define void @needs_dot6_insts(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #11 { +; GFX7-LABEL: needs_dot6_insts: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: needs_dot6_insts: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX906-LABEL: needs_dot6_insts: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: s_endpgm +; +; GFX90A-LABEL: needs_dot6_insts: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v4, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v7, vcc +; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: needs_dot6_insts: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v4, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v7, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: needs_dot6_insts: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm + %add = add i64 %a, %b + store i64 %add, i64 addrspace(1)* %out + ret void +} + +define void @needs_dot7_insts(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #12 { +; GFX7-LABEL: needs_dot7_insts: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: needs_dot7_insts: +; GFX8: ; %bb.0: ; 
%entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: needs_dot7_insts: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v7, vcc +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: needs_dot7_insts: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v4, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v7, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: needs_dot7_insts: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v4, v6 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v7, vcc_lo +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %add = add i64 %a, %b + store i64 %add, i64 addrspace(1)* %out + ret void +} + +define void @needs_dot8_insts(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #13 { +; GFX7-LABEL: needs_dot8_insts: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: needs_dot8_insts: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: needs_dot8_insts: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: needs_dot8_insts: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm +; +; 
GFX11-LABEL: needs_dot8_insts: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v4, v6 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v7, vcc_lo +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %add = add i64 %a, %b + store i64 %add, i64 addrspace(1)* %out + ret void +} + +attributes #0 = { "target-features"="+dpp" } +attributes #1 = { "target-features"="+16-bit-insts" } +attributes #2 = { "target-features"="+gfx8-insts" } +attributes #3 = { "target-features"="+gfx9-insts" } +attributes #4 = { "target-features"="+gfx10-insts" } +attributes #5 = { "target-features"="+gfx11-insts" } +attributes #6 = { "target-features"="+dot1-insts" } +attributes #7 = { "target-features"="+dot2-insts" } +attributes #8 = { "target-features"="+dot3-insts" } +attributes #9 = { "target-features"="+dot4-insts" } +attributes #10 = { "target-features"="+dot5-insts" } +attributes #11 = { "target-features"="+dot6-insts" } +attributes #12 = { "target-features"="+dot7-insts" } +attributes #13 = { "target-features"="+dot8-insts" } diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -60,6 +60,7 @@ ; GCN-O0-NEXT: Lower SwitchInst's to branches ; GCN-O0-NEXT: Lower invoke and unwind, for unwindless code generators ; GCN-O0-NEXT: Remove unreachable blocks from the CFG +; GCN-O0-NEXT: AMDGPU Clear Incompatible Functions Bodies ; GCN-O0-NEXT: Post-Dominator Tree Construction ; GCN-O0-NEXT: Dominator Tree Construction ; GCN-O0-NEXT: Natural Loop Information @@ -240,6 +241,7 @@ ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Flatten the CFG +; GCN-O1-NEXT: AMDGPU Clear Incompatible Functions Bodies ; 
GCN-O1-NEXT: Dominator Tree Construction ; GCN-O1-NEXT: Post-Dominator Tree Construction ; GCN-O1-NEXT: Natural Loop Information @@ -528,6 +530,7 @@ ; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-OPTS-NEXT: Function Alias Analysis Results ; GCN-O1-OPTS-NEXT: Flatten the CFG +; GCN-O1-OPTS-NEXT: AMDGPU Clear Incompatible Functions Bodies ; GCN-O1-OPTS-NEXT: Dominator Tree Construction ; GCN-O1-OPTS-NEXT: Post-Dominator Tree Construction ; GCN-O1-OPTS-NEXT: Natural Loop Information @@ -824,6 +827,7 @@ ; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Flatten the CFG +; GCN-O2-NEXT: AMDGPU Clear Incompatible Functions Bodies ; GCN-O2-NEXT: Dominator Tree Construction ; GCN-O2-NEXT: Post-Dominator Tree Construction ; GCN-O2-NEXT: Natural Loop Information @@ -1135,6 +1139,7 @@ ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Flatten the CFG +; GCN-O3-NEXT: AMDGPU Clear Incompatible Functions Bodies ; GCN-O3-NEXT: Dominator Tree Construction ; GCN-O3-NEXT: Post-Dominator Tree Construction ; GCN-O3-NEXT: Natural Loop Information