diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -14,18 +14,29 @@
 #include "AMDGPU.h"
 #include "AMDGPUTargetMachine.h"
+#include "llvm-c/Core.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
-#include "llvm/IR/IRBuilder.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/IntegerDivision.h"
 
 #define DEBUG_TYPE "amdgpu-codegenprepare"
@@ -195,6 +206,16 @@
   bool canWidenScalarExtLoad(LoadInst &I) const;
 
+  /// Fold a negated llvm.amdgcn.class intrinsic into its test mask.
+  ///
+  /// \details Performs the following transformation:
+  ///   xor (llvm.amdgcn.class x, mask), -1
+  ///   --> llvm.amdgcn.class(x, ~mask)
+  ///
+  /// \returns True if at least one transformation was performed,
+  /// false otherwise.
+  bool foldNegateIntrinsic(Function &F);
+
 public:
   static char ID;
@@ -1363,6 +1384,8 @@
   if (skipFunction(F))
     return false;
 
+  bool MadeChange = foldNegateIntrinsic(F);
+
   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
   if (!TPC)
     return false;
@@ -1380,8 +1403,6 @@
   AMDGPU::SIModeRegisterDefaults Mode(F);
   HasFP32Denormals = Mode.allFP32Denormals();
 
-  bool MadeChange = false;
-
   Function::iterator NextBB;
   for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
     BasicBlock *BB = &*FI;
@@ -1407,6 +1428,64 @@
   return MadeChange;
 }
 
+bool AMDGPUCodeGenPrepare::foldNegateIntrinsic(Function &F) {
+  SmallVector<Instruction *, 4> DeadInstr;
+  for (auto BB = F.begin(); BB != F.end(); ++BB) {
+    for (auto Inst = BB->begin(); Inst != BB->end(); ++Inst) {
+      if (Inst->getOpcode() != Instruction::Xor)
+        continue;
+
+      Value *NegOne = ConstantInt::get(Inst->getOperand(0)->getType(), -1);
+
+      // Match an xor of some value with constant -1.
+      Value *ExtCall;
+      if (Inst->getOperand(1) == NegOne)
+        ExtCall = Inst->getOperand(0);
+      else if (Inst->getOperand(0) == NegOne)
+        ExtCall = Inst->getOperand(1);
+      else
+        continue;
+
+      // The other operand must either be a call directly, or a cast
+      // (zext/sext/...) of a call, i.e. either its parent or its
+      // grandparent is the intrinsic call.
+      CallInst *IntrinsicCall = nullptr;
+      if (auto *Cast = dyn_cast<CastInst>(ExtCall))
+        IntrinsicCall = dyn_cast<CallInst>(Cast->getOperand(0));
+      else
+        IntrinsicCall = dyn_cast<CallInst>(ExtCall);
+
+      // Now check that the call is to llvm.amdgcn.class.
+      if (!IntrinsicCall ||
+          IntrinsicCall->getIntrinsicID() != Intrinsic::amdgcn_class)
+        continue;
+
+      // "Not" the second argument of the intrinsic call.
+      IRBuilder<> Builder(IntrinsicCall);
+      IntrinsicCall->setArgOperand(
+          1, Builder.CreateNot(IntrinsicCall->getArgOperand(1)));
+
+      // Replace the xor with either the intrinsic result or the cast of it.
+      if (isa<CallInst>(ExtCall))
+        Inst->replaceAllUsesWith(IntrinsicCall);
+      else
+        Inst->replaceAllUsesWith(ExtCall);
+      DeadInstr.push_back(&*Inst);
+    }
+  }
+
+  // Remove all dead xor instructions.
+  for (auto *Inst : DeadInstr)
+    Inst->eraseFromParent();
+  return !DeadInstr.empty();
+}
+
 INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR optimizations", false, false)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
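To illustrate the transformation the new foldNegateIntrinsic helper performs, here is a minimal before/after sketch in LLVM IR, using the same mask value as the first test below (the value names %c and %n are illustrative only, not part of the patch):

    ; before amdgpu-codegenprepare
    %c = call i1 @llvm.amdgcn.class.f32(float %x, i32 5)
    %n = xor i1 %c, -1
    ret i1 %n

    ; after the fold: the negate is absorbed into the test mask
    %c = call i1 @llvm.amdgcn.class.f32(float %x, i32 -6)
    ret i1 %c

The -6 expected by the CHECK lines below is simply the bitwise complement of the original mask: ~5 is -6 in two's complement, which is the constant Builder.CreateNot produces for the second operand.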
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-foldnegate.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-foldnegate.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-foldnegate.ll
@@ -0,0 +1,39 @@
+; RUN: opt -S -amdgpu-codegenprepare -verify %s -o - | FileCheck %s
+
+declare i1 @llvm.amdgcn.class.f32(float, i32) nounwind readnone
+declare i1 @llvm.amdgcn.class.f64(double, i32) nounwind readnone
+
+; Trivial case: the xor instruction should be removed and the second
+; argument of the intrinsic call should be bitwise-negated.
+; CHECK-LABEL: @fold_negate_intrinsic_test_mask
+; CHECK: %1 = call i1 @llvm.amdgcn.class.f32(float %x, i32 -6)
+; CHECK-NOT: xor
+define i1 @fold_negate_intrinsic_test_mask(float %x) nounwind {
+  %1 = call i1 @llvm.amdgcn.class.f32(float %x, i32 5)
+  %2 = xor i1 %1, -1
+  ret i1 %2
+}
+
+; The fold should also look through a zext of the intrinsic result.
+; CHECK-LABEL: @fold_negate_intrinsic_test_mask_zext
+; CHECK: %[[X:.*]] = call i1 @llvm.amdgcn.class.f32(float %x, i32 -6)
+; CHECK: %[[X1:.*]] = zext i1 %[[X]] to i32
+; CHECK-NOT: xor i32 %[[X1]], -1
+; CHECK: xor i32 %{{.*}}, 42
+define i32 @fold_negate_intrinsic_test_mask_zext(float %x) nounwind {
+  %1 = call i1 @llvm.amdgcn.class.f32(float %x, i32 5)
+  %2 = zext i1 %1 to i32
+  %3 = xor i32 %2, -1
+  %4 = xor i32 %3, 42
+  ret i32 %4
+}
+
+; Negative test: there is no xor of the result, so nothing should change.
+; CHECK-LABEL: @fold_negate_intrinsic_test_mask_sext
+; CHECK: %1 = call i1 @llvm.amdgcn.class.f32(float %x, i32 5)
+; CHECK: sext
+define i32 @fold_negate_intrinsic_test_mask_sext(float %x) nounwind {
+  %1 = call i1 @llvm.amdgcn.class.f32(float %x, i32 5)
+  %2 = sext i1 %1 to i32
+  ret i32 %2
+}