diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -14,18 +14,28 @@
 
 #include "AMDGPU.h"
 #include "AMDGPUTargetMachine.h"
+#include "llvm-c/Core.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
-#include "llvm/IR/IRBuilder.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/IntegerDivision.h"
 
 #define DEBUG_TYPE "amdgpu-codegenprepare"
@@ -201,6 +211,7 @@
   AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
 
   bool visitFDiv(BinaryOperator &I);
+  bool visitXor(BinaryOperator &I);
 
   bool visitInstruction(Instruction &I) { return false; }
   bool visitBinaryOperator(BinaryOperator &I);
@@ -808,6 +819,41 @@
   return !!NewFDiv;
 }
 
+bool AMDGPUCodeGenPrepare::visitXor(BinaryOperator &I) {
+  // Match a xor with -1 of a single-use llvm.amdgcn.class intrinsic call
+  IntrinsicInst *IntrinsicCall = nullptr;
+  if (!I.getOperand(0)->hasOneUse())
+    return false;
+  else {
+    auto *LHS = I.getOperand(0);
+    auto *RHS = I.getOperand(1);
+
+    if (isa<IntrinsicInst>(LHS) && isa<ConstantInt>(RHS) &&
+        cast<ConstantInt>(RHS)->getSExtValue() == -1)
+      IntrinsicCall = dyn_cast<IntrinsicInst>(LHS);
+    else
+      return false;
+  }
+
+  // Check that the call is an intrinsic instruction to the amdgcn_class
+  if (!IntrinsicCall ||
+      IntrinsicCall->getIntrinsicID() != Intrinsic::amdgcn_class)
+    return false;
+
+  // Bitwise-negate the test mask (second operand) of the intrinsic call
+  IRBuilder<> Builder(IntrinsicCall);
+  ConstantInt *Arg = dyn_cast<ConstantInt>(IntrinsicCall->getOperand(1));
+  if (!Arg)
+    return false;
+
+  IntrinsicCall->setOperand(
+      1, ConstantInt::get(IntrinsicCall->getOperand(1)->getType(),
+                          ~Arg->getZExtValue() & ((1ull << 32) - 1)));
+  I.replaceAllUsesWith(IntrinsicCall);
+  I.eraseFromParent();
+  return true;
+}
+
 static bool hasUnsafeFPMath(const Function &F) {
   Attribute Attr = F.getFnAttribute("unsafe-fp-math");
   return Attr.getValueAsBool();
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-foldnegate.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-foldnegate.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-foldnegate.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -amdgpu-codegenprepare -verify -S %s -o - | FileCheck %s
+
+declare i1 @llvm.amdgcn.class.f32(float, i32) nounwind readnone
+declare i1 @llvm.amdgcn.class.f64(double, i32) nounwind readnone
+
+; Trivial case, xor instruction should be removed and
+; the second argument of the intrinsic call should be
+; bitwise-negated
+; CHECK: @fold_negate_intrinsic_test_mask
+; CHECK: %1 = call i1 @llvm.amdgcn.class.f32(float %x, i32 -6)
+define i1 @fold_negate_intrinsic_test_mask(float %x) nounwind {
+  %1 = call i1 @llvm.amdgcn.class.f32(float %x, i32 5)
+  %2 = xor i1 %1, -1
+  ret i1 %2
+}
+
+; Trivial case, xor instruction should be removed and
+; the second argument of the intrinsic call should be
+; bitwise-negated
+; CHECK: @fold_negate_intrinsic_test_mask_dbl
+; CHECK: %1 = call i1 @llvm.amdgcn.class.f64(double %x, i32 -6)
+define i1 @fold_negate_intrinsic_test_mask_dbl(double %x) nounwind {
+  %1 = call i1 @llvm.amdgcn.class.f64(double %x, i32 5)
+  %2 = xor i1 %1, -1
+  ret i1 %2
+}
+
+; Negative test: should not transform for variable test masks
+; CHECK: @fold_negate_intrinsic_test_mask_neg_var
+; CHECK: %[[X0:.*]] = alloca i32
+; CHECK: %[[X1:.*]] = load i32, i32* %[[X0]]
+; CHECK: call i1 @llvm.amdgcn.class.f32(float %x, i32 %[[X1]])
+; CHECK: xor
+define i1 @fold_negate_intrinsic_test_mask_neg_var(float %x) nounwind {
+  %1 = alloca i32
+  store i32 7, i32* %1
+  %2 = load i32, i32* %1
+  %3 = call i1 @llvm.amdgcn.class.f32(float %x, i32 %2)
+  %4 = xor i1 %3, -1
+  ret i1 %4
+}
+
+; Negative test: should not transform for multiple uses of the
+; intrinsic returned value
+; CHECK: @fold_negate_intrinsic_test_mask_neg_multiple_uses
+; CHECK: %[[X1:.*]] = call i1 @llvm.amdgcn.class.f32
+; CHECK: store i1 %[[X1]]
+; CHECK: %[[X2:.*]] = xor i1 %[[X1]]
+define i1 @fold_negate_intrinsic_test_mask_neg_multiple_uses(float %x) nounwind {
+  %y = alloca i1
+  %1 = call i1 @llvm.amdgcn.class.f32(float %x, i32 7)
+  %2 = xor i1 %1, -1
+  store i1 %1, i1* %y
+  %3 = xor i1 %1, -1
+  ret i1 %2
+}
+
+; Negative test: should not transform for a xor with no operand equal to -1
+; CHECK: @fold_negate_intrinsic_test_mask_neg_one
+; CHECK: %[[X0:.*]] = call i1 @llvm.amdgcn.class.f32
+; CHECK: xor i1 %[[X0]], false
+define i1 @fold_negate_intrinsic_test_mask_neg_one(float %x) nounwind {
+  %1 = call i1 @llvm.amdgcn.class.f32(float %x, i32 7)
+  %2 = xor i1 %1, false
+  ret i1 %2
+}