diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -20,11 +20,13 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
-#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Transforms/Utils/IntegerDivision.h"
 
@@ -201,6 +203,7 @@
   AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
 
   bool visitFDiv(BinaryOperator &I);
+  bool visitXor(BinaryOperator &I);
 
   bool visitInstruction(Instruction &I) { return false; }
   bool visitBinaryOperator(BinaryOperator &I);
@@ -808,6 +811,32 @@
   return !!NewFDiv;
 }
 
+bool AMDGPUCodeGenPrepare::visitXor(BinaryOperator &I) {
+  // Fold: (xor (llvm.amdgcn.class x, ConstMask), -1)
+  //   ->  (llvm.amdgcn.class x, ~ConstMask)
+  IntrinsicInst *IntrinsicCall = dyn_cast<IntrinsicInst>(I.getOperand(0));
+  ConstantInt *RHS = dyn_cast<ConstantInt>(I.getOperand(1));
+  if (!RHS || !IntrinsicCall || RHS->getSExtValue() != -1)
+    return false;
+
+  // Only fold a call to the amdgcn_class intrinsic whose sole user is this
+  // xor; rewriting the call's mask must not affect any other user.
+  if (IntrinsicCall->getIntrinsicID() != Intrinsic::amdgcn_class ||
+      !IntrinsicCall->hasOneUse())
+    return false;
+
+  // The negate can only be folded into a constant test mask; bail out on a
+  // variable mask. Complement the mask and let the call replace the xor.
+  ConstantInt *Arg = dyn_cast<ConstantInt>(IntrinsicCall->getOperand(1));
+  if (!Arg)
+    return false;
+  IntrinsicCall->setOperand(
+      1, ConstantInt::get(Arg->getContext(), ~Arg->getValue()));
+  I.replaceAllUsesWith(IntrinsicCall);
+  I.eraseFromParent();
+  return true;
+}
+
 static bool hasUnsafeFPMath(const Function &F) {
   Attribute Attr = F.getFnAttribute("unsafe-fp-math");
   return Attr.getValueAsBool();
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-foldnegate.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-foldnegate.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-foldnegate.ll
@@ -0,0 +1,66 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -amdgpu-codegenprepare -verify -S %s -o - | FileCheck %s
+
+declare i1 @llvm.amdgcn.class.f32(float, i32) nounwind readnone
+declare i1 @llvm.amdgcn.class.f64(double, i32) nounwind readnone
+
+; Trivial case, xor instruction should be removed and
+; the second argument of the intrinsic call should be
+; bitwise-negated
+; CHECK: @fold_negate_intrinsic_test_mask
+; CHECK: %1 = call i1 @llvm.amdgcn.class.f32(float %x, i32 -6)
+define i1 @fold_negate_intrinsic_test_mask(float %x) nounwind {
+  %1 = call i1 @llvm.amdgcn.class.f32(float %x, i32 5)
+  %2 = xor i1 %1, -1
+  ret i1 %2
+}
+
+; Trivial case, xor instruction should be removed and
+; the second argument of the intrinsic call should be
+; bitwise-negated
+; CHECK: @fold_negate_intrinsic_test_mask_dbl
+; CHECK: %1 = call i1 @llvm.amdgcn.class.f64(double %x, i32 -6)
+define i1 @fold_negate_intrinsic_test_mask_dbl(double %x) nounwind {
+  %1 = call i1 @llvm.amdgcn.class.f64(double %x, i32 5)
+  %2 = xor i1 %1, -1
+  ret i1 %2
+}
+
+; Negative test: should not transform for variable test masks
+; CHECK: @fold_negate_intrinsic_test_mask_neg_var
+; CHECK: %[[X0:.*]] = alloca i32
+; CHECK: %[[X1:.*]] = load i32, i32* %[[X0]]
+; CHECK: call i1 @llvm.amdgcn.class.f32(float %x, i32 %[[X1]])
+; CHECK: xor
+define i1 @fold_negate_intrinsic_test_mask_neg_var(float %x) nounwind {
+  %1 = alloca i32
+  store i32 7, i32* %1
+  %2 = load i32, i32* %1
+  %3 = call i1 @llvm.amdgcn.class.f32(float %x, i32 %2)
+  %4 = xor i1 %3, -1
+  ret i1 %4
+}
+
+; Negative test: should not transform for multiple uses of the
+; intrinsic returned value
+; CHECK: @fold_negate_intrinsic_test_mask_neg_multiple_uses
+; CHECK: %[[X1:.*]] = call i1 @llvm.amdgcn.class.f32
+; CHECK: store i1 %[[X1]]
+; CHECK: %[[X2:.*]] = xor i1 %[[X1]]
+define i1 @fold_negate_intrinsic_test_mask_neg_multiple_uses(float %x) nounwind {
+  %y = alloca i1
+  %1 = call i1 @llvm.amdgcn.class.f32(float %x, i32 7)
+  %2 = xor i1 %1, -1
+  store i1 %1, i1* %y
+  %3 = xor i1 %1, -1
+  ret i1 %2
+}
+
+; Negative test: should not transform for an xor with no operand equal to -1
+; CHECK: @fold_negate_intrinsic_test_mask_neg_one
+; CHECK: %[[X0:.*]] = call i1 @llvm.amdgcn.class.f32
+; CHECK: xor i1 %[[X0]], false
+define i1 @fold_negate_intrinsic_test_mask_neg_one(float %x) nounwind {
+  %1 = call i1 @llvm.amdgcn.class.f32(float %x, i32 7)
+  %2 = xor i1 %1, false
+  ret i1 %2
+}