diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -14,18 +14,29 @@
 #include "AMDGPU.h"
 #include "AMDGPUTargetMachine.h"
+#include "llvm-c/Core.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
-#include "llvm/IR/IRBuilder.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/IntegerDivision.h"
 
 #define DEBUG_TYPE "amdgpu-codegenprepare"
@@ -195,6 +206,16 @@
   bool canWidenScalarExtLoad(LoadInst &I) const;
 
+  /// Fold a negated llvm.amdgcn.class intrinsic into its test mask.
+  ///
+  /// \details Performs the following transformation:
+  ///   xor (llvm.amdgcn.class x, mask), -1
+  ///   --> llvm.amdgcn.class(x, ~mask)
+  ///
+  /// \returns True if at least one transformation was performed,
+  /// false otherwise.
+  bool foldNegateIntrinsic(Function &F);
+
 public:
   static char ID;
@@ -1363,6 +1384,8 @@
   if (skipFunction(F))
     return false;
 
+  bool MadeChange = foldNegateIntrinsic(F);
+
   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
   if (!TPC)
     return false;
@@ -1380,8 +1403,6 @@
   AMDGPU::SIModeRegisterDefaults Mode(F);
   HasFP32Denormals = Mode.allFP32Denormals();
 
-  bool MadeChange = false;
-
   Function::iterator NextBB;
   for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
     BasicBlock *BB = &*FI;
@@ -1407,6 +1428,64 @@
   return MadeChange;
 }
 
+bool AMDGPUCodeGenPrepare::foldNegateIntrinsic(Function &F) {
+  SmallVector<Instruction *, 4> DeadInstr;
+  for (auto BB = F.begin(); BB != F.end(); ++BB) {
+    for (auto Inst = BB->begin(); Inst != BB->end(); ++Inst) {
+      if (Inst->getOpcode() != Instruction::Xor)
+        continue;
+
+      Value *NegOne = ConstantInt::get(Inst->getOperand(0)->getType(), -1);
+
+      // Match an xor of some value with constant -1.
+      Value *ExtCall;
+      if (Inst->getOperand(1) == NegOne)
+        ExtCall = Inst->getOperand(0);
+      else if (Inst->getOperand(0) == NegOne)
+        ExtCall = Inst->getOperand(1);
+      else
+        continue;
+
+      // The other operand must either be a call directly, or a cast
+      // (zext/sext/...) of a call, i.e. either its parent or its
+      // grandparent is the intrinsic call.
+      CallInst *IntrinsicCall = nullptr;
+      if (auto *Cast = dyn_cast<CastInst>(ExtCall))
+        IntrinsicCall = dyn_cast<CallInst>(Cast->getOperand(0));
+      else
+        IntrinsicCall = dyn_cast<CallInst>(ExtCall);
+
+      // Now check that the call is to llvm.amdgcn.class.
+      if (!IntrinsicCall ||
+          IntrinsicCall->getIntrinsicID() != Intrinsic::amdgcn_class)
+        continue;
+
+      // "Not" the second argument of the intrinsic call.
+      IRBuilder<> Builder(IntrinsicCall);
+      IntrinsicCall->setArgOperand(
+          1, Builder.CreateNot(IntrinsicCall->getArgOperand(1)));
+
+      // Replace the xor with either the intrinsic result or the cast of it.
+      if (isa<CallInst>(ExtCall))
+        Inst->replaceAllUsesWith(IntrinsicCall);
+      else
+        Inst->replaceAllUsesWith(ExtCall);
+      DeadInstr.push_back(&*Inst);
+    }
+  }
+
+  // Remove all dead xor instructions.
+  for (auto *Inst : DeadInstr)
+    Inst->eraseFromParent();
+  return !DeadInstr.empty();
+}
+
 INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR optimizations", false, false)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
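To illustrate the transformation the new foldNegateIntrinsic helper performs, here is a minimal before/after sketch in LLVM IR, using the same mask value as the first test below (the value names %c and %n are illustrative only, not part of the patch):

    ; before amdgpu-codegenprepare
    %c = call i1 @llvm.amdgcn.class.f32(float %x, i32 5)
    %n = xor i1 %c, -1
    ret i1 %n

    ; after the fold: the negate is absorbed into the test mask
    %c = call i1 @llvm.amdgcn.class.f32(float %x, i32 -6)
    ret i1 %c

The -6 expected by the CHECK lines below is simply the bitwise complement of the original mask: ~5 is -6 in two's complement, which is the constant Builder.CreateNot produces for the second operand.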
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-foldnegate.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-foldnegate.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-foldnegate.ll
@@ -0,0 +1,39 @@
+; RUN: opt -S -amdgpu-codegenprepare -verify %s -o - | FileCheck %s
+
+declare i1 @llvm.amdgcn.class.f32(float, i32) nounwind readnone
+declare i1 @llvm.amdgcn.class.f64(double, i32) nounwind readnone
+
+; Trivial case: the xor instruction should be removed and the second
+; argument of the intrinsic call should be bitwise-negated.
+; CHECK-LABEL: @fold_negate_intrinsic_test_mask
+; CHECK: %1 = call i1 @llvm.amdgcn.class.f32(float %x, i32 -6)
+; CHECK-NOT: xor
+define i1 @fold_negate_intrinsic_test_mask(float %x) nounwind {
+  %1 = call i1 @llvm.amdgcn.class.f32(float %x, i32 5)
+  %2 = xor i1 %1, -1
+  ret i1 %2
+}
+
+; The fold should also look through a zext of the intrinsic result.
+; CHECK-LABEL: @fold_negate_intrinsic_test_mask_zext
+; CHECK: %[[X:.*]] = call i1 @llvm.amdgcn.class.f32(float %x, i32 -6)
+; CHECK: %[[X1:.*]] = zext i1 %[[X]] to i32
+; CHECK-NOT: xor i32 %[[X1]], -1
+; CHECK: xor i32 %{{.*}}, 42
+define i32 @fold_negate_intrinsic_test_mask_zext(float %x) nounwind {
+  %1 = call i1 @llvm.amdgcn.class.f32(float %x, i32 5)
+  %2 = zext i1 %1 to i32
+  %3 = xor i32 %2, -1
+  %4 = xor i32 %3, 42
+  ret i32 %4
+}
+
+; Negative test: there is no xor of the result, so nothing should change.
+; CHECK-LABEL: @fold_negate_intrinsic_test_mask_sext
+; CHECK: %1 = call i1 @llvm.amdgcn.class.f32(float %x, i32 5)
+; CHECK: sext
+define i32 @fold_negate_intrinsic_test_mask_sext(float %x) nounwind {
+  %1 = call i1 @llvm.amdgcn.class.f32(float %x, i32 5)
+  %2 = sext i1 %1 to i32
+  ret i32 %2
+}