Index: llvm/include/llvm/CodeGen/AtomicExpandUtils.h
===================================================================
--- llvm/include/llvm/CodeGen/AtomicExpandUtils.h
+++ llvm/include/llvm/CodeGen/AtomicExpandUtils.h
@@ -22,7 +22,7 @@
 /// (the builder, %addr, %loaded, %new_val, ordering,
 ///  /* OUT */ %success, /* OUT */ %new_loaded)
 using CreateCmpXchgInstFun =
-    function_ref<void(IRBuilder<> &, Value *, Value *, Value *, Align,
+    function_ref<void(IRBuilderBase &, Value *, Value *, Value *, Align,
                       AtomicOrdering, SyncScope::ID, Value *&, Value *&)>;
 
 /// Expand an atomic RMW instruction into a loop utilizing
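
The header change retypes the CreateCmpXchgInstFun callback from the concrete IRBuilder<> to its common base class IRBuilderBase, so one callback implementation can be driven either by a plain builder or by the folding builder this patch introduces in AtomicExpandPass.cpp. Note that the callback also receives the access alignment and sync scope, which the parameter comment above predates. A minimal sketch of a conforming callback, modeled on the in-tree createCmpXchgInstFun updated later in this patch (the function name here is illustrative):

    static void createCmpXchg(IRBuilderBase &Builder, Value *Addr,
                              Value *Loaded, Value *NewVal, Align AddrAlign,
                              AtomicOrdering MemOpOrder, SyncScope::ID SSID,
                              Value *&Success, Value *&NewLoaded) {
      // Emit a strong cmpxchg and unpack the {loaded value, success} pair
      // into the two OUT parameters.
      Value *Pair = Builder.CreateAtomicCmpXchg(
          Addr, Loaded, NewVal, AddrAlign, MemOpOrder,
          AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder), SSID);
      Success = Builder.CreateExtractValue(Pair, 1, "success");
      NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
    }
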
Index: llvm/lib/CodeGen/AtomicExpandPass.cpp
===================================================================
--- llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -17,6 +17,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLFunctionalExtras.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/InstSimplifyFolder.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/CodeGen/AtomicExpandUtils.h"
 #include "llvm/CodeGen/RuntimeLibcalls.h"
@@ -60,6 +61,7 @@
 
 class AtomicExpand : public FunctionPass {
   const TargetLowering *TLI = nullptr;
+  const DataLayout *DL = nullptr;
 
 public:
   static char ID; // Pass identification, replacement for typeid
@@ -83,13 +85,13 @@
   bool tryExpandAtomicRMW(AtomicRMWInst *AI);
   AtomicRMWInst *convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI);
   Value *
-  insertRMWLLSCLoop(IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
+  insertRMWLLSCLoop(IRBuilderBase &Builder, Type *ResultTy, Value *Addr,
                     Align AddrAlign, AtomicOrdering MemOpOrder,
-                    function_ref<Value *(IRBuilder<> &, Value *)> PerformOp);
-  void
-  expandAtomicOpToLLSC(Instruction *I, Type *ResultTy, Value *Addr,
-                       Align AddrAlign, AtomicOrdering MemOpOrder,
-                       function_ref<Value *(IRBuilder<> &, Value *)> PerformOp);
+                    function_ref<Value *(IRBuilderBase &, Value *)> PerformOp);
+  void expandAtomicOpToLLSC(
+      Instruction *I, Type *ResultTy, Value *Addr, Align AddrAlign,
+      AtomicOrdering MemOpOrder,
+      function_ref<Value *(IRBuilderBase &, Value *)> PerformOp);
   void expandPartwordAtomicRMW(
       AtomicRMWInst *I, TargetLoweringBase::AtomicExpansionKind ExpansionKind);
   AtomicRMWInst *widenPartwordAtomicRMW(AtomicRMWInst *AI);
@@ -98,12 +100,11 @@
   void expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI);
 
   AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI);
-  static Value *
-  insertRMWCmpXchgLoop(IRBuilder<> &Builder, Type *ResultType, Value *Addr,
-                       Align AddrAlign, AtomicOrdering MemOpOrder,
-                       SyncScope::ID SSID,
-                       function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
-                       CreateCmpXchgInstFun CreateCmpXchg);
+  static Value *insertRMWCmpXchgLoop(
+      IRBuilderBase &Builder, Type *ResultType, Value *Addr, Align AddrAlign,
+      AtomicOrdering MemOpOrder, SyncScope::ID SSID,
+      function_ref<Value *(IRBuilderBase &, Value *)> PerformOp,
+      CreateCmpXchgInstFun CreateCmpXchg);
   bool tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI);
 
   bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
@@ -126,9 +127,11 @@
 };
 
 // IRBuilder to be used for replacement atomic instructions.
-struct ReplacementIRBuilder : IRBuilder<> {
+struct ReplacementIRBuilder : IRBuilder<InstSimplifyFolder> {
   // Preserves the DebugLoc from I, and preserves still valid metadata.
-  explicit ReplacementIRBuilder(Instruction *I) : IRBuilder<>(I) {
+  explicit ReplacementIRBuilder(Instruction *I, const DataLayout &DL)
+      : IRBuilder(I->getContext(), DL) {
+    SetInsertPoint(I);
     this->CollectMetadataToCopy(I, {LLVMContext::MD_pcsections});
   }
 };
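
This is the core of the patch: ReplacementIRBuilder now derives from IRBuilder<InstSimplifyFolder> rather than IRBuilder<> (whose default ConstantFolder only folds all-constant operands), so instructions created during expansion are simplified on the fly. InstSimplifyFolder needs a DataLayout, hence the extra constructor argument and the DL member cached in the hunks below. A sketch of the observable effect, assuming an insertion point I and an i32 value X (both placeholders):

    // Illustrative only; not part of the patch.
    static Value *demoFolding(Instruction *I, Value *X) {
      const DataLayout &DL = I->getModule()->getDataLayout();
      ReplacementIRBuilder Builder(I, DL);
      // With the old IRBuilder<> this inserted a dead "shl i32 %x, 0"; the
      // InstSimplifyFolder-backed builder folds it away and returns X itself.
      return Builder.CreateShl(X, Builder.getInt32(0), "ValOperand_Shifted");
    }
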
@@ -182,9 +185,11 @@
     return false;
 
   auto &TM = TPC->getTM<TargetMachine>();
-  if (!TM.getSubtargetImpl(F)->enableAtomicExpand())
+  const auto *Subtarget = TM.getSubtargetImpl(F);
+  if (!Subtarget->enableAtomicExpand())
     return false;
-  TLI = TM.getSubtargetImpl(F)->getTargetLowering();
+  TLI = Subtarget->getTargetLowering();
+  DL = &F.getParent()->getDataLayout();
 
   SmallVector<Instruction *, 1> AtomicInsts;
 
@@ -318,7 +323,7 @@
 }
 
 bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order) {
-  ReplacementIRBuilder Builder(I);
+  ReplacementIRBuilder Builder(I, *DL);
 
   auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order);
 
@@ -347,7 +352,7 @@
   auto *M = LI->getModule();
   Type *NewTy = getCorrespondingIntegerType(LI->getType(), M->getDataLayout());
 
-  ReplacementIRBuilder Builder(LI);
+  ReplacementIRBuilder Builder(LI, *DL);
 
   Value *Addr = LI->getPointerOperand();
   Type *PT = PointerType::get(NewTy, Addr->getType()->getPointerAddressSpace());
@@ -371,7 +376,7 @@
   Type *NewTy =
       getCorrespondingIntegerType(RMWI->getType(), M->getDataLayout());
 
-  ReplacementIRBuilder Builder(RMWI);
+  ReplacementIRBuilder Builder(RMWI, *DL);
 
   Value *Addr = RMWI->getPointerOperand();
   Value *Val = RMWI->getValOperand();
@@ -403,7 +408,7 @@
     expandAtomicOpToLLSC(
         LI, LI->getType(), LI->getPointerOperand(), LI->getAlign(),
         LI->getOrdering(),
-        [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; });
+        [](IRBuilderBase &Builder, Value *Loaded) { return Loaded; });
     return true;
   case TargetLoweringBase::AtomicExpansionKind::LLOnly:
     return expandAtomicLoadToLL(LI);
@@ -433,7 +438,7 @@
 }
 
 bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
-  ReplacementIRBuilder Builder(LI);
+  ReplacementIRBuilder Builder(LI, *DL);
 
   // On some architectures, load-linked instructions are atomic for larger
   // sizes than normal loads. For example, the only 64-bit load guaranteed
@@ -449,7 +454,7 @@
 }
 
 bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) {
-  ReplacementIRBuilder Builder(LI);
+  ReplacementIRBuilder Builder(LI, *DL);
   AtomicOrdering Order = LI->getOrdering();
   if (Order == AtomicOrdering::Unordered)
     Order = AtomicOrdering::Monotonic;
@@ -478,7 +483,7 @@
 /// mechanism, we convert back to the old format which the backends understand.
 /// Each backend will need individual work to recognize the new format.
 StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
-  ReplacementIRBuilder Builder(SI);
+  ReplacementIRBuilder Builder(SI, *DL);
   auto *M = SI->getModule();
   Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(),
                                             M->getDataLayout());
@@ -504,7 +509,7 @@
   // or lock cmpxchg8/16b on X86, as these are atomic for larger sizes.
   // It is the responsibility of the target to only signal expansion via
   // shouldExpandAtomicRMW in cases where this is required and possible.
-  ReplacementIRBuilder Builder(SI);
+  ReplacementIRBuilder Builder(SI, *DL);
   AtomicOrdering Ordering = SI->getOrdering();
   assert(Ordering != AtomicOrdering::NotAtomic);
   AtomicOrdering RMWOrdering = Ordering == AtomicOrdering::Unordered
@@ -519,7 +524,7 @@
   tryExpandAtomicRMW(AI);
 }
 
-static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr,
+static void createCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr,
                                  Value *Loaded, Value *NewVal, Align AddrAlign,
                                  AtomicOrdering MemOpOrder, SyncScope::ID SSID,
                                  Value *&Success, Value *&NewLoaded) {
@@ -559,7 +564,7 @@
       expandPartwordAtomicRMW(AI,
                               TargetLoweringBase::AtomicExpansionKind::LLSC);
     } else {
-      auto PerformOp = [&](IRBuilder<> &Builder, Value *Loaded) {
+      auto PerformOp = [&](IRBuilderBase &Builder, Value *Loaded) {
         return buildAtomicRMWValue(AI->getOperation(), Builder, Loaded,
                                    AI->getValOperand());
       };
@@ -669,9 +674,9 @@
 ///       include only the part that would've been loaded from Addr.
 ///
 /// Inv_Mask: The inverse of Mask.
-static PartwordMaskValues createMaskInstrs(IRBuilder<> &Builder, Instruction *I,
-                                           Type *ValueType, Value *Addr,
-                                           Align AddrAlign,
+static PartwordMaskValues createMaskInstrs(IRBuilderBase &Builder,
+                                           Instruction *I, Type *ValueType,
+                                           Value *Addr, Align AddrAlign,
                                            unsigned MinWordSize) {
   PartwordMaskValues PMV;
 
@@ -736,7 +741,7 @@
   return PMV;
 }
 
-static Value *extractMaskedValue(IRBuilder<> &Builder, Value *WideWord,
+static Value *extractMaskedValue(IRBuilderBase &Builder, Value *WideWord,
                                  const PartwordMaskValues &PMV) {
   assert(WideWord->getType() == PMV.WordType && "Widened type mismatch");
   if (PMV.WordType == PMV.ValueType)
@@ -747,7 +752,7 @@
   return Trunc;
 }
 
-static Value *insertMaskedValue(IRBuilder<> &Builder, Value *WideWord,
+static Value *insertMaskedValue(IRBuilderBase &Builder, Value *WideWord,
                                 Value *Updated, const PartwordMaskValues &PMV) {
   assert(WideWord->getType() == PMV.WordType && "Widened type mismatch");
   assert(Updated->getType() == PMV.ValueType && "Value type mismatch");
@@ -766,7 +771,7 @@
 /// operation. (That is, only the bits under the Mask should be
 /// affected by the operation)
 static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op,
-                                    IRBuilder<> &Builder, Value *Loaded,
+                                    IRBuilderBase &Builder, Value *Loaded,
                                     Value *Shifted_Inc, Value *Inc,
                                     const PartwordMaskValues &PMV) {
   // TODO: update to use
@@ -821,7 +826,7 @@
   AtomicOrdering MemOpOrder = AI->getOrdering();
   SyncScope::ID SSID = AI->getSyncScopeID();
 
-  ReplacementIRBuilder Builder(AI);
+  ReplacementIRBuilder Builder(AI, *DL);
 
   PartwordMaskValues PMV =
       createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(),
@@ -831,7 +836,7 @@
       Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType),
                         PMV.ShiftAmt, "ValOperand_Shifted");
 
-  auto PerformPartwordOp = [&](IRBuilder<> &Builder, Value *Loaded) {
+  auto PerformPartwordOp = [&](IRBuilderBase &Builder, Value *Loaded) {
     return performMaskedAtomicOp(AI->getOperation(), Builder, Loaded,
                                  ValOperand_Shifted, AI->getValOperand(), PMV);
   };
@@ -855,7 +860,7 @@
 
 // Widen the bitwise atomicrmw (or/xor/and) to the minimum supported width.
 AtomicRMWInst *AtomicExpand::widenPartwordAtomicRMW(AtomicRMWInst *AI) {
-  ReplacementIRBuilder Builder(AI);
+  ReplacementIRBuilder Builder(AI, *DL);
   AtomicRMWInst::BinOp Op = AI->getOperation();
 
   assert((Op == AtomicRMWInst::Or || Op == AtomicRMWInst::Xor ||
@@ -930,7 +935,7 @@
 
   BasicBlock *BB = CI->getParent();
   Function *F = BB->getParent();
-  ReplacementIRBuilder Builder(CI);
+  ReplacementIRBuilder Builder(CI, *DL);
   LLVMContext &Ctx = Builder.getContext();
 
   BasicBlock *EndBB =
@@ -1016,8 +1021,8 @@
 void AtomicExpand::expandAtomicOpToLLSC(
     Instruction *I, Type *ResultType, Value *Addr, Align AddrAlign,
     AtomicOrdering MemOpOrder,
-    function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) {
-  ReplacementIRBuilder Builder(I);
+    function_ref<Value *(IRBuilderBase &, Value *)> PerformOp) {
+  ReplacementIRBuilder Builder(I, *DL);
   Value *Loaded = insertRMWLLSCLoop(Builder, ResultType, Addr, AddrAlign,
                                     MemOpOrder, PerformOp);
 
@@ -1026,7 +1031,7 @@
 }
 
 void AtomicExpand::expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI) {
-  ReplacementIRBuilder Builder(AI);
+  ReplacementIRBuilder Builder(AI, *DL);
 
   PartwordMaskValues PMV =
       createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(),
@@ -1052,7 +1057,7 @@
 }
 
 void AtomicExpand::expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI) {
-  ReplacementIRBuilder Builder(CI);
+  ReplacementIRBuilder Builder(CI, *DL);
 
   PartwordMaskValues PMV = createMaskInstrs(
       Builder, CI, CI->getCompareOperand()->getType(), CI->getPointerOperand(),
@@ -1079,9 +1084,9 @@
 }
 
 Value *AtomicExpand::insertRMWLLSCLoop(
-    IRBuilder<> &Builder, Type *ResultTy, Value *Addr, Align AddrAlign,
+    IRBuilderBase &Builder, Type *ResultTy, Value *Addr, Align AddrAlign,
     AtomicOrdering MemOpOrder,
-    function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) {
+    function_ref<Value *(IRBuilderBase &, Value *)> PerformOp) {
   LLVMContext &Ctx = Builder.getContext();
   BasicBlock *BB = Builder.GetInsertBlock();
   Function *F = BB->getParent();
@@ -1139,7 +1144,7 @@
   Type *NewTy = getCorrespondingIntegerType(CI->getCompareOperand()->getType(),
                                             M->getDataLayout());
 
-  ReplacementIRBuilder Builder(CI);
+  ReplacementIRBuilder Builder(CI, *DL);
 
   Value *Addr = CI->getPointerOperand();
   Type *PT = PointerType::get(NewTy, Addr->getType()->getPointerAddressSpace());
@@ -1263,7 +1268,7 @@
       BasicBlock::Create(Ctx, "cmpxchg.fencedstore", F, TryStoreBB);
   auto StartBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, ReleasingStoreBB);
 
-  ReplacementIRBuilder Builder(CI);
+  ReplacementIRBuilder Builder(CI, *DL);
 
   // The split call above "helpfully" added a branch at the end of BB (to the
   // wrong place), but we might want a fence too. It's easiest to just remove
@@ -1443,9 +1448,9 @@
 }
 
 Value *AtomicExpand::insertRMWCmpXchgLoop(
-    IRBuilder<> &Builder, Type *ResultTy, Value *Addr, Align AddrAlign,
+    IRBuilderBase &Builder, Type *ResultTy, Value *Addr, Align AddrAlign,
     AtomicOrdering MemOpOrder, SyncScope::ID SSID,
-    function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
+    function_ref<Value *(IRBuilderBase &, Value *)> PerformOp,
     CreateCmpXchgInstFun CreateCmpXchg) {
   LLVMContext &Ctx = Builder.getContext();
   BasicBlock *BB = Builder.GetInsertBlock();
@@ -1528,11 +1533,11 @@
 // Note: This function is exposed externally by AtomicExpandUtils.h
 bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
                                     CreateCmpXchgInstFun CreateCmpXchg) {
-  ReplacementIRBuilder Builder(AI);
+  ReplacementIRBuilder Builder(AI, AI->getModule()->getDataLayout());
   Value *Loaded = AtomicExpand::insertRMWCmpXchgLoop(
       Builder, AI->getType(), AI->getPointerOperand(), AI->getAlign(),
       AI->getOrdering(), AI->getSyncScopeID(),
-      [&](IRBuilder<> &Builder, Value *Loaded) {
+      [&](IRBuilderBase &Builder, Value *Loaded) {
         return buildAtomicRMWValue(AI->getOperation(), Builder, Loaded,
                                    AI->getValOperand());
       },
@@ -1682,7 +1687,7 @@
   // CAS libcall, via a CAS loop, instead.
   if (!Success) {
     expandAtomicRMWToCmpXchg(
-        I, [this](IRBuilder<> &Builder, Value *Addr, Value *Loaded,
+        I, [this](IRBuilderBase &Builder, Value *Addr, Value *Loaded,
                   Value *NewVal, Align Alignment, AtomicOrdering MemOpOrder,
                   SyncScope::ID SSID, Value *&Success, Value *&NewLoaded) {
           // Create the CAS instruction normally...
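
The test updates below all fall out of the folding builder. For a partword atomic on an align-4 pointer, the shift amount computed by createMaskInstrs is the constant 0, so helpers such as extractMaskedValue collapse: CreateLShr(WideWord, 0) now returns WideWord unchanged and only the trunc survives. A simplified sketch of that helper's body, with the PMV fields it reads passed explicitly:

    static Value *extractMaskedValueSketch(IRBuilderBase &Builder,
                                           Value *WideWord, Value *ShiftAmt,
                                           Type *ValueType) {
      // ShiftAmt is constant 0 for align-4 partword ops, so the folding
      // builder returns WideWord directly instead of emitting "lshr %w, 0".
      Value *Shift = Builder.CreateLShr(WideWord, ShiftAmt, "shifted");
      return Builder.CreateTrunc(Shift, ValueType, "extracted");
    }
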
Index: llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
===================================================================
--- llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
+++ llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
@@ -39,20 +39,18 @@
 ; CHECK-LABEL: @test_atomicrmw_xchg_i16_global_align4(
 ; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = bitcast i16 addrspace(1)* [[PTR:%.*]] to i32 addrspace(1)*
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
-; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP1]], 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32 addrspace(1)* [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
 ; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = and i32 [[LOADED]], -65536
-; CHECK-NEXT:    [[TMP4:%.*]] = or i32 [[TMP3]], [[VALOPERAND_SHIFTED]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP4]] seq_cst seq_cst, align 4
 ; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
 ; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], 0
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[NEWLOADED]] to i16
 ; CHECK-NEXT:    ret i16 [[EXTRACTED]]
 ;
   %res = atomicrmw xchg i16 addrspace(1)* %ptr, i16 %value seq_cst, align 4
@@ -96,12 +94,11 @@
 ; CHECK-LABEL: @test_atomicrmw_add_i16_global_align4(
 ; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = bitcast i16 addrspace(1)* [[PTR:%.*]] to i32 addrspace(1)*
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
-; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP1]], 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32 addrspace(1)* [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
 ; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[NEW:%.*]] = add i32 [[LOADED]], [[VALOPERAND_SHIFTED]]
+; CHECK-NEXT:    [[NEW:%.*]] = add i32 [[LOADED]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = and i32 [[NEW]], 65535
 ; CHECK-NEXT:    [[TMP4:%.*]] = and i32 [[LOADED]], -65536
 ; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], [[TMP3]]
@@ -110,8 +107,7 @@
 ; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], 0
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[NEWLOADED]] to i16
 ; CHECK-NEXT:    ret i16 [[EXTRACTED]]
 ;
   %res = atomicrmw add i16 addrspace(1)* %ptr, i16 %value seq_cst, align 4
@@ -446,30 +442,27 @@
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, i16 addrspace(1)* [[OUT:%.*]], i64 4
 ; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = bitcast i16 addrspace(1)* [[GEP]] to i32 addrspace(1)*
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[IN:%.*]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i32 [[TMP1]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[OLD:%.*]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = shl i32 [[TMP3]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32 addrspace(1)* [[ALIGNEDADDR]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = and i32 [[TMP5]], -65536
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[OLD:%.*]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(1)* [[ALIGNEDADDR]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = and i32 [[TMP3]], -65536
 ; CHECK-NEXT:    br label [[PARTWORD_CMPXCHG_LOOP:%.*]]
 ; CHECK:       partword.cmpxchg.loop:
-; CHECK-NEXT:    [[TMP7:%.*]] = phi i32 [ [[TMP6]], [[TMP0:%.*]] ], [ [[TMP13:%.*]], [[PARTWORD_CMPXCHG_FAILURE:%.*]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = or i32 [[TMP7]], [[TMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = or i32 [[TMP7]], [[TMP4]]
-; CHECK-NEXT:    [[TMP10:%.*]] = cmpxchg i32 addrspace(1)* [[ALIGNEDADDR]], i32 [[TMP9]], i32 [[TMP8]] seq_cst seq_cst, align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { i32, i1 } [[TMP10]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { i32, i1 } [[TMP10]], 1
-; CHECK-NEXT:    br i1 [[TMP12]], label [[PARTWORD_CMPXCHG_END:%.*]], label [[PARTWORD_CMPXCHG_FAILURE]]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[TMP11:%.*]], [[PARTWORD_CMPXCHG_FAILURE:%.*]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = or i32 [[TMP5]], [[TMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = cmpxchg i32 addrspace(1)* [[ALIGNEDADDR]], i32 [[TMP7]], i32 [[TMP6]] seq_cst seq_cst, align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { i32, i1 } [[TMP8]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; CHECK-NEXT:    br i1 [[TMP10]], label [[PARTWORD_CMPXCHG_END:%.*]], label [[PARTWORD_CMPXCHG_FAILURE]]
 ; CHECK:       partword.cmpxchg.failure:
-; CHECK-NEXT:    [[TMP13]] = and i32 [[TMP11]], -65536
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP7]], [[TMP13]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[PARTWORD_CMPXCHG_LOOP]], label [[PARTWORD_CMPXCHG_END]]
+; CHECK-NEXT:    [[TMP11]] = and i32 [[TMP9]], -65536
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP5]], [[TMP11]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[PARTWORD_CMPXCHG_LOOP]], label [[PARTWORD_CMPXCHG_END]]
 ; CHECK:       partword.cmpxchg.end:
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[TMP11]], 0
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; CHECK-NEXT:    [[TMP15:%.*]] = insertvalue { i16, i1 } undef, i16 [[EXTRACTED]], 0
-; CHECK-NEXT:    [[TMP16:%.*]] = insertvalue { i16, i1 } [[TMP15]], i1 [[TMP12]], 1
-; CHECK-NEXT:    [[EXTRACT:%.*]] = extractvalue { i16, i1 } [[TMP16]], 0
+; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[TMP9]] to i16
+; CHECK-NEXT:    [[TMP13:%.*]] = insertvalue { i16, i1 } undef, i16 [[EXTRACTED]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = insertvalue { i16, i1 } [[TMP13]], i1 [[TMP10]], 1
+; CHECK-NEXT:    [[EXTRACT:%.*]] = extractvalue { i16, i1 } [[TMP14]], 0
 ; CHECK-NEXT:    ret i16 [[EXTRACT]]
 ;
   %gep = getelementptr i16, i16 addrspace(1)* %out, i64 4
@@ -555,10 +548,8 @@
 ; CHECK-LABEL: @test_atomicrmw_xor_i16_local_align4(
 ; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = bitcast i16 addrspace(3)* [[PTR:%.*]] to i32 addrspace(3)*
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
-; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP1]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw xor i32 addrspace(3)* [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] seq_cst, align 4
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[TMP2]], 0
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw xor i32 addrspace(3)* [[ALIGNEDADDR]], i32 [[TMP1]] seq_cst, align 4
+; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16
 ; CHECK-NEXT:    ret i16 [[EXTRACTED]]
 ;
   %res = atomicrmw xor i16 addrspace(3)* %ptr, i16 %value seq_cst, align 4
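
The i8 test below changes in exactly the same way; only the mask constants differ, because createMaskInstrs derives them from the value width (255/-256 for i8 versus 65535/-65536 for i16 at byte offset 0). A hypothetical helper showing where those constants come from:

    #include <cstdint>
    #include <utility>

    // Hypothetical, for illustration only: Mask and Inv_Mask for a partword
    // value sitting at byte offset 0 of its containing 32-bit word.
    static std::pair<int32_t, int32_t> partwordMasks(unsigned ValueBits) {
      uint32_t Mask = (1u << ValueBits) - 1;  // 0xFF (255) or 0xFFFF (65535)
      return {static_cast<int32_t>(Mask),
              static_cast<int32_t>(~Mask)};   // -256 or -65536
    }
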
Index: llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
===================================================================
--- llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
+++ llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
@@ -103,12 +103,11 @@
 ; CHECK-LABEL: @test_atomicrmw_add_i8_global_align4(
 ; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = bitcast i8 addrspace(1)* [[PTR:%.*]] to i32 addrspace(1)*
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[VALUE:%.*]] to i32
-; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP1]], 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32 addrspace(1)* [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
 ; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[NEW:%.*]] = add i32 [[LOADED]], [[VALOPERAND_SHIFTED]]
+; CHECK-NEXT:    [[NEW:%.*]] = add i32 [[LOADED]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = and i32 [[NEW]], 255
 ; CHECK-NEXT:    [[TMP4:%.*]] = and i32 [[LOADED]], -256
 ; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], [[TMP3]]
@@ -117,8 +116,7 @@
 ; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], 0
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[NEWLOADED]] to i8
 ; CHECK-NEXT:    ret i8 [[EXTRACTED]]
 ;
   %res = atomicrmw add i8 addrspace(1)* %ptr, i8 %value seq_cst, align 4