diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -14873,6 +14873,10 @@ } LLVM_FALLTHROUGH; } + case AMDGPU::BI__builtin_amdgcn_readfirstlane: + return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_readfirstlane2); + case AMDGPU::BI__builtin_amdgcn_readlane: + return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_readlane2); default: return nullptr; } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -291,15 +291,15 @@ *out = __builtin_amdgcn_ds_bpermute(a, b); } -// CHECK-LABEL: @test_readfirstlane -// CHECK: call i32 @llvm.amdgcn.readfirstlane(i32 %a) +// CHECK-LABEL: @test_readfirstlane( +// CHECK: call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %a) void test_readfirstlane(global int* out, int a) { *out = __builtin_amdgcn_readfirstlane(a); } // CHECK-LABEL: @test_readlane -// CHECK: call i32 @llvm.amdgcn.readlane(i32 %a, i32 %b) +// CHECK: call i32 @llvm.amdgcn.readlane2.i32(i32 %a, i32 %b) void test_readlane(global int* out, int a, int b) { *out = __builtin_amdgcn_readlane(a, b); diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1415,23 +1415,38 @@ Intrinsic<[llvm_anyint_ty], [llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn]>; -def int_amdgcn_readfirstlane : - GCCBuiltin<"__builtin_amdgcn_readfirstlane">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty], +def int_amdgcn_readfirstlane2 : + Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrNoMem, IntrConvergent, IntrWillReturn]>; // The lane argument must be uniform across the currently active threads of the // current wave. Otherwise, the result is undefined. -def int_amdgcn_readlane : - GCCBuiltin<"__builtin_amdgcn_readlane">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], +def int_amdgcn_readlane2 : + Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, // data input + llvm_i32_ty], // uniform lane select [IntrNoMem, IntrConvergent, IntrWillReturn]>; // The value to write and lane select arguments must be uniform across the // currently active threads of the current wave. Otherwise, the result is // undefined. +def int_amdgcn_writelane2 : + Intrinsic<[llvm_any_ty], [ + LLVMMatchType<0>, // uniform value to write: returned by the selected lane + llvm_i32_ty, // uniform lane select + LLVMMatchType<0> // returned by all lanes other than the selected one + ], + [IntrNoMem, IntrConvergent, IntrWillReturn] +>; + +// Non-overloaded versions of readfirstlane2 / readlane2 / writelane2. 
+def int_amdgcn_readfirstlane : + Intrinsic<[llvm_i32_ty], [llvm_i32_ty], + [IntrNoMem, IntrConvergent, IntrWillReturn]>; +def int_amdgcn_readlane : + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrConvergent, IntrWillReturn]>; def int_amdgcn_writelane : - GCCBuiltin<"__builtin_amdgcn_writelane">, Intrinsic<[llvm_i32_ty], [ llvm_i32_ty, // uniform value to write: returned by the selected lane llvm_i32_ty, // uniform lane select diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -285,11 +285,11 @@ Type *const Ty = V->getType(); Module *M = B.GetInsertBlock()->getModule(); Function *UpdateDPP = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty); + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, {Ty}); Function *PermLaneX16 = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_permlanex16, {}); Function *ReadLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane2, {Ty}); for (unsigned Idx = 0; Idx < 4; Idx++) { V = buildNonAtomicBinOp( @@ -344,11 +344,11 @@ Type *const Ty = V->getType(); Module *M = B.GetInsertBlock()->getModule(); Function *UpdateDPP = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty); + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, {Ty}); Function *ReadLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane2, {Ty}); Function *WriteLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {}); + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane2, {Ty}); if (ST->hasDPPWavefrontShifts()) { // GFX9 has DPP wavefront shift operations. @@ -490,25 +490,8 @@ // each active lane in the wavefront. This will be our new value which we // will provide to the atomic operation. Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); - if (TyBitWidth == 64) { - Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty()); - Value *const ExtractHi = - B.CreateTrunc(B.CreateLShr(NewV, 32), B.getInt32Ty()); - CallInst *const ReadLaneLo = B.CreateIntrinsic( - Intrinsic::amdgcn_readlane, {}, {ExtractLo, LastLaneIdx}); - CallInst *const ReadLaneHi = B.CreateIntrinsic( - Intrinsic::amdgcn_readlane, {}, {ExtractHi, LastLaneIdx}); - Value *const PartialInsert = B.CreateInsertElement( - UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0)); - Value *const Insert = - B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1)); - NewV = B.CreateBitCast(Insert, Ty); - } else if (TyBitWidth == 32) { - NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, - {NewV, LastLaneIdx}); - } else { - llvm_unreachable("Unhandled atomic bit width"); - } + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane2, {Ty}, + {NewV, LastLaneIdx}); // Finally mark the readlanes in the WWM section. NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV); @@ -587,27 +570,8 @@ // We need to broadcast the value who was the lowest active lane (the first // lane) to all other lanes in the wavefront. We use an intrinsic for this, // but have to handle 64-bit broadcasts with two calls to this intrinsic. 
-  Value *BroadcastI = nullptr;
-
-  if (TyBitWidth == 64) {
-    Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
-    Value *const ExtractHi =
-        B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
-    CallInst *const ReadFirstLaneLo =
-        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
-    CallInst *const ReadFirstLaneHi =
-        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
-    Value *const PartialInsert = B.CreateInsertElement(
-        UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
-    Value *const Insert =
-        B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
-    BroadcastI = B.CreateBitCast(Insert, Ty);
-  } else if (TyBitWidth == 32) {
-
-    BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
-  } else {
-    llvm_unreachable("Unhandled atomic bit width");
-  }
+  Value *BroadcastI =
+      B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane2, {Ty}, {PHI});
 
   // Now that we have the result of our single atomic operation, we need to
   // get our individual lane's slice into the result. We use the lane offset
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -79,7 +79,7 @@
   const SIRegisterInfo *TRI =
       static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
   if (TRI->isSGPRReg(MRI, PhysReg)) {
-    auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
+    auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane2,
                                             {MRI.getType(ExtReg)}, false)
                       .addReg(ExtReg);
     ExtReg = ToSGPR.getReg(0);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -224,6 +224,10 @@
   bool visitIntrinsicInst(IntrinsicInst &I);
   bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
+  bool visitLaneIntrinsicInst(IntrinsicInst &I, Intrinsic::ID CanonicalIID);
+  Value *buildLegalLaneIntrinsic(IRBuilder<> &B, Intrinsic::ID IID,
+                                 Value *Data0, Value *Lane = nullptr,
+                                 Value *Data1 = nullptr);
 
   bool doInitialization(Module &M) override;
   bool runOnFunction(Function &F) override;
@@ -1344,6 +1348,16 @@
   switch (I.getIntrinsicID()) {
   case Intrinsic::bitreverse:
     return visitBitreverseIntrinsicInst(I);
+  case Intrinsic::amdgcn_readfirstlane:
+    return visitLaneIntrinsicInst(I, Intrinsic::amdgcn_readfirstlane2);
+  case Intrinsic::amdgcn_readlane:
+    return visitLaneIntrinsicInst(I, Intrinsic::amdgcn_readlane2);
+  case Intrinsic::amdgcn_writelane:
+    return visitLaneIntrinsicInst(I, Intrinsic::amdgcn_writelane2);
+  case Intrinsic::amdgcn_readfirstlane2:
+  case Intrinsic::amdgcn_readlane2:
+  case Intrinsic::amdgcn_writelane2:
+    return visitLaneIntrinsicInst(I, I.getIntrinsicID());
   default:
     return false;
   }
@@ -1359,6 +1373,140 @@
   return Changed;
 }
 
+Value *AMDGPUCodeGenPrepare::buildLegalLaneIntrinsic(IRBuilder<> &B,
+                                                     Intrinsic::ID IID,
+                                                     Value *Data0, Value *Lane,
+                                                     Value *Data1) {
+  Type *Ty = Data0->getType();
+
+  if (Ty == B.getInt32Ty()) {
+    Value *Args[3] = {Data0, Lane, Data1};
+    unsigned NumArgs = Data1 != nullptr ? 3 : Lane != nullptr ? 2 : 1;
+    return B.CreateIntrinsic(IID, {B.getInt32Ty()}, {Args, NumArgs});
+  }
+
+  if (auto *VecTy = dyn_cast<VectorType>(Ty)) {
+    Type *EltType = VecTy->getElementType();
+    bool is16Bit =
+        (EltType->isIntegerTy() && EltType->getIntegerBitWidth() == 16) ||
+        (EltType->isHalfTy());
+    int EC = VecTy->getElementCount().Min;
+
+    Value *Result = UndefValue::get(Ty);
+    for (int i = 0; i < EC; i += 1 + is16Bit) {
+      Value *EltData0;
+      Value *EltData1 = nullptr;
+
+      if (is16Bit) {
+        int Idxs[2] = {i, i + 1};
+        EltData0 = B.CreateShuffleVector(Data0, UndefValue::get(Ty), Idxs);
+        EltData0 = B.CreateBitCast(EltData0, B.getInt32Ty());
+      } else {
+        EltData0 = B.CreateExtractElement(Data0, i);
+      }
+
+      if (Data1) {
+        if (is16Bit) {
+          int Idxs[2] = {i, i + 1};
+          EltData1 = B.CreateShuffleVector(Data1, UndefValue::get(Ty), Idxs);
+          EltData1 = B.CreateBitCast(EltData1, B.getInt32Ty());
+        } else {
+          EltData1 = B.CreateExtractElement(Data1, i);
+        }
+      }
+
+      Value *EltResult =
+          buildLegalLaneIntrinsic(B, IID, EltData0, Lane, EltData1);
+
+      if (is16Bit) {
+        EltResult =
+            B.CreateBitCast(EltResult, FixedVectorType::get(EltType, 2));
+        for (int j = 0; j < 2; ++j) {
+          if (i + j >= EC)
+            break;
+          Result = B.CreateInsertElement(
+              Result, B.CreateExtractElement(EltResult, j), i + j);
+        }
+      } else {
+        Result = B.CreateInsertElement(Result, EltResult, i);
+      }
+    }
+
+    return Result;
+  }
+
+  unsigned BitWidth = DL->getTypeSizeInBits(Ty);
+  Type *IntTy = Ty;
+
+  if (!Ty->isIntegerTy()) {
+    IntTy = IntegerType::get(Mod->getContext(), BitWidth);
+    Data0 = B.CreateBitOrPointerCast(Data0, IntTy);
+    if (Data1)
+      Data1 = B.CreateBitOrPointerCast(Data1, IntTy);
+  }
+
+  if ((BitWidth % 32) != 0) {
+    Type *ExtendedTy =
+        IntegerType::get(Mod->getContext(), (BitWidth + 31) & ~31);
+    Data0 = B.CreateZExt(Data0, ExtendedTy);
+    if (Data1)
+      Data1 = B.CreateZExt(Data1, ExtendedTy);
+  }
+
+  if (BitWidth > 32) {
+    Type *VecTy = FixedVectorType::get(B.getInt32Ty(), (BitWidth + 31) / 32);
+    Data0 = B.CreateBitCast(Data0, VecTy);
+    if (Data1)
+      Data1 = B.CreateBitCast(Data1, VecTy);
+  }
+
+  Value *Result = buildLegalLaneIntrinsic(B, IID, Data0, Lane, Data1);
+
+  if ((BitWidth % 32) != 0) {
+    if (BitWidth > 32) {
+      Result = B.CreateBitCast(
+          Result, IntegerType::get(Mod->getContext(), (BitWidth + 31) & ~31));
+    }
+
+    Result =
+        B.CreateTrunc(Result, IntegerType::get(Mod->getContext(), BitWidth));
+  }
+
+  return B.CreateBitOrPointerCast(Result, Ty);
+}
+
+/// "Legalize" readfirstlane/readlane/writelane to single-dword intrinsics
+/// on i32.
+///
+/// Done during codegen prepare purely because this turned out to be simpler
+/// than doing it in this generality in SelectionDAG.
+bool AMDGPUCodeGenPrepare::visitLaneIntrinsicInst(IntrinsicInst &I,
+                                                  Intrinsic::ID CanonicalIID) {
+  Type *Ty = I.getType();
+  if (I.getIntrinsicID() == CanonicalIID && Ty->isIntegerTy(32))
+    return false; // already legal
+
+  Value *Data0 = I.getArgOperand(0);
+  Value *Lane = nullptr;
+  Value *Data1 = nullptr;
+
+  if (CanonicalIID == Intrinsic::amdgcn_readlane2) {
+    Lane = I.getArgOperand(1);
+  } else if (CanonicalIID == Intrinsic::amdgcn_writelane2) {
+    Lane = I.getArgOperand(1);
+    Data1 = I.getArgOperand(2);
+  }
+
+  IRBuilder<> Builder(&I);
+  Value *Legalized =
+      buildLegalLaneIntrinsic(Builder, CanonicalIID, Data0, Lane, Data1);
+
+  I.replaceAllUsesWith(Legalized);
+  I.eraseFromParent();
+  return true;
+}
+
 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
   Mod = &M;
   DL = &Mod->getDataLayout();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -795,9 +795,12 @@
     switch (IntrID) {
     default:
       return false;
+    case Intrinsic::amdgcn_readfirstlane2:
+    case Intrinsic::amdgcn_readlane2:
+      return true;
     case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
-      return true;
+      llvm_unreachable("should have been updated during CodeGenPrepare");
    }
  }
    break;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -630,7 +630,26 @@
     return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
   }
   case Intrinsic::amdgcn_readfirstlane:
-  case Intrinsic::amdgcn_readlane: {
+  case Intrinsic::amdgcn_readlane:
+  case Intrinsic::amdgcn_writelane: {
+    // Canonicalize to overloaded intrinsics.
+    Intrinsic::ID NewIID;
+    if (IID == Intrinsic::amdgcn_readfirstlane)
+      NewIID = Intrinsic::amdgcn_readfirstlane2;
+    else if (IID == Intrinsic::amdgcn_readlane)
+      NewIID = Intrinsic::amdgcn_readlane2;
+    else
+      NewIID = Intrinsic::amdgcn_writelane2;
+
+    SmallVector<Value *, 3> Args;
+    for (Value *Arg : II.arg_operands())
+      Args.push_back(Arg);
+    CallInst *UpgradedCall = IC.Builder.CreateIntrinsic(
+        NewIID, {II.getArgOperand(0)->getType()}, Args);
+    return IC.replaceInstUsesWith(II, UpgradedCall);
+  }
+  case Intrinsic::amdgcn_readfirstlane2:
+  case Intrinsic::amdgcn_readlane2: {
     // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
@@ -646,18 +665,30 @@
     // readfirstlane (readfirstlane x) -> readfirstlane x
     // readlane (readfirstlane x), y -> readfirstlane x
     if (match(Src,
-              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
+              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane2>())) {
       return IC.replaceInstUsesWith(II, Src);
     }
 
-    if (IID == Intrinsic::amdgcn_readfirstlane) {
+    if (IID == Intrinsic::amdgcn_readfirstlane2) {
       // readfirstlane (readlane x, y) -> readlane x, y
-      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
+      if (match(Src,
+                PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane2>())) {
         return IC.replaceInstUsesWith(II, Src);
       }
+
+      // readfirstlane (bitcast x) -> bitcast (readfirstlane x)
+      Value *BitcastInput = nullptr;
+      if (match(Src,
+                PatternMatch::m_BitCast(PatternMatch::m_Value(BitcastInput)))) {
+        CallInst *NewCall =
+            IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane2,
+                                       {BitcastInput->getType()}, BitcastInput);
+        Value *NewCast = IC.Builder.CreateBitCast(NewCall, II.getType());
+        return IC.replaceInstUsesWith(II, NewCast);
+      }
     } else {
       // readlane (readlane x, y), y -> readlane x, y
-      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
+      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane2>(
                          PatternMatch::m_Value(),
                          PatternMatch::m_Specific(II.getArgOperand(1))))) {
         return IC.replaceInstUsesWith(II, Src);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -920,7 +920,7 @@
     return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
   case Intrinsic::amdgcn_wwm:
     return constrainCopyLikeIntrin(I, AMDGPU::WWM);
-  case Intrinsic::amdgcn_writelane:
+  case Intrinsic::amdgcn_writelane2:
     return selectWritelane(I);
   case Intrinsic::amdgcn_div_scale:
     return selectDivScale(I);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -334,7 +334,7 @@
 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
   switch (MI.getIntrinsicID()) {
-  case Intrinsic::amdgcn_readlane: {
+  case Intrinsic::amdgcn_readlane2: {
     static const OpRegBankEntry<3> Table[2] = {
       // Perfectly legal.
       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
@@ -346,7 +346,7 @@
     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
   }
-  case Intrinsic::amdgcn_writelane: {
+  case Intrinsic::amdgcn_writelane2: {
     static const OpRegBankEntry<4> Table[4] = {
       // Perfectly legal.
{ { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, @@ -2966,7 +2966,7 @@ } case AMDGPU::G_INTRINSIC: { switch (MI.getIntrinsicID()) { - case Intrinsic::amdgcn_readlane: { + case Intrinsic::amdgcn_readlane2: { substituteSimpleCopyRegs(OpdMapper, 2); assert(OpdMapper.getVRegs(0).empty()); @@ -2977,7 +2977,7 @@ constrainOpWithReadfirstlane(MI, MRI, 3); // Index return; } - case Intrinsic::amdgcn_writelane: { + case Intrinsic::amdgcn_writelane2: { assert(OpdMapper.getVRegs(0).empty()); assert(OpdMapper.getVRegs(2).empty()); assert(OpdMapper.getVRegs(3).empty()); @@ -4115,7 +4115,7 @@ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); break; } - case Intrinsic::amdgcn_readlane: { + case Intrinsic::amdgcn_readlane2: { // This must be an SGPR, but accept a VGPR. Register IdxReg = MI.getOperand(3).getReg(); unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); @@ -4123,14 +4123,14 @@ OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); LLVM_FALLTHROUGH; } - case Intrinsic::amdgcn_readfirstlane: { + case Intrinsic::amdgcn_readfirstlane2: { unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); break; } - case Intrinsic::amdgcn_writelane: { + case Intrinsic::amdgcn_writelane2: { unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); Register SrcReg = MI.getOperand(2).getReg(); unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -251,6 +251,7 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -715,6 +715,8 @@ addPass(createAtomicExpandPass()); + if (EnableAtomicOptimizations) + addPass(createAMDGPUAtomicOptimizerPass()); addPass(createAMDGPULowerIntrinsicsPass()); @@ -871,10 +873,6 @@ bool GCNPassConfig::addPreISel() { AMDGPUPassConfig::addPreISel(); - if (EnableAtomicOptimizations) { - addPass(createAMDGPUAtomicOptimizerPass()); - } - // FIXME: We need to run a pass to propagate the attributes when calls are // supported. 
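Note on the pass reordering above: AMDGPUAtomicOptimizer now runs before AMDGPUCodeGenPrepare, so the wide lane intrinsics it emits (e.g. the i64 readfirstlane2 on 64-bit atomics) are still legalized to 32-bit pieces. A minimal sketch of the expected expansion, derived from the buildLegalLaneIntrinsic logic above; value names are illustrative, not taken from the patch's tests:

  ; before AMDGPUCodeGenPrepare:
  ;   %wide = call i64 @llvm.amdgcn.readfirstlane2.i64(i64 %v)
  ; after, the i64 is bitcast to <2 x i32> and handled one dword at a time:
  %vec = bitcast i64 %v to <2 x i32>
  %lo = extractelement <2 x i32> %vec, i64 0
  %rlo = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %lo)
  %hi = extractelement <2 x i32> %vec, i64 1
  %rhi = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %hi)
  %ins0 = insertelement <2 x i32> undef, i32 %rlo, i64 0
  %ins1 = insertelement <2 x i32> %ins0, i32 %rhi, i64 1
  %wide.res = bitcast <2 x i32> %ins1 to i64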
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -843,6 +843,8 @@ switch (Intrinsic->getIntrinsicID()) { default: return false; + case Intrinsic::amdgcn_readfirstlane2: + case Intrinsic::amdgcn_readlane2: case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_readlane: case Intrinsic::amdgcn_icmp: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -802,6 +802,8 @@ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v3i16, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v3f16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2203,7 +2203,7 @@ // FIXME: Should also do this for readlane, but tablegen crashes on // the ignored src1. def : GCNPat< - (int_amdgcn_readfirstlane (i32 imm:$src)), + (i32 (int_amdgcn_readfirstlane2 (i32 imm:$src))), (S_MOV_B32 SReg_32:$src) >; diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -163,7 +163,7 @@ InstSI <(outs SReg_32:$vdst), (ins VRegOrLds_32:$src0), "v_readfirstlane_b32 $vdst, $src0", - [(set i32:$vdst, (int_amdgcn_readfirstlane (i32 VRegOrLds_32:$src0)))]>, + [(set i32:$vdst, (int_amdgcn_readfirstlane2 (i32 VRegOrLds_32:$src0)))]>, Enc32 { let isCodeGenOnly = 0; diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -525,11 +525,11 @@ // These are special and do not read the exec mask. 
let isConvergent = 1, Uses = [] in { def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, - [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))]>; + [(set i32:$vdst, (int_amdgcn_readlane2 i32:$src0, i32:$src1))]>; let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, - [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))]>; + [(set i32:$vdst, (int_amdgcn_writelane2 i32:$src0, i32:$src1, i32:$vdst_in))]>; } // End $vdst = $vdst_in, DisableEncoding $vdst_in } // End isConvergent = 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir @@ -1,11 +1,11 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o - 2> %t | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel.*' %s -o - 2> %t | FileCheck -check-prefix=GCN %s # RUN: FileCheck -check-prefix=ERR %s < %t -# ERR: remark: :0:0: cannot select: %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0:sgpr(s32) (in function: readfirstlane_s) +# ERR: remark: :0:0: cannot select: %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), %0:sgpr(s32) (in function: readfirstlane_s32_s) --- -name: readfirstlane_v +name: readfirstlane_s32_v legalized: true regBankSelected: true tracksRegLiveness: true @@ -13,18 +13,18 @@ body: | bb.0: liveins: $vgpr0 - ; GCN-LABEL: name: readfirstlane_v + ; GCN-LABEL: name: readfirstlane_s32_v ; GCN: liveins: $vgpr0 ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GCN: S_ENDPGM 0, implicit [[V_READFIRSTLANE_B32_]] %0:vgpr(s32) = COPY $vgpr0 - %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0 + %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), %0 S_ENDPGM 0, implicit %1 ... --- -name: readfirstlane_v_imm +name: readfirstlane_v_s32_imm legalized: true regBankSelected: true tracksRegLiveness: true @@ -32,19 +32,19 @@ body: | bb.0: - ; GCN-LABEL: name: readfirstlane_v_imm + ; GCN-LABEL: name: readfirstlane_v_s32_imm ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 123, implicit $exec ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_]] ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 [[COPY]] ; GCN: S_ENDPGM 0, implicit [[S_MOV_B32_]] %0:vgpr(s32) = G_CONSTANT i32 123 - %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0 + %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), %0 S_ENDPGM 0, implicit %1 ... 
# Make sure this fails to select --- -name: readfirstlane_s +name: readfirstlane_s32_s legalized: true regBankSelected: true tracksRegLiveness: true @@ -52,12 +52,12 @@ body: | bb.0: liveins: $sgpr0 - ; GCN-LABEL: name: readfirstlane_s + ; GCN-LABEL: name: readfirstlane_s32_s ; GCN: liveins: $sgpr0 ; GCN: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GCN: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32) + ; GCN: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY]](s32) ; GCN: S_ENDPGM 0, implicit [[INT]](s32) %0:sgpr(s32) = COPY $sgpr0 - %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0 + %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), %0 S_ENDPGM 0, implicit %1 ... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll @@ -47,7 +47,7 @@ ; CHECK: bb.1 (%ir-block.0): ; CHECK: liveins: $vgpr0 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32) + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0 ret i32 %vgpr @@ -61,9 +61,9 @@ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64) - ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32) + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[UV]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) - ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32) + ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[UV1]](s32) ; CHECK: $sgpr1 = COPY [[INT1]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 ret i64 %vgpr @@ -77,9 +77,9 @@ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32) ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s32>) - ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32) + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[UV]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) - ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32) + ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[UV1]](s32) ; CHECK: $sgpr1 = COPY [[INT1]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 ret <2 x i32> %vgpr @@ -92,9 +92,9 @@ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32) + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) - ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC 
intrinsic(@llvm.amdgcn.readfirstlane), [[COPY1]](s32) + ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY1]](s32) ; CHECK: $sgpr1 = COPY [[INT1]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %insertvalue0 = insertvalue { i32, i32 } undef, i32 %vgpr0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll @@ -76,9 +76,9 @@ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32) + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) - ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY1]](s32) + ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY1]](s32) ; CHECK: $sgpr1 = COPY [[INT1]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 main_body: @@ -91,7 +91,7 @@ ; CHECK-LABEL: name: non_void_ret ; CHECK: bb.1 (%ir-block.0): ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[C]](s32) + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[C]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0 ret i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readfirstlane.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readfirstlane.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readfirstlane.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readfirstlane.mir @@ -12,9 +12,9 @@ ; CHECK-LABEL: name: readfirstlane_s ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY1]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY1]](s32) %0:_(s32) = COPY $sgpr0 - %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0 + %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), %0 ... --- @@ -26,7 +26,7 @@ liveins: $vgpr0 ; CHECK-LABEL: name: readfirstlane_v ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY]](s32) %0:_(s32) = COPY $vgpr0 - %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0 + %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), %0 ... 
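For reference, a sketch of the IR-level surface these tests assume: only the data operand participates in the overload (the suffix follows the usual intrinsic type-mangling rules), while the lane select stays i32. The f32 overload here is an assumed example, consistent with the declarations added in the .ll tests below:

  declare i32 @llvm.amdgcn.readlane2.i32(i32, i32)
  declare float @llvm.amdgcn.readlane2.f32(float, i32)

  define float @sample(float %data, i32 %lane) {
    ; %lane must be uniform across the active lanes of the wave
    %r = call float @llvm.amdgcn.readlane2.f32(float %data, i32 %lane)
    ret float %r
  }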
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readlane.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readlane.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readlane.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readlane.mir @@ -13,10 +13,10 @@ ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), [[COPY2]](s32), [[COPY1]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane2), [[COPY2]](s32), [[COPY1]](s32) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 - %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), %0, %1 + %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane2), %0, %1 ... --- @@ -29,10 +29,10 @@ ; CHECK-LABEL: name: readlane_vs ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane2), [[COPY]](s32), [[COPY1]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $sgpr0 - %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), %0, %1 + %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane2), %0, %1 ... --- @@ -46,10 +46,10 @@ ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec - ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), [[COPY]](s32), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane2), [[COPY]](s32), [[V_READFIRSTLANE_B32_]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 - %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), %0, %1 + %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane2), %0, %1 ... --- @@ -64,8 +64,8 @@ ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec - ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), [[COPY2]](s32), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane2), [[COPY2]](s32), [[V_READFIRSTLANE_B32_]](s32) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $vgpr0 - %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), %0, %1 + %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane2), %0, %1 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll @@ -15,7 +15,7 @@ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 4) ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[AMDGPU_S_BUFFER_LOAD]](s32) - ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY5]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 0) @@ -35,10 +35,10 @@ ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 8, align 4) ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<2 x s32>) ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) - ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY5]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32) - ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32) + ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY6]](s32) ; CHECK: $sgpr1 = COPY [[INT1]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %val = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %rsrc, i32 %soffset, i32 0) @@ -61,13 +61,13 @@ ; CHECK: [[UV:%[0-9]+]]:sgpr(<3 x s32>), [[UV1:%[0-9]+]]:sgpr(<3 x s32>), [[UV2:%[0-9]+]]:sgpr(<3 x s32>), [[UV3:%[0-9]+]]:sgpr(<3 x s32>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s32>) ; CHECK: [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[UV]](<3 x s32>) ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32) - ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY5]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32) - ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32) + ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY6]](s32) ; CHECK: $sgpr1 = COPY [[INT1]](s32) ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32) - ; CHECK: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32) + ; CHECK: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY7]](s32) ; CHECK: $sgpr2 = COPY [[INT2]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 %val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x 
i32> %rsrc, i32 %soffset, i32 0) @@ -87,28 +87,28 @@ ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 32, align 4) ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>) ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) - ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY5]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32) - ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32) + ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY6]](s32) ; CHECK: $sgpr1 = COPY [[INT1]](s32) ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32) - ; CHECK: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32) + ; CHECK: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY7]](s32) ; CHECK: $sgpr2 = COPY [[INT2]](s32) ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32) - ; CHECK: [[INT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32) + ; CHECK: [[INT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY8]](s32) ; CHECK: $sgpr3 = COPY [[INT3]](s32) ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32) - ; CHECK: [[INT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32) + ; CHECK: [[INT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY9]](s32) ; CHECK: $sgpr4 = COPY [[INT4]](s32) ; CHECK: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32) - ; CHECK: [[INT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32) + ; CHECK: [[INT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY10]](s32) ; CHECK: $sgpr5 = COPY [[INT5]](s32) ; CHECK: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32) - ; CHECK: [[INT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32) + ; CHECK: [[INT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY11]](s32) ; CHECK: $sgpr6 = COPY [[INT6]](s32) ; CHECK: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32) - ; CHECK: [[INT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32) + ; CHECK: [[INT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY12]](s32) ; CHECK: $sgpr7 = COPY [[INT7]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7 %val = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %rsrc, i32 %soffset, i32 0) @@ -128,52 +128,52 @@ ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 64, align 4) ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), 
[[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32), [[UV8:%[0-9]+]]:sgpr(s32), [[UV9:%[0-9]+]]:sgpr(s32), [[UV10:%[0-9]+]]:sgpr(s32), [[UV11:%[0-9]+]]:sgpr(s32), [[UV12:%[0-9]+]]:sgpr(s32), [[UV13:%[0-9]+]]:sgpr(s32), [[UV14:%[0-9]+]]:sgpr(s32), [[UV15:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<16 x s32>) ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) - ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY5]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32) - ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32) + ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY6]](s32) ; CHECK: $sgpr1 = COPY [[INT1]](s32) ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32) - ; CHECK: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32) + ; CHECK: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY7]](s32) ; CHECK: $sgpr2 = COPY [[INT2]](s32) ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32) - ; CHECK: [[INT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32) + ; CHECK: [[INT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY8]](s32) ; CHECK: $sgpr3 = COPY [[INT3]](s32) ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32) - ; CHECK: [[INT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32) + ; CHECK: [[INT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY9]](s32) ; CHECK: $sgpr4 = COPY [[INT4]](s32) ; CHECK: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32) - ; CHECK: [[INT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32) + ; CHECK: [[INT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY10]](s32) ; CHECK: $sgpr5 = COPY [[INT5]](s32) ; CHECK: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32) - ; CHECK: [[INT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32) + ; CHECK: [[INT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY11]](s32) ; CHECK: $sgpr6 = COPY [[INT6]](s32) ; CHECK: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32) - ; CHECK: [[INT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32) + ; CHECK: [[INT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY12]](s32) ; CHECK: $sgpr7 = COPY [[INT7]](s32) ; CHECK: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[UV8]](s32) - ; CHECK: [[INT8:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY13]](s32) + ; CHECK: [[INT8:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY13]](s32) ; CHECK: $sgpr8 = COPY [[INT8]](s32) ; CHECK: [[COPY14:%[0-9]+]]:vgpr(s32) = COPY [[UV9]](s32) - ; CHECK: [[INT9:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY14]](s32) + ; CHECK: [[INT9:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY14]](s32) ; CHECK: $sgpr9 = COPY [[INT9]](s32) ; CHECK: [[COPY15:%[0-9]+]]:vgpr(s32) = COPY [[UV10]](s32) - ; CHECK: 
[[INT10:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY15]](s32) + ; CHECK: [[INT10:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY15]](s32) ; CHECK: $sgpr10 = COPY [[INT10]](s32) ; CHECK: [[COPY16:%[0-9]+]]:vgpr(s32) = COPY [[UV11]](s32) - ; CHECK: [[INT11:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY16]](s32) + ; CHECK: [[INT11:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY16]](s32) ; CHECK: $sgpr11 = COPY [[INT11]](s32) ; CHECK: [[COPY17:%[0-9]+]]:vgpr(s32) = COPY [[UV12]](s32) - ; CHECK: [[INT12:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY17]](s32) + ; CHECK: [[INT12:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY17]](s32) ; CHECK: $sgpr12 = COPY [[INT12]](s32) ; CHECK: [[COPY18:%[0-9]+]]:vgpr(s32) = COPY [[UV13]](s32) - ; CHECK: [[INT13:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY18]](s32) + ; CHECK: [[INT13:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY18]](s32) ; CHECK: $sgpr13 = COPY [[INT13]](s32) ; CHECK: [[COPY19:%[0-9]+]]:vgpr(s32) = COPY [[UV14]](s32) - ; CHECK: [[INT14:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY19]](s32) + ; CHECK: [[INT14:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY19]](s32) ; CHECK: $sgpr14 = COPY [[INT14]](s32) ; CHECK: [[COPY20:%[0-9]+]]:vgpr(s32) = COPY [[UV15]](s32) - ; CHECK: [[INT15:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY20]](s32) + ; CHECK: [[INT15:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY20]](s32) ; CHECK: $sgpr15 = COPY [[INT15]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15 %val = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.writelane.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.writelane.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.writelane.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.writelane.mir @@ -14,11 +14,11 @@ ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), [[COPY]](s32), [[COPY1]](s32), [[COPY3]](s32) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), [[COPY]](s32), [[COPY1]](s32), [[COPY3]](s32) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 %2:_(s32) = COPY $sgpr2 - %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), %0, %1, %2 + %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), %0, %1, %2 ... 
--- @@ -32,11 +32,11 @@ ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 %2:_(s32) = COPY $vgpr0 - %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), %0, %1, %2 + %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), %0, %1, %2 ... --- @@ -51,11 +51,11 @@ ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), [[COPY2]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $sgpr0 %2:_(s32) = COPY $vgpr1 - %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), %0, %1, %2 + %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), %0, %1, %2 ... --- @@ -71,11 +71,11 @@ ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[COPY2]](s32) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[COPY2]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 - %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), %0, %1, %2 + %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), %0, %1, %2 ... --- @@ -90,9 +90,9 @@ ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), [[COPY]](s32), [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), [[COPY]](s32), [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $vgpr0 %2:_(s32) = COPY $vgpr1 - %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), %0, %1, %2 + %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), %0, %1, %2 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -1,11 +1,20 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s -declare i32 @llvm.amdgcn.readfirstlane(i32) #0 +declare i32 @llvm.amdgcn.readfirstlane2.i32(i32) #0 +declare float @llvm.amdgcn.readfirstlane2.f32(float) #0 +declare <2 x half> @llvm.amdgcn.readfirstlane2.v2f16(<2 x half>) #0 +declare <2 x i16> @llvm.amdgcn.readfirstlane2.v2i16(<2 x i16>) #0 +declare i8 addrspace(3)* @llvm.amdgcn.readfirstlane2.p3i8(i8 addrspace(3)*) #0 +declare i16 @llvm.amdgcn.readfirstlane2.i16(i16) #0 +declare half @llvm.amdgcn.readfirstlane2.f16(half) #0 +declare <3 x i16> @llvm.amdgcn.readfirstlane2.v3i16(<3 x i16>) #0 +declare <9 x float> @llvm.amdgcn.readfirstlane2.v9f32(<9 x float>) #0 -; CHECK-LABEL: {{^}}test_readfirstlane: +; CHECK-LABEL: {{^}}test_readfirstlane_i32: ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 -define void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 { - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src) +define void @test_readfirstlane_i32(i32 addrspace(1)* %out, i32 %src) #1 { + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %src) store i32 %readfirstlane, i32 addrspace(1)* %out, align 4 ret void } @@ -15,7 +24,7 @@ ; CHECK-NOT: [[SGPR_VAL]] ; CHECK: ; use [[SGPR_VAL]] define amdgpu_kernel void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 { - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32) + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 32) call void asm sideeffect "; use $0", "s"(i32 %readfirstlane) ret void } @@ -25,7 +34,7 @@ ; CHECK-NOT: [[VVAL]] ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]] define amdgpu_kernel void @test_readfirstlane_imm_fold(i32 addrspace(1)* %out) #1 { - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32) + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 32) store i32 %readfirstlane, i32 addrspace(1)* %out, align 4 ret void } @@ -36,7 +45,7 @@ ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]] define amdgpu_kernel void @test_readfirstlane_m0(i32 addrspace(1)* %out) #1 { %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"() - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %m0) + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %m0) store i32 %readfirstlane, i32 addrspace(1)* %out, align 4 ret void } @@ -51,7 +60,7 @@ ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]] define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(i32 addrspace(1)* %out) #1 { %sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"() - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %sgpr) + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %sgpr) store i32 %readfirstlane, i32 addrspace(1)* %out, align 4 ret void } @@ -62,10 +71,91 @@ define amdgpu_kernel void @test_readfirstlane_fi(i32 addrspace(1)* %out) #1 { %alloca = alloca i32, addrspace(5) %int = ptrtoint i32 addrspace(5)* %alloca to i32 - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %int) + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %int) call void asm sideeffect "; use $0", "s"(i32 %readfirstlane) ret void } +; CHECK-LABEL: 
{{^}}test_readfirstlane_f32: +; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_f32(float addrspace(1)* %out, float %src) #1 { + %readfirstlane = call float @llvm.amdgcn.readfirstlane2.f32(float %src) + store float %readfirstlane, float addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_v2f16: +; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %src) #1 { + %readfirstlane = call <2 x half> @llvm.amdgcn.readfirstlane2.v2f16(<2 x half> %src) + store <2 x half> %readfirstlane, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_v2i16: +; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %src) #1 { + %readfirstlane = call <2 x i16> @llvm.amdgcn.readfirstlane2.v2i16(<2 x i16> %src) + store <2 x i16> %readfirstlane, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_p3: +; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_p3(i8 addrspace(3)* addrspace(1)* %out, i8 addrspace(3)* %src) #1 { + %readfirstlane = call i8 addrspace(3)* @llvm.amdgcn.readfirstlane2.p3i8(i8 addrspace(3)* %src) + store i8 addrspace(3)* %readfirstlane, i8 addrspace(3)* addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_i16: +; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_i16(i16 addrspace(1)* %out, i16 %src) { + %readfirstlane = call i16 @llvm.amdgcn.readfirstlane2.i16(i16 %src) + store i16 %readfirstlane, i16 addrspace(1)* %out, align 2 + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_f16: +; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_f16(half addrspace(1)* %out, half %src) { + %readfirstlane = call half @llvm.amdgcn.readfirstlane2.f16(half %src) + store half %readfirstlane, half addrspace(1)* %out, align 2 + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_v3i16: +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %src) { + %readfirstlane = call <3 x i16> @llvm.amdgcn.readfirstlane2.v3i16(<3 x i16> %src) + store <3 x i16> %readfirstlane, <3 x i16> addrspace(1)* %out, align 2 + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_v9f32: +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v2 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v3 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v4 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v5 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v6 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v7 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v8 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v9 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v10 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_v9f32(<9 x float> addrspace(1)* %out, <9 x float> %src) { + %readfirstlane = call <9 x float> @llvm.amdgcn.readfirstlane2.v9f32(<9 x float> %src) + store <9 x float> %readfirstlane, <9 x float> addrspace(1)* %out, align 2 + ret void +} + attributes #0 = { nounwind readnone convergent } attributes #1 = { 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -1,6 +1,11 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s

 declare i32 @llvm.amdgcn.readlane(i32, i32) #0
+declare i8 addrspace(3)* @llvm.amdgcn.readlane2.p3i8(i8 addrspace(3)*, i32) #0
+declare i16 @llvm.amdgcn.readlane2.i16(i16, i32) #0
+declare half @llvm.amdgcn.readlane2.f16(half, i32) #0
+declare <3 x i16> @llvm.amdgcn.readlane2.v3i16(<3 x i16>, i32) #0
+declare <9 x float> @llvm.amdgcn.readlane2.v9f32(<9 x float>, i32) #0

 ; CHECK-LABEL: {{^}}test_readlane_sreg_sreg:
 ; CHECK-NOT: v_readlane_b32
@@ -77,6 +82,60 @@
   ret void
 }

+; CHECK-LABEL: {{^}}test_readlane_p3:
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v2, 15
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_p3(i8 addrspace(3)* addrspace(1)* %out, i8 addrspace(3)* %src) #1 {
+  %readlane = call i8 addrspace(3)* @llvm.amdgcn.readlane2.p3i8(i8 addrspace(3)* %src, i32 15)
+  store i8 addrspace(3)* %readlane, i8 addrspace(3)* addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_i16:
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v2, 15
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_i16(i16 addrspace(1)* %out, i16 %src) {
+  %readlane = call i16 @llvm.amdgcn.readlane2.i16(i16 %src, i32 15)
+  store i16 %readlane, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_f16:
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v2, 15
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_f16(half addrspace(1)* %out, half %src) {
+  %readlane = call half @llvm.amdgcn.readlane2.f16(half %src, i32 15)
+  store half %readlane, half addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_v3i16:
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}},
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}},
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %src) {
+  %readlane = call <3 x i16> @llvm.amdgcn.readlane2.v3i16(<3 x i16> %src, i32 15)
+  store <3 x i16> %readlane, <3 x i16> addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_v9f32:
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v2, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v3, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v4, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v5, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v6, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v7, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v8, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v9, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v10, 15
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_v9f32(<9 x float> addrspace(1)* %out, <9 x float> %src) {
+  %readlane = call <9 x float> @llvm.amdgcn.readlane2.v9f32(<9 x float> %src, i32 15)
+  store <9 x float> %readlane, <9 x float> addrspace(1)* %out, align 2
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #2

 attributes #0 = { nounwind readnone convergent }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -3,6 +3,11 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX10 %s

 declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0
+declare i8 addrspace(3)* @llvm.amdgcn.writelane2.p3i8(i8 addrspace(3)*, i32, i8 addrspace(3)*) #0
+declare i16 @llvm.amdgcn.writelane2.i16(i16, i32, i16) #0
+declare half @llvm.amdgcn.writelane2.f16(half, i32, half) #0
+declare <3 x i16> @llvm.amdgcn.writelane2.v3i16(<3 x i16>, i32, <3 x i16>) #0
+declare <9 x float> @llvm.amdgcn.writelane2.v9f32(<9 x float>, i32, <9 x float>) #0

 ; CHECK-LABEL: {{^}}test_writelane_sreg:
 ; CIGFX9: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0
@@ -79,6 +84,60 @@
   ret void
 }

+; CHECK-LABEL: {{^}}test_writelane_p3:
+; CHECK: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_p3(i8 addrspace(3)* addrspace(1)* %out, i8 addrspace(3)* %src) #1 {
+  %writelane = call i8 addrspace(3)* @llvm.amdgcn.writelane2.p3i8(i8 addrspace(3)* null, i32 15, i8 addrspace(3)* %src)
+  store i8 addrspace(3)* %writelane, i8 addrspace(3)* addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_i16:
+; CHECK: v_writelane_b32 v{{[0-9]+}},
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_i16(i16 addrspace(1)* %out, i16 %src) {
+  %writelane = call i16 @llvm.amdgcn.writelane2.i16(i16 1234, i32 15, i16 %src)
+  store i16 %writelane, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_f16:
+; CHECK: v_writelane_b32 v{{[0-9]+}},
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_f16(half addrspace(1)* %out, half %src) {
+  %writelane = call half @llvm.amdgcn.writelane2.f16(half 1.0, i32 15, half %src)
+  store half %writelane, half addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_v3i16:
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}},
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}},
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %src) {
+  %writelane = call <3 x i16> @llvm.amdgcn.writelane2.v3i16(<3 x i16> zeroinitializer, i32 15, <3 x i16> %src)
+  store <3 x i16> %writelane, <3 x i16> addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_v9f32:
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_v9f32(<9 x float> addrspace(1)* %out, <9 x float> %src) {
+  %writelane = call <9 x float> @llvm.amdgcn.writelane2.v9f32(<9 x float> zeroinitializer, i32 15, <9 x float> %src)
+  store <9 x float> %writelane, <9 x float> addrspace(1)* %out, align 2
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #2

 attributes #0 = { nounwind readnone convergent }
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -2507,8 +2507,8 @@
 define amdgpu_kernel void @readfirstlane_constant(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_constant(
-; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
-; CHECK-NEXT: store volatile i32 [[VAR]], i32* undef, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT: store volatile i32 [[TMP1]], i32* undef, align 4
 ; CHECK-NEXT: store volatile i32 0, i32* undef, align 4
 ; CHECK-NEXT: store volatile i32 123, i32* undef, align 4
 ; CHECK-NEXT: store volatile i32 ptrtoint (i32* @gv to i32), i32* undef, align 4
@@ -2530,8 +2530,8 @@
 define i32 @readfirstlane_idempotent(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_idempotent(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
-; CHECK-NEXT: ret i32 [[READ0]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
   %read1 = call i32 @llvm.amdgcn.readfirstlane(i32 %read0)
@@ -2541,8 +2541,8 @@
 define i32 @readfirstlane_readlane(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_readlane(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
-; CHECK-NEXT: ret i32 [[READ0]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
   %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 0)
@@ -2552,11 +2552,11 @@
 define i32 @readfirstlane_readfirstlane_different_block(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_readfirstlane_different_block(
 ; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 [[ARG:%.*]])
 ; CHECK-NEXT: br label [[BB1:%.*]]
 ; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]])
-; CHECK-NEXT: ret i32 [[READ1]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 [[TMP0]])
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
 bb0:
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
@@ -2570,11 +2570,11 @@
 define i32 @readfirstlane_readlane_different_block(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_readlane_different_block(
 ; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 0)
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readlane2.i32(i32 [[ARG:%.*]], i32 0)
 ; CHECK-NEXT: br label [[BB1:%.*]]
 ; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]])
-; CHECK-NEXT: ret i32 [[READ1]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 [[TMP0]])
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
 bb0:
   %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 0)
@@ -2585,6 +2585,41 @@
   ret i32 %read1
 }

+define i32 @readfirstlane_bitcast(float %arg) {
+; CHECK-LABEL: @readfirstlane_bitcast(
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane2.f32(float [[ARG:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[TMP1]] to i32
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+  %bitcast.arg = bitcast float %arg to i32
+  %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+  ret i32 %read
+}
+
+define float @bitcast_readfirstlane_bitcast(float %arg) {
+; CHECK-LABEL: @bitcast_readfirstlane_bitcast(
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane2.f32(float [[ARG:%.*]])
+; CHECK-NEXT: ret float [[TMP1]]
+;
+  %bitcast.arg = bitcast float %arg to i32
+  %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+  %cast.read = bitcast i32 %read to float
+  ret float %cast.read
+}
+
+define i32 @readfirstlane_bitcast_multi_use(float %arg) {
+; CHECK-LABEL: @readfirstlane_bitcast_multi_use(
+; CHECK-NEXT: store float [[ARG:%.*]], float* undef, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane2.f32(float [[ARG]])
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[TMP1]] to i32
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+  %bitcast.arg = bitcast float %arg to i32
+  store i32 %bitcast.arg, i32* undef
+  %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+  ret i32 %read
+}
+
 ; --------------------------------------------------------------------
 ; llvm.amdgcn.readlane
 ; --------------------------------------------------------------------
@@ -2593,8 +2628,8 @@
 define amdgpu_kernel void @readlane_constant(i32 %arg, i32 %lane) {
 ; CHECK-LABEL: @readlane_constant(
-; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 7)
-; CHECK-NEXT: store volatile i32 [[VAR]], i32* undef, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane2.i32(i32 [[ARG:%.*]], i32 7)
+; CHECK-NEXT: store volatile i32 [[TMP1]], i32* undef, align 4
 ; CHECK-NEXT: store volatile i32 0, i32* undef, align 4
 ; CHECK-NEXT: store volatile i32 123, i32* undef, align 4
 ; CHECK-NEXT: store volatile i32 ptrtoint (i32* @gv to i32), i32* undef, align 4
@@ -2616,8 +2651,8 @@
 define i32 @readlane_idempotent(i32 %arg, i32 %lane) {
 ; CHECK-LABEL: @readlane_idempotent(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
-; CHECK-NEXT: ret i32 [[READ0]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane2.i32(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
   %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane)
   %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 %lane)
@@ -2626,9 +2661,9 @@
 define i32 @readlane_idempotent_different_lanes(i32 %arg, i32 %lane0, i32 %lane1) {
 ; CHECK-LABEL: @readlane_idempotent_different_lanes(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE0:%.*]])
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE1:%.*]])
-; CHECK-NEXT: ret i32 [[READ1]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane2.i32(i32 [[ARG:%.*]], i32 [[LANE0:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.readlane2.i32(i32 [[TMP1]], i32 [[LANE1:%.*]])
+; CHECK-NEXT: ret i32 [[TMP2]]
 ;
   %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane0)
   %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 %lane1)
@@ -2637,8 +2672,8 @@
 define i32 @readlane_readfirstlane(i32 %arg) {
 ; CHECK-LABEL: @readlane_readfirstlane(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
-; CHECK-NEXT: ret i32 [[READ0]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
   %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 0)
@@ -2648,11 +2683,11 @@
 define i32 @readlane_idempotent_different_block(i32 %arg, i32 %lane) {
 ; CHECK-LABEL: @readlane_idempotent_different_block(
 ; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readlane2.i32(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
 ; CHECK-NEXT: br label [[BB1:%.*]]
 ; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE]])
-; CHECK-NEXT: ret i32 [[READ1]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane2.i32(i32 [[TMP0]], i32 [[LANE]])
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
 bb0:
   %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane)
@@ -2667,11 +2702,11 @@
 define i32 @readlane_readfirstlane_different_block(i32 %arg) {
 ; CHECK-LABEL: @readlane_readfirstlane_different_block(
 ; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 [[ARG:%.*]])
 ; CHECK-NEXT: br label [[BB1:%.*]]
 ; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 0)
-; CHECK-NEXT: ret i32 [[READ1]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane2.i32(i32 [[TMP0]], i32 0)
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
 bb0:
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)