diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -413,6 +413,18 @@
   /// offset.
   int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);
 
+  // Before finalizing the GEP with a constant offset, we need to ensure that
+  // the offset is valid in every addressing mode in which it will be used. We
+  // cannot assume that the addressing mode of the GEP itself matches the
+  // addressing modes of all its users, so this function walks the transitive
+  // uses of the GEP and checks that the new offset is legal in each of them.
+  bool traceAndCheckGEPUses(Value *V, int64_t AccumulativeByteOffset,
+                            SmallVectorImpl<Instruction *> &Visited);
+
+  // Helper function to trace ptrtoint uses of the GEP.
+  bool resolvePtrToInt(Value *V, int64_t AccumulativeByteOffset,
+                       SmallVectorImpl<Instruction *> &Visited);
+
   /// Canonicalize array indices to pointer-size integers. This helps to
   /// simplify the logic of splitting a GEP. For example, if a + b is a
   /// pointer-size integer, we have
@@ -842,6 +854,191 @@
   return AccumulativeByteOffset;
 }
 
+bool SeparateConstOffsetFromGEP::resolvePtrToInt(
+    Value *V, int64_t AccumulativeByteOffset,
+    SmallVectorImpl<Instruction *> &Visited) {
+  typedef function_ref<bool(const Instruction *, const Value *)> SpecialCase;
+  SmallVector<SpecialCase> UnableToTrace;
+  SmallVector<SpecialCase> ShouldNotTrace;
+
+  // If it is used as an argument in a function call, we cannot be sure how it
+  // will be used, so we conservatively do not allow this.
+  const SpecialCase isArgInCall = [](const Instruction *Inst, const Value *V) {
+    if (const CallBase *CB = dyn_cast<CallBase>(Inst)) {
+      // We are able to reason about intrinsics
+      return CB->getIntrinsicID() == Intrinsic::not_intrinsic;
+    }
+    return false;
+  };
+
+  UnableToTrace.push_back(isArgInCall);
+
+  // We don't need to trace the bool from comparison instructions.
+  const SpecialCase isCmpInst = [](const Instruction *Inst, const Value *V) {
+    const CmpInst *CI = dyn_cast<CmpInst>(Inst);
+    return CI;
+  };
+
+  ShouldNotTrace.push_back(isCmpInst);
+
+  bool AllUsesValid = true;
+  auto I = V->use_begin();
+  auto E = V->use_end();
+
+  for (; I != E; I++) {
+    bool InterestingCase = false;
+    Instruction *Inst = dyn_cast<Instruction>(I->getUser());
+    if (std::find(Visited.begin(), Visited.end(), Inst) != Visited.end())
+      continue;
+
+    Visited.push_back(Inst);
+
+    for (auto &F : UnableToTrace) {
+      if (F(Inst, V)) {
+        AllUsesValid = false;
+        InterestingCase = true;
+      }
+    }
+
+    for (auto &F : ShouldNotTrace) {
+      if (F(Inst, V))
+        InterestingCase = true;
+    }
+
+    if (IntToPtrInst *PTI = dyn_cast<IntToPtrInst>(Inst)) {
+      InterestingCase = true;
+      // Trace the pointer.
+      AllUsesValid &=
+          traceAndCheckGEPUses(PTI, AccumulativeByteOffset, Visited);
+    }
+
+    if (!InterestingCase)
+      AllUsesValid &= resolvePtrToInt(Inst, AccumulativeByteOffset, Visited);
+
+    if (!AllUsesValid)
+      break;
+  }
+  return AllUsesValid;
+}
+
+bool SeparateConstOffsetFromGEP::traceAndCheckGEPUses(
+    Value *V, int64_t AccumulativeByteOffset,
+    SmallVectorImpl<Instruction *> &Visited) {
+  typedef function_ref<bool(const Instruction *, const Value *)> SpecialCase;
+  SmallVector<SpecialCase> UnableToTrace;
+
+  const SpecialCase isArgInCall = [](const Instruction *Inst, const Value *V) {
+    if (const CallBase *CB = dyn_cast<CallBase>(Inst)) {
+      bool ret = CB->getIntrinsicID() == Intrinsic::not_intrinsic;
+      if (ret) {
+        // If it is not an indirect call, then the ptr must be used as
+        // a function argument.
+        if (!CB->isIndirectCall())
+          return true;
+        // Check that the ptr isn't being used as the fptr that is indirectly
+        // called.
+        auto ArgMatch = std::find(CB->arg_begin(), CB->arg_end(), V);
+        if (ArgMatch != CB->arg_end() && *ArgMatch != CB->getCalledOperand()) {
+          // The GEP ptr is passed as an arg to the function.
+          return true;
+        }
+      }
+    }
+    return false;
+  };
+
+  UnableToTrace.push_back(isArgInCall);
+
+  SmallVector<SpecialCase> MemoryAccessFromGEP;
+
+  const SpecialCase isReadOrWrite = [](const Instruction *Inst,
+                                       const Value *V) {
+    return Inst->mayReadOrWriteMemory();
+  };
+
+  const SpecialCase isFuncPtrInCall = [](const Instruction *Inst,
+                                         const Value *V) {
+    if (const CallBase *CB = dyn_cast<CallBase>(Inst)) {
+      // Direct calls do not use fptrs.
+      if (!CB->isIndirectCall())
+        return false;
+      // Is the value the fptr in the indirect call?
+      return V == CB->getCalledOperand();
+    }
+    return false;
+  };
+
+  const SpecialCase isPtrInTerminator = [](const Instruction *Inst,
+                                           const Value *V) {
+    return Inst->isTerminator();
+  };
+
+  MemoryAccessFromGEP.push_back(isReadOrWrite);
+  MemoryAccessFromGEP.push_back(isFuncPtrInCall);
+  MemoryAccessFromGEP.push_back(isPtrInTerminator);
+
+  SmallVector<SpecialCase> ShouldNotTrace;
+
+  const SpecialCase isCmpInst = [](const Instruction *Inst, const Value *V) {
+    const CmpInst *CI = dyn_cast<CmpInst>(Inst);
+    return CI;
+  };
+
+  ShouldNotTrace.push_back(isCmpInst);
+
+  bool AllUsesValid = true;
+  auto I = V->use_begin();
+  auto E = V->use_end();
+  for (; I != E; I++) {
+    Instruction *Inst = dyn_cast<Instruction>(I->getUser());
+    // Avoid infinite loops by not exploring already encountered instructions.
+    if (std::find(Visited.begin(), Visited.end(), Inst) != Visited.end())
+      continue;
+    Visited.push_back(Inst);
+    bool InterestingCase = false;
+
+    for (auto &F : UnableToTrace) {
+      if (F(Inst, V)) {
+        InterestingCase = true;
+        AllUsesValid = false;
+      }
+    }
+
+    for (auto &F : ShouldNotTrace) {
+      if (F(Inst, V))
+        InterestingCase = true;
+    }
+
+    for (auto &F : MemoryAccessFromGEP) {
+      // If we don't already have a reason to exit and this use accesses memory
+      // using the GEP address, check the legality of the new offset.
+      if (AllUsesValid && F(Inst, V)) {
+        InterestingCase = true;
+        unsigned AddrSpace = V->getType()->getPointerAddressSpace();
+        TargetTransformInfo &TTI = GetTTI(*Inst->getFunction());
+        bool IsValid = TTI.isLegalAddressingMode(
+            V->getType(),
+            /*BaseGV=*/nullptr, AccumulativeByteOffset,
+            /*HasBaseReg=*/true, /*Scale=*/0, AddrSpace);
+        AllUsesValid &= IsValid;
+      }
+    }
+
+    if (PtrToIntInst *PTI = dyn_cast<PtrToIntInst>(Inst)) {
+      InterestingCase = true;
+      AllUsesValid &= resolvePtrToInt(PTI, AccumulativeByteOffset, Visited);
+    }
+
+    if (!InterestingCase)
+      AllUsesValid &=
+          traceAndCheckGEPUses(Inst, AccumulativeByteOffset, Visited);
+
+    if (!AllUsesValid)
+      break;
+  }
+  return AllUsesValid;
+}
+
 void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
     GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) {
   IRBuilder<> Builder(Variadic);
@@ -994,6 +1191,11 @@
                                     AddrSpace)) {
       return Changed;
     }
+
+    SmallVector<Instruction *> UseChain;
+    if (!traceAndCheckGEPUses(GEP, AccumulativeByteOffset, UseChain)) {
+      return Changed;
+    }
   }
 
   // Remove the constant offset in each sequential index. The resultant GEP
diff --git a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll
@@ -0,0 +1,256 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx90a -O3 < %s | FileCheck %s
+
+declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* nocapture, double) #8
+
+define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, double addrspace(1)* %b, double %c) {
+; CHECK-LABEL: IllegalGEPConst:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    s_add_i32 s0, s2, -1
+; CHECK-NEXT:    s_ashr_i32 s1, s0, 31
+; CHECK-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; CHECK-NEXT:    s_add_u32 s0, s4, s0
+; CHECK-NEXT:    s_addc_u32 s1, s5, s1
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; CHECK-NEXT:    flat_atomic_add_f64 v[2:3], v[0:1]
+; CHECK-NEXT:    s_endpgm
+entry:
+  %i = add nsw i32 %a, -1
+  %i.2 = sext i32 %i to i64
+  %i.3 = getelementptr inbounds double, double addrspace(1)* %b, i64 %i.2
+  %i.4 = addrspacecast double addrspace(1)* %i.3 to double*
+  %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %i.4, double %c) #23
+  ret void
+}
+
+define protected amdgpu_kernel void @MixedGEP(i32 %a, double addrspace(1)* %b, double %c, double* %d) {
+; CHECK-LABEL: MixedGEP:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT:    s_load_dword s8, s[0:1], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    s_add_i32 s0, s8, -1
+; CHECK-NEXT:    s_ashr_i32 s1, s0, 31
+; CHECK-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; CHECK-NEXT:    s_add_u32 s0, s4, s0
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    s_addc_u32 s1, s5, s1
+; CHECK-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; CHECK-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
+; CHECK-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; CHECK-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3] offset:1
+; CHECK-NEXT:    s_endpgm
+entry:
+  %i = add nsw i32 %a, -1
+  %i.2 = sext i32 %i to i64
+  %i.3 = getelementptr inbounds double, double addrspace(1)* %b, i64 %i.2
+  br label %bb1
+
+bb1:
+  %i.7 = ptrtoint double addrspace(1)* %i.3 to i64
+  %i.8 = add nsw i64 %i.7, 1
+  %i.9 = inttoptr i64 %i.8 to double addrspace(1)*
+  %i.10 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double * %d, double %c) #23
+  %i.11 = addrspacecast double addrspace(1)* %i.9 to double*
+  %i.12 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %i.11, double %c) #23
+  ret void
+}
+
+
+declare double @foo(double addrspace(1) *)
+
+define protected amdgpu_kernel void @GEPAsFnArg(i32 %a, double addrspace(1)* %b) {
+; CHECK-LABEL: GEPAsFnArg:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CHECK-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CHECK-NEXT:    s_mov_b32 s38, -1
+; CHECK-NEXT:    s_mov_b32 s39, 0xe00000
+; CHECK-NEXT:    s_add_u32 s36, s36, s11
+; CHECK-NEXT:    s_mov_b32 s14, s10
+; CHECK-NEXT:    s_mov_b32 s12, s8
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; CHECK-NEXT:    s_load_dword s8, s[4:5], 0x24
+; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2c
+; CHECK-NEXT:    s_addc_u32 s37, s37, 0
+; CHECK-NEXT:    s_mov_b32 s13, s9
+; CHECK-NEXT:    v_mov_b32_e32 v31, v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_add_i32 s8, s8, -1
+; CHECK-NEXT:    s_ashr_i32 s9, s8, 31
+; CHECK-NEXT:    s_lshl_b64 s[8:9], s[8:9], 3
+; CHECK-NEXT:    s_add_u32 s15, s6, s8
+; CHECK-NEXT:    s_addc_u32 s18, s7, s9
+; CHECK-NEXT:    s_add_u32 s8, s4, 52
+; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, foo@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CHECK-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CHECK-NEXT:    v_mov_b32_e32 v0, s15
+; CHECK-NEXT:    v_mov_b32_e32 v1, s18
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    s_endpgm
+entry:
+  %i = add nsw i32 %a, -1
+  %i.2 = sext i32 %i to i64
+  %i.3 = getelementptr inbounds double, double addrspace(1)* %b, i64 %i.2
+  %i.4 = call double @foo(double addrspace(1)* %i.3)
+  ret void
+}
+
+declare double @bar(i64)
+
+define protected amdgpu_kernel void @GEPAsIntFnArg(i32 %a, double addrspace(1)* %b) {
+; CHECK-LABEL: GEPAsIntFnArg:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CHECK-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CHECK-NEXT:    s_mov_b32 s38, -1
+; CHECK-NEXT:    s_mov_b32 s39, 0xe00000
+; CHECK-NEXT:    s_add_u32 s36, s36, s11
+; CHECK-NEXT:    s_mov_b32 s14, s10
+; CHECK-NEXT:    s_mov_b32 s12, s8
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; CHECK-NEXT:    s_load_dword s8, s[4:5], 0x24
+; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2c
+; CHECK-NEXT:    s_addc_u32 s37, s37, 0
+; CHECK-NEXT:    s_mov_b32 s13, s9
+; CHECK-NEXT:    v_mov_b32_e32 v31, v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_add_i32 s8, s8, -1
+; CHECK-NEXT:    s_ashr_i32 s9, s8, 31
+; CHECK-NEXT:    s_lshl_b64 s[8:9], s[8:9], 3
+; CHECK-NEXT:    s_add_u32 s6, s6, s8
+; CHECK-NEXT:    s_addc_u32 s7, s7, s9
+; CHECK-NEXT:    s_lshr_b32 s15, s7, 1
+; CHECK-NEXT:    s_add_u32 s8, s4, 52
+; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, bar@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, bar@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v1, s6
+; CHECK-NEXT:    v_alignbit_b32 v1, s7, v1, 1
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CHECK-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CHECK-NEXT:    v_mov_b32_e32 v0, v1
+; CHECK-NEXT:    v_mov_b32_e32 v1, s15
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    s_endpgm
+entry:
+  %i = add nsw i32 %a, -1
+  %i.2 = sext i32 %i to i64
+  %i.3 = getelementptr inbounds double, double addrspace(1)* %b, i64 %i.2
+  %i.4 = ptrtoint double addrspace(1)* %i.3 to i64
+  %i.5 = udiv i64 %i.4, 2
+  %i.6 = call double @bar(i64 %i.5)
+  %i.9 = inttoptr i64 %i.5 to double addrspace(1)*
+  ret void
+}
+
+define protected amdgpu_kernel void @IllegalGEPConstAsFptr(i32 %a, i8 addrspace(1)* %b, i64 %c) {
+; CHECK-LABEL: IllegalGEPConstAsFptr:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CHECK-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CHECK-NEXT:    s_mov_b32 s38, -1
+; CHECK-NEXT:    s_mov_b32 s39, 0xe00000
+; CHECK-NEXT:    s_add_u32 s36, s36, s11
+; CHECK-NEXT:    s_mov_b32 s14, s10
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; CHECK-NEXT:    s_load_dword s6, s[4:5], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; CHECK-NEXT:    s_addc_u32 s37, s37, 0
+; CHECK-NEXT:    s_mov_b32 s12, s8
+; CHECK-NEXT:    s_mov_b32 s13, s9
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_add_i32 s6, s6, -1
+; CHECK-NEXT:    s_ashr_i32 s7, s6, 31
+; CHECK-NEXT:    s_add_u32 s16, s16, s6
+; CHECK-NEXT:    s_addc_u32 s17, s17, s7
+; CHECK-NEXT:    s_add_u32 s8, s4, 60
+; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CHECK-NEXT:    v_mov_b32_e32 v31, v0
+; CHECK-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CHECK-NEXT:    v_mov_b32_e32 v0, s18
+; CHECK-NEXT:    v_mov_b32_e32 v1, s19
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    s_endpgm
+entry:
+  %i = add nsw i32 %a, -1
+  %i.2 = sext i32 %i to i64
+  %i.3 = getelementptr inbounds i8, i8 addrspace(1)* %b, i64 %i.2
+  %i.4 = addrspacecast i8 addrspace(1)* %i.3 to i8*
+  %fct_ptr = bitcast i8* %i.4 to i64 (i64 )*
+  %res = call i64 %fct_ptr(i64 %c)
+  ret void
+}
+
+define protected amdgpu_kernel void @NoInfiniteGEPTracing(i32 %a, double addrspace(1)* %b, double %c) {
+; CHECK-LABEL: NoInfiniteGEPTracing:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_ashr_i32 s3, s2, 31
+; CHECK-NEXT:    s_lshl_b64 s[0:1], s[2:3], 3
+; CHECK-NEXT:    s_add_u32 s0, s4, s0
+; CHECK-NEXT:    s_addc_u32 s1, s5, s1
+; CHECK-NEXT:    s_add_u32 s0, s0, 8
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:  .LBB5_1: ; %bb0
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 1
+; CHECK-NEXT:    s_cbranch_scc1 .LBB5_1
+; CHECK-NEXT:    ; %bb.2: ; %bb1
+; CHECK-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; CHECK-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; CHECK-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
+; CHECK-NEXT:    s_endpgm
+entry:
+  %i = add nsw i32 %a, 1
+  %i.2 = sext i32 %i to i64
+  %i.3 = getelementptr inbounds double, double addrspace(1)* %b, i64 %i.2
+  %i.4 = ptrtoint double addrspace(1)* %i.3 to i64
+  br label %bb0
+
+bb0:
+  %phi = phi double addrspace(1)* [ %i.3, %entry ], [ %i.9, %bb0 ]
+  %i.7 = ptrtoint double addrspace(1)* %phi to i64
+  %i.8 = sub nsw i64 %i.7, 1
+  %cmp2 = icmp eq i64 %i.8, 0
+  %i.9 = inttoptr i64 %i.7 to double addrspace(1)*
+  br i1 %cmp2, label %bb1, label %bb0
+
+bb1:
+  %i.10 = addrspacecast double addrspace(1)* %i.9 to double*
+  %i.11 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %i.10, double %c) #23
+  ret void
+}
+
+
+attributes #8 = { argmemonly mustprogress nounwind willreturn "target-cpu"="gfx90a" }
+attributes #23 = { nounwind }
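The new check can also be exercised at the IR level. The snippet below is a minimal sketch distilled from the IllegalGEPConst test above; the opt invocation and the separate-const-offset-from-gep pass name are assumptions about how to run the pass standalone and are not part of this patch.

; Assumed standalone invocation (not added by this patch):
;   opt -mtriple=amdgcn-- -mcpu=gfx90a -passes=separate-const-offset-from-gep -S < reduced.ll
; The GEP below is in addrspace(1), but its only memory use is through a flat
; (addrspace 0) pointer, so the constant offset split off by the pass must also
; be a legal flat addressing-mode offset; that is what traceAndCheckGEPUses
; verifies before the GEP is rewritten.
declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* nocapture, double)

define void @reduced(i32 %a, double addrspace(1)* %b, double %c) {
entry:
  %i = add nsw i32 %a, -1
  %idx = sext i32 %i to i64
  %gep = getelementptr inbounds double, double addrspace(1)* %b, i64 %idx
  %flat = addrspacecast double addrspace(1)* %gep to double*
  %old = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %flat, double %c)
  ret void
}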