Index: lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -33,6 +33,7 @@
 private:
   const TargetMachine *TM;
   Module *Mod;
+  const DataLayout *DL;
   MDNode *MaxWorkGroupSizeRange;
 
   // FIXME: This should be per-kernel.
@@ -44,6 +45,20 @@
   std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
   Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
 
+  /// BaseAlloca is the alloca root the search started from.
+  /// Val may be that alloca or a recursive user of it.
+  bool collectUsesWithPtrTypes(Value *BaseAlloca,
+                               Value *Val,
+                               std::vector<Value*> &WorkList) const;
+
+  /// Val is a pointer derived from Alloca. OpIdx0/OpIdx1 are the operand
+  /// indices of an instruction with 2 pointer inputs (e.g. select, icmp).
+  /// Returns true if both operands are derived from the same alloca. Val should
+  /// be the same value as one of the input operands of UseInst.
+  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
+                                       Instruction *UseInst,
+                                       int OpIdx0, int OpIdx1) const;
+
 public:
   static char ID;
 
@@ -51,6 +66,7 @@
     FunctionPass(ID),
     TM(TM_),
     Mod(nullptr),
+    DL(nullptr),
     MaxWorkGroupSizeRange(nullptr),
     LocalMemAvailable(0),
     IsAMDGCN(false),
@@ -63,6 +79,11 @@
     return "AMDGPU Promote Alloca";
   }
 
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
   void visitAlloca(AllocaInst &I);
 };
 
@@ -81,6 +102,7 @@
     return false;
 
   Mod = &M;
+  DL = &Mod->getDataLayout();
 
   // The maximum workitem id.
   //
@@ -134,8 +156,7 @@
         if (!Use)
           continue;
         if (Use->getParent()->getParent() == &F)
-          LocalMemAvailable -=
-              Mod->getDataLayout().getTypeAllocSize(GV->getValueType());
+          LocalMemAvailable -= DL->getTypeAllocSize(GV->getValueType());
       }
     }
   }
@@ -404,7 +425,39 @@
   return true;
 }
 
-static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
+bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
+                                                          Value *Val,
+                                                          Instruction *Inst,
+                                                          int OpIdx0,
+                                                          int OpIdx1) const {
+  // Figure out which operand is the one we might not be promoting.
+  Value *OtherOp = Inst->getOperand(OpIdx0);
+  if (Val == OtherOp)
+    OtherOp = Inst->getOperand(OpIdx1);
+
+  Value *OtherObj = GetUnderlyingObject(OtherOp, *DL);
+  if (!isa<AllocaInst>(OtherObj))
+    return false;
+
+  // TODO: We should be able to replace undefs with the right pointer type.
+
+  // TODO: If we know the other base object is another promotable
+  // alloca, not necessarily this alloca, we can do this. The
+  // important part is both must have the same address space at
+  // the end.
+  if (OtherObj != BaseAlloca) {
+    DEBUG(dbgs() << "Found a binary instruction with another alloca object\n");
+    return false;
+  }
+
+  return true;
+}
+
+bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
+  Value *BaseAlloca,
+  Value *Val,
+  std::vector<Value*> &WorkList) const {
+
   bool Success = true;
   for (User *User : Val->users()) {
     if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
@@ -430,12 +483,42 @@
         return false;
     }
 
+    // Only promote an icmp if we know that the other compare operand
+    // is derived from the same alloca and will also be promoted.
+    if (ICmpInst *ICmp = dyn_cast_or_null<ICmpInst>(UseInst)) {
+      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
+        return false;
+    }
+
     if (!User->getType()->isPointerTy())
       continue;
 
+    // Only promote a select if we know that the other select operand
+    // is derived from the same alloca and will also be promoted.
+    if (SelectInst *SI = dyn_cast_or_null<SelectInst>(UseInst)) {
+      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
+        return false;
+    }
+
+    // Repeat for phis.
+    if (PHINode *Phi = dyn_cast_or_null<PHINode>(UseInst)) {
+      // TODO: Handle more complex cases. We should be able to replace loops
+      // over arrays.
+      switch (Phi->getNumIncomingValues()) {
+      case 1:
+        break;
+      case 2:
+        if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
+          return false;
+        break;
+      default:
+        return false;
+      }
+    }
+
     WorkList.push_back(User);
 
-    Success &= collectUsesWithPtrTypes(User, WorkList);
+    Success &= collectUsesWithPtrTypes(BaseAlloca, User, WorkList);
   }
   return Success;
 }
@@ -470,7 +553,7 @@
 
   std::vector<Value*> WorkList;
 
-  if (!collectUsesWithPtrTypes(&I, WorkList)) {
+  if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
     DEBUG(dbgs() << " Do not know how to convert all uses\n");
     return;
   }
Index: test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
@@ -0,0 +1,38 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s
+
+; Normally instcombine would fold this into a comparison of the GEP
+; indices.
+
+; CHECK-LABEL: @lds_promoted_alloca_icmp_same_derived_pointer(
+; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr [256 x i32], [256 x i32] addrspace(3)* @alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: %ptr0 = getelementptr inbounds i32, i32 addrspace(3)* [[ARRAYGEP]], i32 %a
+; CHECK: %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* [[ARRAYGEP]], i32 %b
+; CHECK: %cmp = icmp eq i32 addrspace(3)* %ptr0, %ptr1
+define void @lds_promoted_alloca_icmp_same_derived_pointer(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+  %alloca = alloca i32, i32 16, align 4
+  %ptr0 = getelementptr inbounds i32, i32* %alloca, i32 %a
+  %ptr1 = getelementptr inbounds i32, i32* %alloca, i32 %b
+  %cmp = icmp eq i32* %ptr0, %ptr1
+  %zext = zext i1 %cmp to i32
+  store volatile i32 %zext, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @lds_promoted_alloca_icmp_unknown_ptr(
+; CHECK: %alloca = alloca i32, i32 16, align 4
+; CHECK: %ptr0 = getelementptr inbounds i32, i32* %alloca, i32 %a
+; CHECK: %ptr1 = call i32* @get_unknown_pointer()
+; CHECK: %cmp = icmp eq i32* %ptr0, %ptr1
+define void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+  %alloca = alloca i32, i32 16, align 4
+  %ptr0 = getelementptr inbounds i32, i32* %alloca, i32 %a
+  %ptr1 = call i32* @get_unknown_pointer()
+  %cmp = icmp eq i32* %ptr0, %ptr1
+  %zext = zext i1 %cmp to i32
+  store volatile i32 %zext, i32 addrspace(1)* %out
+  ret void
+}
+
+declare i32* @get_unknown_pointer() #0
+
+attributes #0 = { nounwind }
Index: test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
@@ -0,0 +1,170 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s
+
+
+; CHECK-LABEL: @branch_ptr_var_same_alloca(
+; CHECK: getelementptr [256 x [64 x i32]], [256 x [64 x i32]] addrspace(3)* @alloca, i32 0, i32 %{{[0-9]+}}
+
+; CHECK: if:
+; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %a
+
+; CHECK: else:
+; CHECK: %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %b
+
+; CHECK: endif:
+; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
+; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4
+define void @branch_ptr_var_same_alloca(i32 %a, i32 %b) #0 {
+entry:
+  %alloca = alloca [64 x i32], align 4
+  br i1 undef, label %if, label %else
+
+if:
+  %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
+  br label %endif
+
+else:
+  %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %b
+  br label %endif
+
+endif:
+  %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
+  store i32 0, i32* %phi.ptr, align 4
+  ret void
+}
+
+; CHECK-LABEL: @one_phi_value(
+; CHECK: getelementptr [256 x [64 x i32]], [256 x [64 x i32]] addrspace(3)* @alloca.1, i32 0, i32 %{{[0-9]+}}
+; CHECK:  %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %a
+
+; CHECK: br label %exit
+; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %entry ]
+; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4
+define void @one_phi_value(i32 %a) #0 {
+entry:
+  %alloca = alloca [64 x i32], align 4
+  %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
+  br label %exit
+
+exit:
+  %phi.ptr = phi i32* [ %arrayidx0, %entry ]
+  store i32 0, i32* %phi.ptr, align 4
+  ret void
+}
+
+; CHECK-LABEL: @branch_ptr_alloca_unknown_obj(
+; CHECK: %alloca = alloca [64 x i32], align 4
+
+; CHECK: if:
+; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
+
+; CHECK: else:
+; CHECK: %arrayidx1 = call i32* @get_unknown_pointer()
+
+; CHECK: endif:
+; CHECK: %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
+; CHECK: store i32 0, i32* %phi.ptr, align 4
+define void @branch_ptr_alloca_unknown_obj(i32 %a, i32 %b) #0 {
+entry:
+  %alloca = alloca [64 x i32], align 4
+  br i1 undef, label %if, label %else
+
+if:
+  %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
+  br label %endif
+
+else:
+  %arrayidx1 = call i32* @get_unknown_pointer()
+  br label %endif
+
+endif:
+  %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
+  store i32 0, i32* %phi.ptr, align 4
+  ret void
+}
+
+; kernel void ptr_induction_var_same_alloca(void)
+; {
+;     int alloca[64];
+;     int i = 0;
+
+;     #pragma nounroll
+;     for (int* p = &alloca[2], *e = &alloca[48]; p != e; ++p, ++i)
+;     {
+;         *p = i;
+;     }
+; }
+
+; FIXME: This should be promotable. We need to use
+; GetUnderlyingObjects when looking at the icmp user.
+
+; CHECK-LABEL: @ptr_induction_var_same_alloca(
+; CHECK: %alloca = alloca [64 x i32], align 4
+; CHECK: phi i32* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
+define void @ptr_induction_var_same_alloca() #0 {
+entry:
+  %alloca = alloca [64 x i32], align 4
+  %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2
+  %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 48
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.09 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %p.08 = phi i32* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
+  store i32 %i.09, i32* %p.08, align 4
+  %incdec.ptr = getelementptr inbounds i32, i32* %p.08, i32 1
+  %inc = add nuw nsw i32 %i.09, 1
+  %cmp = icmp eq i32* %incdec.ptr, %arrayidx1
+  br i1 %cmp, label %for.cond.cleanup, label %for.body
+}
+
+
+; extern int* get_unknown_pointer(void);
+
+; kernel void ptr_induction_var_alloca_unknown(void)
+; {
+;     int alloca[64];
+;     int i = 0;
+;
+;     for (int* p = &alloca[2], *e = get_unknown_pointer(); p != e; ++p, ++i)
+;     {
+;         *p = i;
+;     }
+; }
+
+; CHECK-LABEL: @ptr_induction_var_alloca_unknown(
+; CHECK: %alloca = alloca [64 x i32], align 4
+; CHECK: %p.08 = phi i32* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
+; CHECK: %cmp = icmp eq i32* %incdec.ptr, %call
+define void @ptr_induction_var_alloca_unknown() #0 {
+entry:
+  %alloca = alloca [64 x i32], align 4
+  %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2
+  %call = tail call i32* @get_unknown_pointer() #2
+  %cmp.7 = icmp eq i32* %arrayidx, %call
+  br i1 %cmp.7, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %i.09 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %p.08 = phi i32* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
+  store i32 %i.09, i32* %p.08, align 4
+  %incdec.ptr = getelementptr inbounds i32, i32* %p.08, i32 1
+  %inc = add nuw nsw i32 %i.09, 1
+  %cmp = icmp eq i32* %incdec.ptr, %call
+  br i1 %cmp, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+declare i32* @get_unknown_pointer() #0
+
+attributes #0 = { nounwind }
Index: test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
@@ -0,0 +1,102 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s
+
+; CHECK-LABEL: @lds_promoted_alloca_select_invalid_pointer_operand(
+; CHECK: %alloca = alloca i32
+; CHECK: select i1 undef, i32* undef, i32* %alloca
+define void @lds_promoted_alloca_select_invalid_pointer_operand() #0 {
+  %alloca = alloca i32, align 4
+  %select = select i1 undef, i32* undef, i32* %alloca
+  store i32 0, i32* %select, align 4
+  ret void
+}
+
+; CHECK-LABEL: @lds_promote_alloca_select_two_derived_pointers(
+; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr [256 x i32], [256 x i32] addrspace(3)* @alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: %ptr0 = getelementptr inbounds i32, i32 addrspace(3)* [[ARRAYGEP]], i32 %a
+; CHECK: %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* [[ARRAYGEP]], i32 %b
+; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
+; CHECK: store i32 0, i32 addrspace(3)* %select, align 4
+define void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 {
+  %alloca = alloca i32, i32 16, align 4
+  %ptr0 = getelementptr inbounds i32, i32* %alloca, i32 %a
+  %ptr1 = getelementptr inbounds i32, i32* %alloca, i32 %b
+  %select = select i1 undef, i32* %ptr0, i32* %ptr1
+  store i32 0, i32* %select, align 4
+  ret void
+}
+
+; FIXME: This should be promotable but requires knowing that both will be promoted first.
+
+; CHECK-LABEL: @lds_promote_alloca_select_two_allocas(
+; CHECK: %alloca0 = alloca i32, i32 16, align 4
+; CHECK: %alloca1 = alloca i32, i32 16, align 4
+; CHECK: %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a
+; CHECK: %ptr1 = getelementptr inbounds i32, i32* %alloca1, i32 %b
+; CHECK: %select = select i1 undef, i32* %ptr0, i32* %ptr1
+define void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 {
+  %alloca0 = alloca i32, i32 16, align 4
+  %alloca1 = alloca i32, i32 16, align 4
+  %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a
+  %ptr1 = getelementptr inbounds i32, i32* %alloca1, i32 %b
+  %select = select i1 undef, i32* %ptr0, i32* %ptr1
+  store i32 0, i32* %select, align 4
+  ret void
+}
+
+; TODO: Maybe this should be canonicalized to select on the constant and GEP after.
+; CHECK-LABEL: @lds_promote_alloca_select_two_derived_constant_pointers(
+; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr [256 x i32], [256 x i32] addrspace(3)* @alloca.1, i32 0, i32 %{{[0-9]+}}
+; CHECK: %ptr0 = getelementptr inbounds i32, i32 addrspace(3)* [[ARRAYGEP]], i32 1
+; CHECK: %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* [[ARRAYGEP]], i32 3
+; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
+; CHECK: store i32 0, i32 addrspace(3)* %select, align 4
+define void @lds_promote_alloca_select_two_derived_constant_pointers() #0 {
+  %alloca = alloca i32, i32 16, align 4
+  %ptr0 = getelementptr inbounds i32, i32* %alloca, i32 1
+  %ptr1 = getelementptr inbounds i32, i32* %alloca, i32 3
+  %select = select i1 undef, i32* %ptr0, i32* %ptr1
+  store i32 0, i32* %select, align 4
+  ret void
+}
+
+; CHECK-LABEL: @lds_promoted_alloca_select_input_select(
+; CHECK: getelementptr [256 x i32], [256 x i32] addrspace(3)* @alloca.2, i32 0, i32 %{{[0-9]+}}
+; CHECK: %ptr0 = getelementptr inbounds i32, i32 addrspace(3)* %{{[0-9]+}}, i32 %a
+; CHECK: %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %{{[0-9]+}}, i32 %b
+; CHECK: %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %{{[0-9]+}}, i32 %c
+; CHECK: %select0 = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
+; CHECK: %select1 = select i1 undef, i32 addrspace(3)* %select0, i32 addrspace(3)* %ptr2
+; CHECK: store i32 0, i32 addrspace(3)* %select1, align 4
+define void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c) #0 {
+  %alloca = alloca i32, i32 16, align 4
+  %ptr0 = getelementptr inbounds i32, i32* %alloca, i32 %a
+  %ptr1 = getelementptr inbounds i32, i32* %alloca, i32 %b
+  %ptr2 = getelementptr inbounds i32, i32* %alloca, i32 %c
+  %select0 = select i1 undef, i32* %ptr0, i32* %ptr1
+  %select1 = select i1 undef, i32* %select0, i32* %ptr2
+  store i32 0, i32* %select1, align 4
+  ret void
+}
+
+define void @lds_promoted_alloca_select_input_phi(i32 %a, i32 %b, i32 %c) #0 {
+entry:
+  %alloca = alloca i32, i32 16, align 4
+  %ptr0 = getelementptr inbounds i32, i32* %alloca, i32 %a
+  %ptr1 = getelementptr inbounds i32, i32* %alloca, i32 %b
+  store i32 0, i32* %ptr0
+  br i1 undef, label %bb1, label %bb2
+
+bb1:
+  %ptr2 = getelementptr inbounds i32, i32* %alloca, i32 %c
+  %select0 = select i1 undef, i32* undef, i32* %ptr2
+  store i32 0, i32* %ptr1
+  br label %bb2
+
+bb2:
+  %phi.ptr = phi i32* [ %ptr0, %entry ], [ %select0, %bb1 ]
+  %select1 = select i1 undef, i32* %phi.ptr, i32* %ptr1
+  store i32 0, i32* %select1, align 4
+  ret void
+}
+
+attributes #0 = { norecurse nounwind }