Index: lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -130,6 +130,34 @@
 void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
   Value *Ptr = I.getPointerOperand();
+
+  // This is true if loads have amdgpu.uniform on them (as opposed to GEP
+  // which might be dropped by InstCombine if the offset is 0).
+  if (I.getMetadata("amdgpu.uniform")) {
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
+      // If GEP is uniform, we don't need to do anything.
+      if (GEP->getMetadata("amdgpu.uniform"))
+        return;
+
+      // Set Uniform on the GEP if it only has 1 use.
+      if (GEP->hasOneUse()) {
+        setUniformMetadata(GEP);
+        return;
+      }
+    }
+
+    // There is no GEP or the GEP has multiple uses. We'll have to create
+    // a new one.
+    Value *Idx = Constant::getIntegerValue(
+        Type::getInt32Ty(Ptr->getContext()), APInt(64, 0));
+    Instruction *GEP = GetElementPtrInst::Create(
+        Ptr->getType()->getPointerElementType(), Ptr,
+        ArrayRef<Value*>(Idx), Twine(""), &I);
+    I.replaceUsesOfWith(Ptr, GEP);
+    setUniformMetadata(GEP);
+    return;
+  }
+
   if (!DA->isUniform(Ptr))
     return;
   auto isGlobalLoad = [&](LoadInst &Load)->bool {
Index: test/CodeGen/AMDGPU/amdgpu.uniform.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/amdgpu.uniform.ll
@@ -0,0 +1,66 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICI -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIGFX9 %s
+
+; GCN-LABEL: {{^}}uniform_load:
+; GCN: v_mov_b32_e32 v1, 0
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: v_readfirstlane_b32 s1, v1
+; SI-NEXT: s_nop
+; GCN-NEXT: s_load_dword s{{[0-9]}}, s[0:1], 0x0
+define amdgpu_vs i32 @uniform_load(i32 %p) #0 {
+  %p64 = zext i32 %p to i64
+  %ptr = inttoptr i64 %p64 to i32 addrspace(2)*
+  %r = load i32, i32 addrspace(2)* %ptr, !amdgpu.uniform !0
+  ret i32 %r
+}
+
+; GCN-LABEL: {{^}}gep_uniform_load:
+; GCN: v_mov_b32_e32 v1, 0
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: v_readfirstlane_b32 s1, v1
+; SI-NEXT: s_nop
+; SICI-NEXT: s_load_dword s{{[0-9]}}, s[0:1], 0x1
+; VIGFX9-NEXT: s_load_dword s{{[0-9]}}, s[0:1], 0x4
+define amdgpu_vs i32 @gep_uniform_load(i32 %p) #0 {
+  %p64 = zext i32 %p to i64
+  %ptr = inttoptr i64 %p64 to i32 addrspace(2)*
+  %gep = getelementptr i32, i32 addrspace(2)* %ptr, i32 1
+  %r = load i32, i32 addrspace(2)* %gep, !amdgpu.uniform !0
+  ret i32 %r
+}
+
+; GCN-LABEL: {{^}}uniform_gep_load:
+; GCN: v_mov_b32_e32 v1, 0
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: v_readfirstlane_b32 s1, v1
+; SI-NEXT: s_nop
+; SICI-NEXT: s_load_dword s{{[0-9]}}, s[0:1], 0x1
+; VIGFX9-NEXT: s_load_dword s{{[0-9]}}, s[0:1], 0x4
+define amdgpu_vs i32 @uniform_gep_load(i32 %p) #0 {
+  %p64 = zext i32 %p to i64
+  %ptr = inttoptr i64 %p64 to i32 addrspace(2)*
+  %gep = getelementptr i32, i32 addrspace(2)* %ptr, i32 1, !amdgpu.uniform !0
+  %r = load i32, i32 addrspace(2)* %gep
+  ret i32 %r
+}
+
+; GCN-LABEL: {{^}}uniform_gep_uniform_load:
+; GCN: v_mov_b32_e32 v1, 0
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: v_readfirstlane_b32 s1, v1
+; SI-NEXT: s_nop
+; SICI-NEXT: s_load_dword s{{[0-9]}}, s[0:1], 0x1
+; VIGFX9-NEXT: s_load_dword s{{[0-9]}}, s[0:1], 0x4
+define amdgpu_vs i32 @uniform_gep_uniform_load(i32 %p) #0 {
+  %p64 = zext i32 %p to i64
+  %ptr = inttoptr i64 %p64 to i32 addrspace(2)*
+  %gep = getelementptr i32, i32 addrspace(2)* %ptr, i32 1, !amdgpu.uniform !0
+  %r = load i32, i32 addrspace(2)* %gep, !amdgpu.uniform !0
+  ret i32 %r
+}
+
+attributes #0 = { nounwind }
+
+!0 = !{}