Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2010,6 +2010,11 @@
     return performOrCombine(N, DCI);
   case AMDGPUISD::FP_CLASS:
     return performClassCombine(N, DCI);
+  case AMDGPUISD::TBUFFER_STORE_FORMAT:
+    // Delete stores whose data operand (operand 2) is undef.
+    if (N->getOperand(2).getOpcode() == ISD::UNDEF)
+      return N->getOperand(0);
+    break;
   }
   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
 }
Index: test/CodeGen/AMDGPU/tbuffer-dead-store.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/tbuffer-dead-store.ll
@@ -0,0 +1,34 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN: {{^}}main:
+; GCN: tbuffer_store_format_x v{{[0-9]+}}, 0x0
+; GCN-NOT: tbuffer_store_format
+; GCN: s_endpgm
+
+define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, [16 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, i32 inreg, i32, i32, i32, i32) #0 {
+main_body:
+  %12 = getelementptr [9 x <16 x i8>], [9 x <16 x i8>] addrspace(2)* %0, i64 0, i64 0
+  %13 = load <16 x i8>, <16 x i8> addrspace(2)* %12, align 16, !tbaa !0 ; first argument of all four stores below
+  %14 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 0
+  %15 = load <16 x i8>, <16 x i8> addrspace(2)* %14, align 16, !tbaa !0
+  %16 = add i32 %5, %8
+  %17 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %15, i32 0, i32 %16)
+  %18 = extractelement <4 x float> %17, i32 0
+  %19 = bitcast float %18 to i32 ; the only defined value that gets stored
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %13, i32 %19, i32 1, i32 undef, i32 %7, i32 0, i32 4, i32 4, i32 0, i32 0, i32 1, i32 1, i32 0) ; second argument is defined (%19): the one store the checks expect to survive
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %13, i32 undef, i32 1, i32 undef, i32 %7, i32 4, i32 4, i32 4, i32 0, i32 0, i32 1, i32 1, i32 0) ; second argument is undef: expected to be deleted
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %13, i32 undef, i32 1, i32 undef, i32 %7, i32 8, i32 4, i32 4, i32 0, i32 0, i32 1, i32 1, i32 0) ; second argument is undef: expected to be deleted
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %13, i32 undef, i32 1, i32 undef, i32 %7, i32 12, i32 4, i32 4, i32 0, i32 0, i32 1, i32 1, i32 0) ; second argument is undef: expected to be deleted
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1
+
+declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+
+attributes #0 = { "ShaderType"="1" "enable-no-nans-fp-math"="true" }
+attributes #1 = { nounwind readnone }
+
+!0 = !{!"const", null, i32 1}