Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8133,8 +8133,8 @@
   // Only do this if the inner op has one use since this will just increases
   // register pressure for no benefit.
 
   if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
-      !VT.isVector() && VT != MVT::f64 &&
+      !VT.isVector() && VT != MVT::f64 && VT != MVT::i8 &&
       ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
     // max(max(a, b), c) -> max3(a, b, c)
     // min(min(a, b), c) -> min3(a, b, c)
Index: test/CodeGen/AMDGPU/min3_i8.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/min3_i8.ll
@@ -0,0 +1,50 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+; GCN-LABEL: _amdgpu_ps_main:
+define amdgpu_ps void @_amdgpu_ps_main(<2 x i32> %arg, i32 %arg1, i32 inreg %arg2) {
+  %tmp = bitcast <2 x i32> %arg to i64
+  %tmp23 = inttoptr i64 %tmp to [4294967295 x i8] addrspace(4)*
+  %tmp30 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(4)* %tmp23, i64 0, i64 0
+  %tmp31 = bitcast i8 addrspace(4)* %tmp30 to <4 x i32> addrspace(4)*
+  %tmp37 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp31, align 16
+  %tmp50 = tail call i32 asm sideeffect "; %1", "=v,0"(i32 %arg1)
+  %tmp51 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp50, i32 127)
+  %tmp60 = bitcast i32 %tmp51 to <4 x i8>
+  %tmp61 = extractelement <4 x i8> %tmp60, i64 0
+  %tmp62 = icmp slt i8 0, %tmp61
+  %tmp63 = select i1 %tmp62, i8 0, i8 %tmp61
+  %tmp65 = bitcast i32 %arg1 to <4 x i8>
+  %tmp66 = extractelement <4 x i8> %tmp65, i64 0
+  %tmp67 = icmp slt i8 %tmp63, %tmp66
+  %tmp68 = select i1 %tmp67, i8 %tmp63, i8 %tmp66
+  %tmp69 = insertelement <4 x i8> undef, i8 %tmp68, i64 0
+  %tmp70 = bitcast <4 x i8> %tmp69 to i32
+  %tmp71 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 127, i32 %tmp70, i32 276, i32 15, i32 14, i1 false)
+  %tmp72 = bitcast i32 %tmp71 to <4 x i8>
+  %tmp73 = extractelement <4 x i8> %tmp72, i64 0
+  %tmp74 = icmp slt i8 %tmp68, %tmp73
+  %tmp75 = select i1 %tmp74, i8 %tmp68, i8 %tmp73
+  %tmp99 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %arg1)
+  br label %bb106
+
+bb106:
+  %tmp29 = icmp eq i32 %arg1, 0
+  br i1 %tmp29, label %._crit_edge, label %bb113
+
+bb113:
+  %tmp115 = tail call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp37, i32 0, i32 0, i1 false, i1 false)
+  %tmp116 = bitcast float %tmp115 to <4 x i8>
+  %tmp117 = extractelement <4 x i8> %tmp116, i32 0
+  %tmp118 = icmp sgt i8 0, %tmp117
+  br label %._crit_edge
+
+._crit_edge:
+
+; GCN: s_endpgm
+  ret void
+}
+
+declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg)
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1 immarg, i1 immarg)
+declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
+declare i32 @llvm.amdgcn.wwm.i32(i32)
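
Note on the fix: performMinMaxCombine folds a two-deep min/max chain into a single
three-operand node (min3/max3), but the target has no 8-bit min3/max3 instructions,
so forming such a node with MVT::i8 can later fail during selection. Adding
VT != MVT::i8 to the guard simply keeps the combine from firing on that type.
Below is a distilled sketch of the shape the guard now rejects; this is hypothetical
reduced IR, not part of the patch, and the full shader reproducer is kept as the
actual test because a plain i8 chain like this may be promoted to i32 before the
combine ever sees it:

  ; Hypothetical reduced form: min(min(a, b), c) on i8, the two-level
  ; select chain that the min3 combine would otherwise try to fold.
  define i8 @min3_i8_sketch(i8 %a, i8 %b, i8 %c) {
    %cmp1 = icmp slt i8 %a, %b
    %min1 = select i1 %cmp1, i8 %a, i8 %b
    %cmp2 = icmp slt i8 %min1, %c
    %min2 = select i1 %cmp2, i8 %min1, i8 %c
    ret i8 %min2
  }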