Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8133,8 +8133,8 @@
   // Only do this if the inner op has one use since this will just increases
   // register pressure for no benefit.
 
   if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
-      !VT.isVector() && VT != MVT::f64 &&
+      !VT.isVector() && VT != MVT::f64 && VT != MVT::i8 &&
       ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
     // max(max(a, b), c) -> max3(a, b, c)
     // min(min(a, b), c) -> min3(a, b, c)
Index: test/CodeGen/AMDGPU/min3_i8.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/min3_i8.ll
@@ -0,0 +1,50 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+; GCN-LABEL: _amdgpu_ps_main:
+define amdgpu_ps void @_amdgpu_ps_main(<2 x i32> %arg, i32 %arg1, i32 inreg %arg2) {
+  %tmp = bitcast <2 x i32> %arg to i64
+  %tmp23 = inttoptr i64 %tmp to [4294967295 x i8] addrspace(4)*
+  %tmp30 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(4)* %tmp23, i64 0, i64 0
+  %tmp31 = bitcast i8 addrspace(4)* %tmp30 to <4 x i32> addrspace(4)*
+  %tmp37 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp31, align 16
+  %tmp50 = tail call i32 asm sideeffect "; %1", "=v,0"(i32 %arg1)
+  %tmp51 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp50, i32 127)
+  %tmp60 = bitcast i32 %tmp51 to <4 x i8>
+  %tmp61 = extractelement <4 x i8> %tmp60, i64 0
+  %tmp62 = icmp slt i8 0, %tmp61
+  %tmp63 = select i1 %tmp62, i8 0, i8 %tmp61
+  %tmp65 = bitcast i32 %arg1 to <4 x i8>
+  %tmp66 = extractelement <4 x i8> %tmp65, i64 0
+  %tmp67 = icmp slt i8 %tmp63, %tmp66
+  %tmp68 = select i1 %tmp67, i8 %tmp63, i8 %tmp66
+  %tmp69 = insertelement <4 x i8> undef, i8 %tmp68, i64 0
+  %tmp70 = bitcast <4 x i8> %tmp69 to i32
+  %tmp71 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 127, i32 %tmp70, i32 276, i32 15, i32 14, i1 false)
+  %tmp72 = bitcast i32 %tmp71 to <4 x i8>
+  %tmp73 = extractelement <4 x i8> %tmp72, i64 0
+  %tmp74 = icmp slt i8 %tmp68, %tmp73
+  %tmp75 = select i1 %tmp74, i8 %tmp68, i8 %tmp73
+  %tmp99 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %arg1)
+  br label %bb106
+
+bb106:
+  %tmp29 = icmp eq i32 %arg1, 0
+  br i1 %tmp29, label %._crit_edge, label %bb113
+
+bb113:
+  %tmp115 = tail call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp37, i32 0, i32 0, i1 false, i1 false)
+  %tmp116 = bitcast float %tmp115 to <4 x i8>
+  %tmp117 = extractelement <4 x i8> %tmp116, i32 0
+  %tmp118 = icmp sgt i8 0, %tmp117
+  br label %._crit_edge
+
+._crit_edge:
+
+; GCN: s_endpgm
+  ret void
+}
+
+declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg)
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1 immarg, i1 immarg)
+declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
+declare i32 @llvm.amdgcn.wwm.i32(i32)
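
Note on the fix: performMinMaxCombine folds a two-deep min/max chain into a single
three-operand node (min3/max3), but the target has no 8-bit min3/max3 instructions,
so forming such a node with MVT::i8 can later fail during selection. Adding
VT != MVT::i8 to the guard simply keeps the combine from firing on that type.
Below is a distilled sketch of the shape the guard now rejects; this is hypothetical
reduced IR, not part of the patch, and the full shader reproducer is kept as the
actual test because a plain i8 chain like this may be promoted to i32 before the
combine ever sees it:

  ; Hypothetical reduced form: min(min(a, b), c) on i8, the two-level
  ; select chain that the min3 combine would otherwise try to fold.
  define i8 @min3_i8_sketch(i8 %a, i8 %b, i8 %c) {
    %cmp1 = icmp slt i8 %a, %b
    %min1 = select i1 %cmp1, i8 %a, i8 %b
    %cmp2 = icmp slt i8 %min1, %c
    %min2 = select i1 %cmp2, i8 %min1, i8 %c
    ret i8 %min2
  }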