Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -111,6 +111,10 @@
   case ISD::FMAXNUM:
   case ISD::FMINNAN:
   case ISD::FMAXNAN:
+  case ISD::SMIN:
+  case ISD::SMAX:
+  case ISD::UMIN:
+  case ISD::UMAX:
 
   case ISD::FPOW:
   case ISD::FREM:
@@ -1986,6 +1990,10 @@
   case ISD::FMAXNUM:
   case ISD::FMINNAN:
   case ISD::FMAXNAN:
+  case ISD::SMIN:
+  case ISD::SMAX:
+  case ISD::UMIN:
+  case ISD::UMAX:
     Res = WidenVecRes_Binary(N);
     break;
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -2470,9 +2470,18 @@
     EVT VT = ValueVTs[0];
     LLVMContext &Ctx = *DAG.getContext();
     auto &TLI = DAG.getTargetLoweringInfo();
-    while (TLI.getTypeAction(Ctx, VT) == TargetLoweringBase::TypeSplitVector)
+
+    // We care about the legality of the operation after it has been type
+    // legalized.
+    while (TLI.getTypeAction(Ctx, VT) != TargetLoweringBase::TypeLegal)
       VT = TLI.getTypeToTransformTo(Ctx, VT);
+
+    // If the vselect is legal, assume we want to leave this as a vector setcc +
+    // vselect. Otherwise, if this is going to be scalarized, we want to see if
+    // min/max is legal on the scalar type.
+    bool UseScalarMinMax = VT.isVector() &&
+      !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT);
 
     Value *LHS, *RHS;
     auto SPR = matchSelectPattern(const_cast<User*>(&I), LHS, RHS);
     ISD::NodeType Opc = ISD::DELETED_NODE;
@@ -2486,11 +2495,17 @@
       case SPNB_NA: llvm_unreachable("No NaN behavior for FP op?");
       case SPNB_RETURNS_NAN:   Opc = ISD::FMINNAN; break;
       case SPNB_RETURNS_OTHER: Opc = ISD::FMINNUM; break;
-      case SPNB_RETURNS_ANY:
-        Opc = TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT) ? ISD::FMINNUM
-          : ISD::FMINNAN;
+      case SPNB_RETURNS_ANY: {
+        if (TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT))
+          Opc = ISD::FMINNUM;
+        else if (TLI.isOperationLegalOrCustom(ISD::FMINNAN, VT))
+          Opc = ISD::FMINNAN;
+        else if (UseScalarMinMax)
+          Opc = TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT.getScalarType()) ?
+            ISD::FMINNUM : ISD::FMINNAN;
         break;
       }
+      }
       break;
     case SPF_FMAXNUM:
       switch (SPR.NaNBehavior) {
@@ -2498,18 +2513,27 @@
       case SPNB_RETURNS_NAN:   Opc = ISD::FMAXNAN; break;
       case SPNB_RETURNS_OTHER: Opc = ISD::FMAXNUM; break;
       case SPNB_RETURNS_ANY:
-        Opc = TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT) ? ISD::FMAXNUM
-          : ISD::FMAXNAN;
+
+        if (TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT))
+          Opc = ISD::FMAXNUM;
+        else if (TLI.isOperationLegalOrCustom(ISD::FMAXNAN, VT))
+          Opc = ISD::FMAXNAN;
+        else if (UseScalarMinMax)
+          Opc = TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT.getScalarType()) ?
+            ISD::FMAXNUM : ISD::FMAXNAN;
         break;
       }
       break;
     default: break;
     }
-    if (Opc != ISD::DELETED_NODE && TLI.isOperationLegalOrCustom(Opc, VT) &&
-        // If the underlying comparison instruction is used by any other instruction,
-        // the consumed instructions won't be destroyed, so it is not profitable
-        // to convert to a min/max.
+    if (Opc != ISD::DELETED_NODE &&
+        (TLI.isOperationLegalOrCustom(Opc, VT) ||
+         (UseScalarMinMax &&
+          TLI.isOperationLegalOrCustom(Opc, VT.getScalarType()))) &&
+        // If the underlying comparison instruction is used by any other
+        // instruction, the consumed instructions won't be destroyed, so it is
+        // not profitable to convert to a min/max.
         cast<SelectInst>(&I)->getCondition()->hasOneUse()) {
       OpCode = Opc;
       LHSVal = getValue(LHS);
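
A sketch of the IR this change is aimed at, for review context (illustrative
only, not part of the patch; the function name is hypothetical). On a target
such as SI, where i32 SMAX is legal but v2i32 VSELECT is not, the icmp+select
pair below is now matched to ISD::SMAX and scalarized, instead of being
expanded into a per-element compare plus cndmask:

define <2 x i32> @example_smax_v2i32(<2 x i32> %a, <2 x i32> %b) {
  %cmp = icmp sgt <2 x i32> %a, %b
  %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
  ret <2 x i32> %val
}
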
Index: test/CodeGen/AMDGPU/fmax_legacy.ll
===================================================================
--- test/CodeGen/AMDGPU/fmax_legacy.ll
+++ test/CodeGen/AMDGPU/fmax_legacy.ll
@@ -87,6 +87,46 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v1f32:
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; EG: MAX
+define void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr <1 x float>, <1 x float> addrspace(1)* %gep.0, i32 1
+
+  %a = load <1 x float>, <1 x float> addrspace(1)* %gep.0
+  %b = load <1 x float>, <1 x float> addrspace(1)* %gep.1
+
+  %cmp = fcmp ogt <1 x float> %a, %b
+  %val = select <1 x i1> %cmp, <1 x float> %a, <1 x float> %b
+  store <1 x float> %val, <1 x float> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v3f32:
+; SI-SAFE: v_max_legacy_f32_e32
+; SI-SAFE: v_max_legacy_f32_e32
+; SI-SAFE: v_max_legacy_f32_e32
+; SI-NONAN: v_max_f32_e32
+; SI-NONAN: v_max_f32_e32
+; SI-NONAN: v_max_f32_e32
+define void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr <3 x float>, <3 x float> addrspace(1)* %gep.0, i32 1
+
+  %a = load <3 x float>, <3 x float> addrspace(1)* %gep.0
+  %b = load <3 x float>, <3 x float> addrspace(1)* %gep.1
+
+  %cmp = fcmp ogt <3 x float> %a, %b
+  %val = select <3 x i1> %cmp, <3 x float> %a, <3 x float> %b
+  store <3 x float> %val, <3 x float> addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: @test_fmax_legacy_ogt_f32_multi_use
 ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
Index: test/CodeGen/AMDGPU/fmin_legacy.ll
===================================================================
--- test/CodeGen/AMDGPU/fmin_legacy.ll
+++ test/CodeGen/AMDGPU/fmin_legacy.ll
@@ -96,6 +96,69 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v1f32:
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+define void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr <1 x float>, <1 x float> addrspace(1)* %gep.0, i32 1
+
+  %a = load <1 x float>, <1 x float> addrspace(1)* %gep.0
+  %b = load <1 x float>, <1 x float> addrspace(1)* %gep.1
+
+  %cmp = fcmp ult <1 x float> %a, %b
+  %val = select <1 x i1> %cmp, <1 x float> %a, <1 x float> %b
+  store <1 x float> %val, <1 x float> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v2f32:
+; SI: buffer_load_dwordx2
+; SI: buffer_load_dwordx2
+
+; SI-SAFE: v_min_legacy_f32_e32
+; SI-SAFE: v_min_legacy_f32_e32
+
+; SI-NONAN: v_min_f32_e32
+; SI-NONAN: v_min_f32_e32
+define void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %gep.0, i32 1
+
+  %a = load <2 x float>, <2 x float> addrspace(1)* %gep.0
+  %b = load <2 x float>, <2 x float> addrspace(1)* %gep.1
+
+  %cmp = fcmp ult <2 x float> %a, %b
+  %val = select <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
+  store <2 x float> %val, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v3f32:
+; SI-SAFE: v_min_legacy_f32_e32
+; SI-SAFE: v_min_legacy_f32_e32
+; SI-SAFE: v_min_legacy_f32_e32
+
+; SI-NONAN: v_min_f32_e32
+; SI-NONAN: v_min_f32_e32
+; SI-NONAN: v_min_f32_e32
+define void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr <3 x float>, <3 x float> addrspace(1)* %gep.0, i32 1
+
+  %a = load <3 x float>, <3 x float> addrspace(1)* %gep.0
+  %b = load <3 x float>, <3 x float> addrspace(1)* %gep.1
+
+  %cmp = fcmp ult <3 x float> %a, %b
+  %val = select <3 x i1> %cmp, <3 x float> %a, <3 x float> %b
+  store <3 x float> %val, <3 x float> addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: @test_fmin_legacy_ole_f32_multi_use
 ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
Index: test/CodeGen/AMDGPU/max.ll
===================================================================
--- test/CodeGen/AMDGPU/max.ll
+++ test/CodeGen/AMDGPU/max.ll
@@ -2,7 +2,7 @@
 
 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
-; FUNC-LABEL: @v_test_imax_sge_i32
+; FUNC-LABEL: {{^}}v_test_imax_sge_i32:
 ; SI: v_max_i32_e32
 define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
@@ -17,6 +17,24 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}v_test_imax_sge_v4i32:
+; SI: v_max_i32_e32
+; SI: v_max_i32_e32
+; SI: v_max_i32_e32
+; SI: v_max_i32_e32
+define void @v_test_imax_sge_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %aptr, <4 x i32> addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %out, i32 %tid
+  %a = load <4 x i32>, <4 x i32> addrspace(1)* %gep0, align 4
+  %b = load <4 x i32>, <4 x i32> addrspace(1)* %gep1, align 4
+  %cmp = icmp sge <4 x i32> %a, %b
+  %val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
+  store <4 x i32> %val, <4 x i32> addrspace(1)* %outgep, align 4
+  ret void
+}
+
 ; FUNC-LABEL: @s_test_imax_sge_i32
 ; SI: s_max_i32
 define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
@@ -44,6 +62,15 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}s_test_imax_sgt_imm_v2i32:
+; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9
+; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9
+define void @s_test_imax_sgt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
+  %cmp = icmp sgt <2 x i32> %a, <i32 9, i32 9>
+  %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> <i32 9, i32 9>
+  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
+  ret void
+}
+
 ; FUNC-LABEL: @v_test_imax_sgt_i32
 ; SI: v_max_i32_e32
 define void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
@@ -92,6 +119,19 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}s_test_umax_uge_v3i32:
+; SI: s_max_u32
+; SI: s_max_u32
+; SI: s_max_u32
+; SI-NOT: s_max_u32
+; SI: s_endpgm
+define void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, <3 x i32> %b) nounwind {
+  %cmp = icmp uge <3 x i32> %a, %b
+  %val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b
+  store <3 x i32> %val, <3 x i32> addrspace(1)* %out, align 4
+  ret void
+}
+
 ; FUNC-LABEL: @v_test_umax_ugt_i32
 ; SI: v_max_u32_e32
 define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
@@ -107,7 +147,7 @@
   ret void
 }
 
-; FUNC-LABEL: @s_test_umax_ugt_i32
+; FUNC-LABEL: {{^}}s_test_umax_ugt_i32:
 ; SI: s_max_u32
 define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp ugt i32 %a, %b
@@ -116,13 +156,23 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}s_test_umax_ugt_imm_v2i32:
+; SI: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 15
+; SI: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 23
+define void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
+  %cmp = icmp ugt <2 x i32> %a, <i32 15, i32 23>
+  %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> <i32 15, i32 23>
+  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
+  ret void
+}
+
 ; Make sure redundant and removed
 ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umax_ugt_i16:
 ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
-; SI: s_max_u32 [[MIN:s[0-9]+]], [[A]], [[B]]
-; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
-; SI-NEXT: buffer_store_dword [[VMIN]]
+; SI: s_max_u32 [[MAX:s[0-9]+]], [[A]], [[B]]
+; SI-NEXT: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
+; SI-NEXT: buffer_store_dword [[VMAX]]
 define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
   %a.ext = zext i16 %a to i32
   %b.ext = zext i16 %b to i32
@@ -135,13 +185,13 @@
 
 ; Make sure redundant sign_extend_inreg removed.
 
-; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16:
+; FUNC-LABEL: {{^}}simplify_demanded_bits_test_max_slt_i16:
 ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
-; SI: s_max_i32 [[MIN:s[0-9]+]], [[A]], [[B]]
-; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
-; SI-NEXT: buffer_store_dword [[VMIN]]
-define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
+; SI: s_max_i32 [[MAX:s[0-9]+]], [[A]], [[B]]
+; SI-NEXT: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
+; SI-NEXT: buffer_store_dword [[VMAX]]
+define void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
   %a.ext = sext i16 %a to i32
   %b.ext = sext i16 %b to i32
   %cmp = icmp sgt i32 %a.ext, %b.ext
@@ -152,15 +202,9 @@
   ret void
 }
 
-; FIXME: Should get match min/max through extends inserted by
-; legalization.
-
-; FUNC-LABEL: {{^}}s_test_imin_sge_i16:
-; SI: s_sext_i32_i16
-; SI: s_sext_i32_i16
-; SI: v_cmp_ge_i32_e32
-; SI: v_cndmask_b32
-define void @s_test_imin_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
+; FUNC-LABEL: {{^}}s_test_imax_sge_i16:
+; SI: s_max_i32
+define void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
   %cmp = icmp sge i16 %a, %b
   %val = select i1 %cmp, i16 %a, i16 %b
   store i16 %val, i16 addrspace(1)* %out
Index: test/CodeGen/AMDGPU/min.ll
===================================================================
--- test/CodeGen/AMDGPU/min.ll
+++ test/CodeGen/AMDGPU/min.ll
@@ -2,7 +2,7 @@
 
 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
-; FUNC-LABEL: @v_test_imin_sle_i32
+; FUNC-LABEL: {{^}}v_test_imin_sle_i32:
 ; SI: v_min_i32_e32
 define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
@@ -17,7 +17,7 @@
   ret void
 }
 
-; FUNC-LABEL: @s_test_imin_sle_i32
+; FUNC-LABEL: {{^}}s_test_imin_sle_i32:
 ; SI: s_min_i32
 define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp sle i32 %a, %b
@@ -26,6 +26,63 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}s_test_imin_sle_v1i32:
+; SI: s_min_i32
+define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
+  %cmp = icmp sle <1 x i32> %a, %b
+  %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
+  store <1 x i32> %val, <1 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_imin_sle_v4i32:
+; SI: s_min_i32
+; SI: s_min_i32
+; SI: s_min_i32
+; SI: s_min_i32
+define void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
+  %cmp = icmp sle <4 x i32> %a, %b
+  %val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
+  store <4 x i32> %val, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_imin_sle_i8:
+; SI: s_min_i32
+define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) nounwind {
+  %cmp = icmp sle i8 %a, %b
+  %val = select i1 %cmp, i8 %a, i8 %b
+  store i8 %val, i8 addrspace(1)* %out
+  ret void
+}
+
+; XXX - should be able to use s_min if we stop unnecessarily doing
+; extloads with mubuf instructions.
+
+; FUNC-LABEL: {{^}}s_test_imin_sle_v4i8:
+; SI: v_min_i32
+; SI: v_min_i32
+; SI: v_min_i32
+; SI: v_min_i32
+define void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) nounwind {
+  %cmp = icmp sle <4 x i8> %a, %b
+  %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_imin_sle_v4i16:
+; SI: v_min_i32
+; SI: v_min_i32
+; SI: v_min_i32
+; SI: v_min_i32
+define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind {
+  %cmp = icmp sle <4 x i16> %a, %b
+  %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
+  store <4 x i16> %val, <4 x i16> addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: @v_test_imin_slt_i32
 ; SI: v_min_i32_e32
 define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
@@ -50,6 +107,16 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}s_test_imin_slt_v2i32:
+; SI: s_min_i32
+; SI: s_min_i32
+define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+  %cmp = icmp slt <2 x i32> %a, %b
+  %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
+  store <2 x i32> %val, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}s_test_imin_slt_imm_i32:
 ; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
 define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
@@ -83,6 +150,24 @@
   ret void
 }
 
+; FUNC-LABEL: @v_test_umin_ule_v3i32
+; SI: v_min_u32_e32
+; SI: v_min_u32_e32
+; SI: v_min_u32_e32
+; SI-NOT: v_min_u32_e32
+; SI: s_endpgm
+define void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %aptr, <3 x i32> addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %tid
+  %a = load <3 x i32>, <3 x i32> addrspace(1)* %gep0
+  %b = load <3 x i32>, <3 x i32> addrspace(1)* %gep1
+  %cmp = icmp ule <3 x i32> %a, %b
+  %val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b
+  store <3 x i32> %val, <3 x i32> addrspace(1)* %outgep
+  ret void
+}
+
 ; FUNC-LABEL: @s_test_umin_ule_i32
 ; SI: s_min_u32
 define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
@@ -137,6 +222,48 @@
   ret void
 }
 
+
+; FUNC-LABEL: @s_test_umin_ult_v1i32
+; SI: s_min_u32
+define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
+  %cmp = icmp ult <1 x i32> %a, %b
+  %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
+  store <1 x i32> %val, <1 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_umin_ult_v8i32:
+; SI: s_min_u32
+; SI: s_min_u32
+; SI: s_min_u32
+; SI: s_min_u32
+; SI: s_min_u32
+; SI: s_min_u32
+; SI: s_min_u32
+; SI: s_min_u32
+define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind {
+  %cmp = icmp ult <8 x i32> %a, %b
+  %val = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b
+  store <8 x i32> %val, <8 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_umin_ult_v8i16:
+; SI: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind {
+  %cmp = icmp ult <8 x i16> %a, %b
+  %val = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
+  store <8 x i16> %val, <8 x i16> addrspace(1)* %out
+  ret void
+}
+
 ; Make sure redundant and removed
 ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16:
 ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
@@ -173,14 +300,8 @@
   ret void
 }
 
-; FIXME: Should get match min/max through extends inserted by
-; legalization.
-
 ; FUNC-LABEL: {{^}}s_test_imin_sle_i16:
-; SI: s_sext_i32_i16
-; SI: s_sext_i32_i16
-; SI: v_cmp_le_i32_e32
-; SI: v_cndmask_b32
+; SI: s_min_i32
 define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
   %cmp = icmp sle i16 %a, %b
   %val = select i1 %cmp, i16 %a, i16 %b
Index: test/CodeGen/AMDGPU/sminmax.ll
===================================================================
--- test/CodeGen/AMDGPU/sminmax.ll
+++ test/CodeGen/AMDGPU/sminmax.ll
@@ -28,17 +28,10 @@
 }
 
 ; FUNC-LABEL: {{^}}s_abs_v2i32:
-; TODO: this should use s_abs_i32
-; GCNX: s_abs_i32
-; GCNX: s_abs_i32
-; GCN: s_sub
-; GCN: s_sub
-; GCN-DAG: v_cmp_gt
-; GCN-DAG: v_cmp_gt
-; GCN-DAG: v_cndmask_b32
-; GCN-DAG: v_cndmask_b32
-; GCN: v_add_i32
-; GCN: v_add_i32
+; GCN: s_abs_i32
+; GCN: s_abs_i32
+; GCN: s_add_i32
+; GCN: s_add_i32
 define void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind {
   %z0 = insertelement <2 x i32> undef, i32 0, i32 0
   %z1 = insertelement <2 x i32> %z0, i32 0, i32 1
@@ -55,13 +48,10 @@
 ; FUNC-LABEL: {{^}}v_abs_v2i32:
 ; GCN: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
 ; GCN: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
-; TODO: this should use v_max_i32
-; GCNX: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
-; GCNX: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
-; GCN-DAG: v_cmp_gt
-; GCN-DAG: v_cmp_gt
-; GCN-DAG: v_cndmask_b32
-; GCN-DAG: v_cndmask_b32
+
+; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
+; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
+
 ; GCN: v_add_i32
 ; GCN: v_add_i32
 define void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %src) nounwind {
@@ -80,24 +70,15 @@
 
 ; FUNC-LABEL: {{^}}s_abs_v4i32:
 ; TODO: this should use s_abs_i32
-; GCNX: s_abs_i32
-; GCNX: s_abs_i32
-; GCNX: s_abs_i32
-; GCNX: s_abs_i32
-; GCN: s_sub
-; GCN: s_sub
-; GCN: s_sub
-; GCN: s_sub
-; GCN-DAG: v_cmp_gt
-; GCN-DAG: v_cmp_gt
-; GCN-DAG: v_cmp_gt
-; GCN-DAG: v_cmp_gt
-; GCN-DAG: v_cndmask_b32
-; GCN-DAG: v_cndmask_b32
-; GCN-DAG: v_cndmask_b32
-; GCN-DAG: v_cndmask_b32
-; GCN: v_add_i32
-; GCN: v_add_i32
+; GCN: s_abs_i32
+; GCN: s_abs_i32
+; GCN: s_abs_i32
+; GCN: s_abs_i32
+
+; GCN: s_add_i32
+; GCN: s_add_i32
+; GCN: s_add_i32
+; GCN: s_add_i32
 define void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind {
   %z0 = insertelement <4 x i32> undef, i32 0, i32 0
   %z1 = insertelement <4 x i32> %z0, i32 0, i32 1
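
An aside on the abs tests in this file (illustrative sketch, not part of the
patch; the function name is hypothetical). These tests write abs as
max(x, 0 - x); once SMIN/SMAX are matched even when the vector type has to be
scalarized, each element can select s_abs_i32 or v_max_i32 instead of a
compare plus cndmask, as the updated checks above verify:

define i32 @example_abs_i32(i32 %x) {
  %neg = sub i32 0, %x
  %cmp = icmp sgt i32 %x, %neg
  %abs = select i1 %cmp, i32 %x, i32 %neg
  ret i32 %abs
}
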
@@ -120,19 +101,12 @@
 ; GCN: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
 ; GCN: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]]
 ; GCN: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]]
-; TODO: this should use v_max_i32
-; GCNX: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
-; GCNX: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
-; GCNX: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]]
-; GCNX: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]]
-; GCN-DAG: v_cmp_gt
-; GCN-DAG: v_cmp_gt
-; GCN-DAG: v_cmp_gt
-; GCN-DAG: v_cmp_gt
-; GCN-DAG: v_cndmask_b32
-; GCN-DAG: v_cndmask_b32
-; GCN-DAG: v_cndmask_b32
-; GCN-DAG: v_cndmask_b32
+
+; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
+; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
+; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]]
+; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]]
+
 ; GCN: v_add_i32
 ; GCN: v_add_i32
 ; GCN: v_add_i32