Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -176,6 +176,9 @@ int getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise); + int getMinMaxReductionCost(Type *Ty, Type *CondTy, + bool IsPairwiseForm, + bool IsUnsigned); }; } // end namespace llvm Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -483,6 +483,22 @@ return LT.first * getFullRateInstrCost(); } +int AMDGPUTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy, + bool IsPairwise, + bool IsUnsigned) { + EVT OrigTy = TLI->getValueType(DL, Ty); + + // Computes cost on targets that have packed math instructions (which support + // 16-bit types only). + if (IsPairwise || + !ST->hasVOP3PInsts() || + OrigTy.getScalarSizeInBits() != 16) + return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned); + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + return LT.first * getHalfRateInstrCost(); +} + int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index) { switch (Opcode) { Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -6620,7 +6620,7 @@ // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx) // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx) // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt - if (Vec.hasOneUse()) { + if (Vec.hasOneUse() && DCI.isBeforeLegalize()) { SDLoc SL(N); EVT EltVT = N->getValueType(0); SDValue Idx = N->getOperand(1); @@ -6632,6 +6632,12 @@ // TODO: Support other binary operations. 
case ISD::FADD: case ISD::ADD: + case ISD::UMIN: + case ISD::UMAX: + case ISD::SMIN: + case ISD::SMAX: + case ISD::FMAXNUM: + case ISD::FMINNUM: return DAG.getNode(Opc, SL, EltVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec.getOperand(0), Idx), Index: test/CodeGen/AMDGPU/reduction.ll =================================================================== --- test/CodeGen/AMDGPU/reduction.ll +++ test/CodeGen/AMDGPU/reduction.ll @@ -2,8 +2,8 @@ ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s ; GCN-LABEL: {{^}}reduction_half4: -; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], [[ADD:v[0-9]+]], v{{[0-9]+}}{{$}} -; GFX9-NEXT: v_add_f16_sdwa [[ADD]], [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI: v_add_f16_e32 ; VI-NEXT: v_add_f16_e32 @@ -19,8 +19,8 @@ } ; GCN-LABEL: {{^}}reduction_v4i16: -; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[ADD:v[0-9]+]], v{{[0-9]+}}{{$}} -; GFX9-NEXT: v_add_u16_sdwa [[ADD]], [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI: v_add_u16_e32 ; VI-NEXT: v_add_u16_e32 @@ -36,10 +36,10 @@ } ; GCN-LABEL: {{^}}reduction_half8: -; GFX9: v_pk_add_f16 [[ADD1:v[0-9]+]], [[ADD1:v[0-9]+]], v{{[0-9]+}}{{$}} -; GFX9-NEXT: v_pk_add_f16 [[ADD:v[0-9]+]], [[ADD:v[0-9]+]], v{{[0-9]+}}{{$}} -; GFX9-NEXT: v_pk_add_f16 [[ADD:v[0-9]+]], [[ADD]], [[ADD1]]{{$}} -; GFX9-NEXT: v_add_f16_sdwa [[ADD]], [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9: v_pk_add_f16 [[ADD1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: 
v_pk_add_f16 [[ADD2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_pk_add_f16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}} +; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI: v_add_f16_e32 ; VI-NEXT: v_add_f16_e32 @@ -62,10 +62,10 @@ } ; GCN-LABEL: {{^}}reduction_v8i16: -; GFX9: v_pk_add_u16 [[ADD1]], [[ADD1:v[0-9]+]], v{{[0-9]+}}{{$}} -; GFX9-NEXT: v_pk_add_u16 [[ADD]], [[ADD]], v{{[0-9]+}}{{$}} -; GFX9-NEXT: v_pk_add_u16 [[ADD]], [[ADD]], [[ADD1]]{{$}} -; GFX9-NEXT: v_add_u16_sdwa [[ADD]], [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9: v_pk_add_u16 [[ADD1]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_pk_add_u16 [[ADD2]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_pk_add_u16 [[ADD3]], [[ADD2]], [[ADD1]]{{$}} +; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI: v_add_u16_e32 ; VI-NEXT: v_add_u16_e32 @@ -92,10 +92,10 @@ ; GFX9-NEXT: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} -; GFX9-NEXT: v_pk_add_f16 [[ADD1]], [[ADD1]], v{{[0-9]+}}{{$}} -; GFX9-NEXT: v_pk_add_f16 [[ADD]], [[ADD]], v{{[0-9]+}}{{$}} -; GFX9-NEXT: v_pk_add_f16 [[ADD]], [[ADD]], [[ADD1]]{{$}} -; GFX9-NEXT: v_add_f16_sdwa [[ADD]], [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_pk_add_f16 [[ADD1]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_pk_add_f16 [[ADD2]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_pk_add_f16 [[ADD3]], [[ADD2]], [[ADD1]]{{$}} +; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI: v_add_f16_e32 ; VI-NEXT: v_add_f16_e32 @@ -125,4 +125,288 @@ %bin.rdx6 = fadd fast <16 x half> %bin.rdx4, 
%rdx.shuf5 %res = extractelement <16 x half> %bin.rdx6, i32 0 ret half %res +} + +; GCN-LABEL: {{^}}reduction_min_v4i16: +; GFX9: v_pk_min_u16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 + +; VI: v_min_u16_e32 +; VI-NEXT: v_min_u16_e32 +; VI-NEXT: v_min_u16_e32 +define i16 @reduction_min_v4i16(<4 x i16> %vec4) { +entry: + %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + %rdx.minmax.cmp = icmp ult <4 x i16> %vec4, %rdx.shuf + %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x i16> %vec4, <4 x i16> %rdx.shuf + %rdx.shuf1 = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %rdx.minmax.cmp2 = icmp ult <4 x i16> %rdx.minmax.select, %rdx.shuf1 + %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf1 + %res = extractelement <4 x i16> %rdx.minmax.select3, i32 0 + ret i16 %res +} + +; GCN-LABEL: {{^}}reduction_umin_v8i16: +; GFX9: v_pk_min_u16 [[MIN1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_pk_min_u16 [[MIN2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_pk_min_u16 [[MIN3:v[0-9]+]], [[MIN2]], [[MIN1]]{{$}} +; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, [[MIN3]], [[MIN3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 + +; VI: v_min_u16_e32 +; VI-NEXT: v_min_u16_e32 +; VI-NEXT: v_min_u16_e32 +; VI-NEXT: v_min_u16_e32 +; VI-NEXT: v_min_u16_e32 +; VI-NEXT: v_min_u16_e32 +; VI-NEXT: v_min_u16_e32 +define i16 @reduction_umin_v8i16(<8 x i16> %vec8) { +entry: + %rdx.shuf = shufflevector <8 x i16> %vec8, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %rdx.minmax.cmp = icmp ult <8 x i16> %vec8, %rdx.shuf + %rdx.minmax.select = select <8 x i1> %rdx.minmax.cmp, <8 x i16> %vec8, <8 x i16> %rdx.shuf + %rdx.shuf1 = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %rdx.minmax.cmp2 = icmp ult <8 x i16> 
%rdx.minmax.select, %rdx.shuf1 + %rdx.minmax.select3 = select <8 x i1> %rdx.minmax.cmp2, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf1 + %rdx.shuf4 = shufflevector <8 x i16> %rdx.minmax.select3, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %rdx.minmax.cmp5 = icmp ult <8 x i16> %rdx.minmax.select3, %rdx.shuf4 + %rdx.minmax.select6 = select <8 x i1> %rdx.minmax.cmp5, <8 x i16> %rdx.minmax.select3, <8 x i16> %rdx.shuf4 + %res = extractelement <8 x i16> %rdx.minmax.select6, i32 0 + ret i16 %res +} + +; Tests to make sure that without SLP vectorization more instructions are generated. +; GCN-LABEL: {{^}}reduction_umin_v8i16_woslp: +; GFX9: v_lshrrev_b32_e32 +; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 +; GFX9-NEXT: v_min3_u16 +; GFX9-NEXT: v_lshrrev_b32_e32 +; GFX9-NEXT: v_min3_u16 +; GFX9-NEXT: v_min3_u16 +define i16 @reduction_umin_v8i16_woslp(<8 x i16> %vec8) { +entry: + %elt0 = extractelement <8 x i16> %vec8, i64 0 + %elt1 = extractelement <8 x i16> %vec8, i64 1 + %elt2 = extractelement <8 x i16> %vec8, i64 2 + %elt3 = extractelement <8 x i16> %vec8, i64 3 + %elt4 = extractelement <8 x i16> %vec8, i64 4 + %elt5 = extractelement <8 x i16> %vec8, i64 5 + %elt6 = extractelement <8 x i16> %vec8, i64 6 + %elt7 = extractelement <8 x i16> %vec8, i64 7 + + %cmp0 = icmp ult i16 %elt1, %elt0 + %min1 = select i1 %cmp0, i16 %elt1, i16 %elt0 + %cmp1 = icmp ult i16 %elt2, %min1 + %min2 = select i1 %cmp1, i16 %elt2, i16 %min1 + %cmp2 = icmp ult i16 %elt3, %min2 + %min3 = select i1 %cmp2, i16 %elt3, i16 %min2 + + %cmp3 = icmp ult i16 %elt4, %min3 + %min4 = select i1 %cmp3, i16 %elt4, i16 %min3 + %cmp4 = icmp ult i16 %elt5, %min4 + %min5 = select i1 %cmp4, i16 %elt5, i16 %min4 + + %cmp5 = icmp ult i16 %elt6, %min5 + %min6 = select i1 %cmp5, i16 %elt6, i16 %min5 + %cmp6 = icmp ult i16 %elt7, %min6 + %min7 = select i1 %cmp6, i16 %elt7, i16 %min6 + + ret i16 %min7 +} + +; GCN-LABEL: 
{{^}}reduction_smin_v16i16: +; GFX9: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_min_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 + +; VI: v_min_i16_e32 +; VI-NEXT: v_min_i16_e32 +; VI-NEXT: v_min_i16_e32 +; VI-NEXT: v_min_i16_e32 +; VI-NEXT: v_min_i16_e32 +; VI-NEXT: v_min_i16_e32 +; VI-NEXT: v_min_i16_e32 +; VI-NEXT: v_min_i16_e32 +; VI-NEXT: v_min_i16_e32 +; VI-NEXT: v_min_i16_e32 +; VI-NEXT: v_min_i16_e32 +; VI-NEXT: v_min_i16_e32 +; VI-NEXT: v_min_i16_e32 +; VI-NEXT: v_min_i16_e32 +; VI-NEXT: v_min_i16_e32 +define i16 @reduction_smin_v16i16(<16 x i16> %vec16) { +entry: + %rdx.shuf = shufflevector <16 x i16> %vec16, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp = icmp slt <16 x i16> %vec16, %rdx.shuf + %rdx.minmax.select = select <16 x i1> %rdx.minmax.cmp, <16 x i16> %vec16, <16 x i16> %rdx.shuf + %rdx.shuf1 = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp2 = icmp slt <16 x i16> %rdx.minmax.select, %rdx.shuf1 + %rdx.minmax.select3 = select <16 x i1> %rdx.minmax.cmp2, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf1 + %rdx.shuf4 = shufflevector <16 x i16> %rdx.minmax.select3, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp5 = icmp slt <16 x i16> %rdx.minmax.select3, %rdx.shuf4 + %rdx.minmax.select6 = select <16 x i1> %rdx.minmax.cmp5, <16 x i16> %rdx.minmax.select3, <16 x i16> %rdx.shuf4 + %rdx.shuf7 = shufflevector <16 x i16> %rdx.minmax.select6, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp8 = icmp slt 
<16 x i16> %rdx.minmax.select6, %rdx.shuf7 + %rdx.minmax.select9 = select <16 x i1> %rdx.minmax.cmp8, <16 x i16> %rdx.minmax.select6, <16 x i16> %rdx.shuf7 + %res = extractelement <16 x i16> %rdx.minmax.select9, i32 0 + ret i16 %res +} + +; Tests to make sure without slp the number of instructions are more. +; GCN-LABEL: {{^}}reduction_smin_v16i16_woslp: +; GFX9: v_lshrrev_b32_e32 +; GFX9-NEXT: v_min_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 +; GFX9-NEXT: v_min3_i16 +; GFX9-NEXT: v_lshrrev_b32_e32 +; GFX9-NEXT: v_min3_i16 +; GFX9-NEXT: v_lshrrev_b32_e32 +; GFX9-NEXT: v_min3_i16 +; GFX9-NEXT: v_lshrrev_b32_e32 +; GFX9-NEXT: v_min3_i16 +; GFX9-NEXT: v_lshrrev_b32_e32 +; GFX9-NEXT: v_min3_i16 +; GFX9-NEXT: v_lshrrev_b32_e32 +; GFX9-NEXT: v_min3_i16 +; GFX9-NEXT: v_min3_i16 +define i16 @reduction_smin_v16i16_woslp(<16 x i16> %vec16) { +entry: + %elt0 = extractelement <16 x i16> %vec16, i64 0 + %elt1 = extractelement <16 x i16> %vec16, i64 1 + %elt2 = extractelement <16 x i16> %vec16, i64 2 + %elt3 = extractelement <16 x i16> %vec16, i64 3 + %elt4 = extractelement <16 x i16> %vec16, i64 4 + %elt5 = extractelement <16 x i16> %vec16, i64 5 + %elt6 = extractelement <16 x i16> %vec16, i64 6 + %elt7 = extractelement <16 x i16> %vec16, i64 7 + + %elt8 = extractelement <16 x i16> %vec16, i64 8 + %elt9 = extractelement <16 x i16> %vec16, i64 9 + %elt10 = extractelement <16 x i16> %vec16, i64 10 + %elt11 = extractelement <16 x i16> %vec16, i64 11 + %elt12 = extractelement <16 x i16> %vec16, i64 12 + %elt13 = extractelement <16 x i16> %vec16, i64 13 + %elt14 = extractelement <16 x i16> %vec16, i64 14 + %elt15 = extractelement <16 x i16> %vec16, i64 15 + + %cmp0 = icmp slt i16 %elt1, %elt0 + %min1 = select i1 %cmp0, i16 %elt1, i16 %elt0 + %cmp1 = icmp slt i16 %elt2, %min1 + %min2 = select i1 %cmp1, i16 %elt2, i16 %min1 + %cmp2 = icmp slt i16 %elt3, %min2 + %min3 = select i1 
%cmp2, i16 %elt3, i16 %min2 + + %cmp3 = icmp slt i16 %elt4, %min3 + %min4 = select i1 %cmp3, i16 %elt4, i16 %min3 + %cmp4 = icmp slt i16 %elt5, %min4 + %min5 = select i1 %cmp4, i16 %elt5, i16 %min4 + + %cmp5 = icmp slt i16 %elt6, %min5 + %min6 = select i1 %cmp5, i16 %elt6, i16 %min5 + %cmp6 = icmp slt i16 %elt7, %min6 + %min7 = select i1 %cmp6, i16 %elt7, i16 %min6 + + %cmp7 = icmp slt i16 %elt8, %min7 + %min8 = select i1 %cmp7, i16 %elt8, i16 %min7 + %cmp8 = icmp slt i16 %elt9, %min8 + %min9 = select i1 %cmp8, i16 %elt9, i16 %min8 + + %cmp9 = icmp slt i16 %elt10, %min9 + %min10 = select i1 %cmp9, i16 %elt10, i16 %min9 + %cmp10 = icmp slt i16 %elt11, %min10 + %min11 = select i1 %cmp10, i16 %elt11, i16 %min10 + + %cmp11 = icmp slt i16 %elt12, %min11 + %min12 = select i1 %cmp11, i16 %elt12, i16 %min11 + %cmp12 = icmp slt i16 %elt13, %min12 + %min13 = select i1 %cmp12, i16 %elt13, i16 %min12 + + %cmp13 = icmp slt i16 %elt14, %min13 + %min14 = select i1 %cmp13, i16 %elt14, i16 %min13 + %cmp14 = icmp slt i16 %elt15, %min14 + %min15 = select i1 %cmp14, i16 %elt15, i16 %min14 + + + ret i16 %min15 +} + +; GCN-LABEL: {{^}}reduction_umax_v4i16: +; GFX9: v_pk_max_u16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_max_u16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 + +; VI: v_max_u16_e32 +; VI-NEXT: v_max_u16_e32 +; VI-NEXT: v_max_u16_e32 +define i16 @reduction_umax_v4i16(<4 x i16> %vec4) { +entry: + %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> undef, <4 x i32> + %rdx.minmax.cmp = icmp ugt <4 x i16> %vec4, %rdx.shuf + %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x i16> %vec4, <4 x i16> %rdx.shuf + %rdx.shuf1 = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> undef, <4 x i32> + %rdx.minmax.cmp2 = icmp ugt <4 x i16> %rdx.minmax.select, %rdx.shuf1 + %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf1 + %res = extractelement <4 x 
i16> %rdx.minmax.select3, i32 0 + ret i16 %res +} + +; GCN-LABEL: {{^}}reduction_smax_v4i16: +; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_max_i16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 + +; VI: v_max_i16_e32 +; VI-NEXT: v_max_i16_e32 +; VI-NEXT: v_max_i16_e32 +define i16 @reduction_smax_v4i16(<4 x i16> %vec4) #0 { +entry: + %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> undef, <4 x i32> + %rdx.minmax.cmp = icmp sgt <4 x i16> %vec4, %rdx.shuf + %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x i16> %vec4, <4 x i16> %rdx.shuf + %rdx.shuf1 = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> undef, <4 x i32> + %rdx.minmax.cmp2 = icmp sgt <4 x i16> %rdx.minmax.select, %rdx.shuf1 + %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf1 + %res = extractelement <4 x i16> %rdx.minmax.select3, i32 0 + ret i16 %res +} + +; GCN-LABEL: {{^}}reduction_fmax_v4half: +; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 + +; VI: v_max_f16_e32 +; VI-NEXT: v_max_f16_e32 +; VI-NEXT: v_max_f16_e32 +define half @reduction_fmax_v4half(<4 x half> %vec4) { +entry: + %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> + %rdx.minmax.cmp = fcmp fast ogt <4 x half> %vec4, %rdx.shuf + %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x half> %vec4, <4 x half> %rdx.shuf + %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax.select, <4 x half> undef, <4 x i32> + %rdx.minmax.cmp2 = fcmp fast ogt <4 x half> %rdx.minmax.select, %rdx.shuf1 + %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1 + %res = extractelement <4 x half> %rdx.minmax.select3, i32 0 + ret half %res +} + +; GCN-LABEL: 
{{^}}reduction_fmin_v4half: +; GFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 + +; VI: v_min_f16_e32 +; VI-NEXT: v_min_f16_e32 +; VI-NEXT: v_min_f16_e32 +define half @reduction_fmin_v4half(<4 x half> %vec4) { +entry: + %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> + %rdx.minmax.cmp = fcmp fast olt <4 x half> %vec4, %rdx.shuf + %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x half> %vec4, <4 x half> %rdx.shuf + %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax.select, <4 x half> undef, <4 x i32> + %rdx.minmax.cmp2 = fcmp fast olt <4 x half> %rdx.minmax.select, %rdx.shuf1 + %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1 + %res = extractelement <4 x half> %rdx.minmax.select3, i32 0 + ret half %res } \ No newline at end of file Index: test/Transforms/SLPVectorizer/AMDGPU/reduction.ll =================================================================== --- test/Transforms/SLPVectorizer/AMDGPU/reduction.ll +++ test/Transforms/SLPVectorizer/AMDGPU/reduction.ll @@ -286,22 +286,31 @@ ret i16 %add7 } -; FIXME: This should be vectorized on GFX9. 
- -define i16 @reduction_icmp_v4i16(<4 x i16> %vec4) { -; GCN-LABEL: @reduction_icmp_v4i16( -; GCN-NEXT: entry: -; GCN-NEXT: [[ELT0:%.*]] = extractelement <4 x i16> [[VEC4:%.*]], i64 0 -; GCN-NEXT: [[ELT1:%.*]] = extractelement <4 x i16> [[VEC4]], i64 1 -; GCN-NEXT: [[ELT2:%.*]] = extractelement <4 x i16> [[VEC4]], i64 2 -; GCN-NEXT: [[ELT3:%.*]] = extractelement <4 x i16> [[VEC4]], i64 3 -; GCN-NEXT: [[CMP1:%.*]] = icmp ult i16 [[ELT1]], [[ELT0]] -; GCN-NEXT: [[MIN1:%.*]] = select i1 [[CMP1]], i16 [[ELT1]], i16 [[ELT0]] -; GCN-NEXT: [[CMP2:%.*]] = icmp ult i16 [[ELT2]], [[MIN1]] -; GCN-NEXT: [[MIN2:%.*]] = select i1 [[CMP2]], i16 [[ELT2]], i16 [[MIN1]] -; GCN-NEXT: [[CMP3:%.*]] = icmp ult i16 [[ELT3]], [[MIN2]] -; GCN-NEXT: [[MIN3:%.*]] = select i1 [[CMP3]], i16 [[ELT3]], i16 [[MIN2]] -; GCN-NEXT: ret i16 [[MIN3]] +define i16 @reduction_umin_v4i16(<4 x i16> %vec4) { +; GFX9-LABEL: @reduction_umin_v4i16( +; GFX9-NEXT: entry: +; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[VEC4:%.*]], <4 x i16> undef, <4 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <4 x i16> [[VEC4]], [[RDX_SHUF]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i16> [[VEC4]], <4 x i16> [[RDX_SHUF]] +; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> undef, <4 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp ult <4 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> [[RDX_SHUF1]] +; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[RDX_MINMAX_SELECT3]], i32 0 +; GFX9-NEXT: ret i16 [[TMP0]] +; +; VI-LABEL: @reduction_umin_v4i16( +; VI-NEXT: entry: +; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x i16> [[VEC4:%.*]], i64 0 +; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x i16> [[VEC4]], i64 1 +; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x i16> [[VEC4]], i64 2 +; VI-NEXT: [[ELT3:%.*]] = 
extractelement <4 x i16> [[VEC4]], i64 3 +; VI-NEXT: [[CMP1:%.*]] = icmp ult i16 [[ELT1]], [[ELT0]] +; VI-NEXT: [[MIN1:%.*]] = select i1 [[CMP1]], i16 [[ELT1]], i16 [[ELT0]] +; VI-NEXT: [[CMP2:%.*]] = icmp ult i16 [[ELT2]], [[MIN1]] +; VI-NEXT: [[MIN2:%.*]] = select i1 [[CMP2]], i16 [[ELT2]], i16 [[MIN1]] +; VI-NEXT: [[CMP3:%.*]] = icmp ult i16 [[ELT3]], [[MIN2]] +; VI-NEXT: [[MIN3:%.*]] = select i1 [[CMP3]], i16 [[ELT3]], i16 [[MIN2]] +; VI-NEXT: ret i16 [[MIN3]] ; entry: %elt0 = extractelement <4 x i16> %vec4, i64 0 @@ -319,6 +328,373 @@ ret i16 %min3 } +define i16 @reduction_icmp_v8i16(<8 x i16> %vec8) { +; GFX9-LABEL: @reduction_icmp_v8i16( +; GFX9-NEXT: entry: +; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[VEC8:%.*]], <8 x i16> undef, <8 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <8 x i16> [[VEC8]], [[RDX_SHUF]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i16> [[VEC8]], <8 x i16> [[RDX_SHUF]] +; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i16> [[RDX_MINMAX_SELECT]], <8 x i16> undef, <8 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp ult <8 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i16> [[RDX_MINMAX_SELECT]], <8 x i16> [[RDX_SHUF1]] +; GFX9-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x i16> [[RDX_MINMAX_SELECT3]], <8 x i16> undef, <8 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp ult <8 x i16> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i16> [[RDX_MINMAX_SELECT3]], <8 x i16> [[RDX_SHUF4]] +; GFX9-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[RDX_MINMAX_SELECT6]], i32 0 +; GFX9-NEXT: ret i16 [[TMP0]] +; +; VI-LABEL: @reduction_icmp_v8i16( +; VI-NEXT: entry: +; VI-NEXT: [[ELT0:%.*]] = extractelement <8 x i16> [[VEC8:%.*]], i64 0 +; VI-NEXT: [[ELT1:%.*]] = extractelement <8 x i16> [[VEC8]], i64 1 +; VI-NEXT: [[ELT2:%.*]] = 
extractelement <8 x i16> [[VEC8]], i64 2 +; VI-NEXT: [[ELT3:%.*]] = extractelement <8 x i16> [[VEC8]], i64 3 +; VI-NEXT: [[ELT4:%.*]] = extractelement <8 x i16> [[VEC8]], i64 4 +; VI-NEXT: [[ELT5:%.*]] = extractelement <8 x i16> [[VEC8]], i64 5 +; VI-NEXT: [[ELT6:%.*]] = extractelement <8 x i16> [[VEC8]], i64 6 +; VI-NEXT: [[ELT7:%.*]] = extractelement <8 x i16> [[VEC8]], i64 7 +; VI-NEXT: [[CMP0:%.*]] = icmp ult i16 [[ELT1]], [[ELT0]] +; VI-NEXT: [[MIN1:%.*]] = select i1 [[CMP0]], i16 [[ELT1]], i16 [[ELT0]] +; VI-NEXT: [[CMP1:%.*]] = icmp ult i16 [[ELT2]], [[MIN1]] +; VI-NEXT: [[MIN2:%.*]] = select i1 [[CMP1]], i16 [[ELT2]], i16 [[MIN1]] +; VI-NEXT: [[CMP2:%.*]] = icmp ult i16 [[ELT3]], [[MIN2]] +; VI-NEXT: [[MIN3:%.*]] = select i1 [[CMP2]], i16 [[ELT3]], i16 [[MIN2]] +; VI-NEXT: [[CMP3:%.*]] = icmp ult i16 [[ELT4]], [[MIN3]] +; VI-NEXT: [[MIN4:%.*]] = select i1 [[CMP3]], i16 [[ELT4]], i16 [[MIN3]] +; VI-NEXT: [[CMP4:%.*]] = icmp ult i16 [[ELT5]], [[MIN4]] +; VI-NEXT: [[MIN5:%.*]] = select i1 [[CMP4]], i16 [[ELT5]], i16 [[MIN4]] +; VI-NEXT: [[CMP5:%.*]] = icmp ult i16 [[ELT6]], [[MIN5]] +; VI-NEXT: [[MIN6:%.*]] = select i1 [[CMP5]], i16 [[ELT6]], i16 [[MIN5]] +; VI-NEXT: [[CMP6:%.*]] = icmp ult i16 [[ELT7]], [[MIN6]] +; VI-NEXT: [[MIN7:%.*]] = select i1 [[CMP6]], i16 [[ELT7]], i16 [[MIN6]] +; VI-NEXT: ret i16 [[MIN7]] +; +entry: + %elt0 = extractelement <8 x i16> %vec8, i64 0 + %elt1 = extractelement <8 x i16> %vec8, i64 1 + %elt2 = extractelement <8 x i16> %vec8, i64 2 + %elt3 = extractelement <8 x i16> %vec8, i64 3 + %elt4 = extractelement <8 x i16> %vec8, i64 4 + %elt5 = extractelement <8 x i16> %vec8, i64 5 + %elt6 = extractelement <8 x i16> %vec8, i64 6 + %elt7 = extractelement <8 x i16> %vec8, i64 7 + + %cmp0 = icmp ult i16 %elt1, %elt0 + %min1 = select i1 %cmp0, i16 %elt1, i16 %elt0 + %cmp1 = icmp ult i16 %elt2, %min1 + %min2 = select i1 %cmp1, i16 %elt2, i16 %min1 + %cmp2 = icmp ult i16 %elt3, %min2 + %min3 = select i1 %cmp2, i16 %elt3, i16 %min2 + + %cmp3 
= icmp ult i16 %elt4, %min3 + %min4 = select i1 %cmp3, i16 %elt4, i16 %min3 + %cmp4 = icmp ult i16 %elt5, %min4 + %min5 = select i1 %cmp4, i16 %elt5, i16 %min4 + + %cmp5 = icmp ult i16 %elt6, %min5 + %min6 = select i1 %cmp5, i16 %elt6, i16 %min5 + %cmp6 = icmp ult i16 %elt7, %min6 + %min7 = select i1 %cmp6, i16 %elt7, i16 %min6 + + ret i16 %min7 +} + +define i16 @reduction_smin_v16i16(<16 x i16> %vec16) { +; GFX9-LABEL: @reduction_smin_v16i16( +; GFX9-NEXT: entry: +; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x i16> [[VEC16:%.*]], <16 x i16> undef, <16 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <16 x i16> [[VEC16]], [[RDX_SHUF]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x i16> [[VEC16]], <16 x i16> [[RDX_SHUF]] +; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i16> [[RDX_MINMAX_SELECT]], <16 x i16> undef, <16 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp slt <16 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP2]], <16 x i16> [[RDX_MINMAX_SELECT]], <16 x i16> [[RDX_SHUF1]] +; GFX9-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <16 x i16> [[RDX_MINMAX_SELECT3]], <16 x i16> undef, <16 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp slt <16 x i16> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP5]], <16 x i16> [[RDX_MINMAX_SELECT3]], <16 x i16> [[RDX_SHUF4]] +; GFX9-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <16 x i16> [[RDX_MINMAX_SELECT6]], <16 x i16> undef, <16 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP8:%.*]] = icmp slt <16 x i16> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x i16> [[RDX_MINMAX_SELECT6]], <16 x i16> [[RDX_SHUF7]] +; GFX9-NEXT: [[TMP0:%.*]] = extractelement <16 x i16> [[RDX_MINMAX_SELECT9]], i32 0 +; GFX9-NEXT: ret i16 [[TMP0]] +; +; VI-LABEL: @reduction_smin_v16i16( +; VI-NEXT: 
entry: +; VI-NEXT: [[ELT0:%.*]] = extractelement <16 x i16> [[VEC16:%.*]], i64 0 +; VI-NEXT: [[ELT1:%.*]] = extractelement <16 x i16> [[VEC16]], i64 1 +; VI-NEXT: [[ELT2:%.*]] = extractelement <16 x i16> [[VEC16]], i64 2 +; VI-NEXT: [[ELT3:%.*]] = extractelement <16 x i16> [[VEC16]], i64 3 +; VI-NEXT: [[ELT4:%.*]] = extractelement <16 x i16> [[VEC16]], i64 4 +; VI-NEXT: [[ELT5:%.*]] = extractelement <16 x i16> [[VEC16]], i64 5 +; VI-NEXT: [[ELT6:%.*]] = extractelement <16 x i16> [[VEC16]], i64 6 +; VI-NEXT: [[ELT7:%.*]] = extractelement <16 x i16> [[VEC16]], i64 7 +; VI-NEXT: [[ELT8:%.*]] = extractelement <16 x i16> [[VEC16]], i64 8 +; VI-NEXT: [[ELT9:%.*]] = extractelement <16 x i16> [[VEC16]], i64 9 +; VI-NEXT: [[ELT10:%.*]] = extractelement <16 x i16> [[VEC16]], i64 10 +; VI-NEXT: [[ELT11:%.*]] = extractelement <16 x i16> [[VEC16]], i64 11 +; VI-NEXT: [[ELT12:%.*]] = extractelement <16 x i16> [[VEC16]], i64 12 +; VI-NEXT: [[ELT13:%.*]] = extractelement <16 x i16> [[VEC16]], i64 13 +; VI-NEXT: [[ELT14:%.*]] = extractelement <16 x i16> [[VEC16]], i64 14 +; VI-NEXT: [[ELT15:%.*]] = extractelement <16 x i16> [[VEC16]], i64 15 +; VI-NEXT: [[CMP0:%.*]] = icmp slt i16 [[ELT1]], [[ELT0]] +; VI-NEXT: [[MIN1:%.*]] = select i1 [[CMP0]], i16 [[ELT1]], i16 [[ELT0]] +; VI-NEXT: [[CMP1:%.*]] = icmp slt i16 [[ELT2]], [[MIN1]] +; VI-NEXT: [[MIN2:%.*]] = select i1 [[CMP1]], i16 [[ELT2]], i16 [[MIN1]] +; VI-NEXT: [[CMP2:%.*]] = icmp slt i16 [[ELT3]], [[MIN2]] +; VI-NEXT: [[MIN3:%.*]] = select i1 [[CMP2]], i16 [[ELT3]], i16 [[MIN2]] +; VI-NEXT: [[CMP3:%.*]] = icmp slt i16 [[ELT4]], [[MIN3]] +; VI-NEXT: [[MIN4:%.*]] = select i1 [[CMP3]], i16 [[ELT4]], i16 [[MIN3]] +; VI-NEXT: [[CMP4:%.*]] = icmp slt i16 [[ELT5]], [[MIN4]] +; VI-NEXT: [[MIN5:%.*]] = select i1 [[CMP4]], i16 [[ELT5]], i16 [[MIN4]] +; VI-NEXT: [[CMP5:%.*]] = icmp slt i16 [[ELT6]], [[MIN5]] +; VI-NEXT: [[MIN6:%.*]] = select i1 [[CMP5]], i16 [[ELT6]], i16 [[MIN5]] +; VI-NEXT: [[CMP6:%.*]] = icmp slt i16 [[ELT7]], [[MIN6]] 
+; VI-NEXT: [[MIN7:%.*]] = select i1 [[CMP6]], i16 [[ELT7]], i16 [[MIN6]] +; VI-NEXT: [[CMP7:%.*]] = icmp slt i16 [[ELT8]], [[MIN7]] +; VI-NEXT: [[MIN8:%.*]] = select i1 [[CMP7]], i16 [[ELT8]], i16 [[MIN7]] +; VI-NEXT: [[CMP8:%.*]] = icmp slt i16 [[ELT9]], [[MIN8]] +; VI-NEXT: [[MIN9:%.*]] = select i1 [[CMP8]], i16 [[ELT9]], i16 [[MIN8]] +; VI-NEXT: [[CMP9:%.*]] = icmp slt i16 [[ELT10]], [[MIN9]] +; VI-NEXT: [[MIN10:%.*]] = select i1 [[CMP9]], i16 [[ELT10]], i16 [[MIN9]] +; VI-NEXT: [[CMP10:%.*]] = icmp slt i16 [[ELT11]], [[MIN10]] +; VI-NEXT: [[MIN11:%.*]] = select i1 [[CMP10]], i16 [[ELT11]], i16 [[MIN10]] +; VI-NEXT: [[CMP11:%.*]] = icmp slt i16 [[ELT12]], [[MIN11]] +; VI-NEXT: [[MIN12:%.*]] = select i1 [[CMP11]], i16 [[ELT12]], i16 [[MIN11]] +; VI-NEXT: [[CMP12:%.*]] = icmp slt i16 [[ELT13]], [[MIN12]] +; VI-NEXT: [[MIN13:%.*]] = select i1 [[CMP12]], i16 [[ELT13]], i16 [[MIN12]] +; VI-NEXT: [[CMP13:%.*]] = icmp slt i16 [[ELT14]], [[MIN13]] +; VI-NEXT: [[MIN14:%.*]] = select i1 [[CMP13]], i16 [[ELT14]], i16 [[MIN13]] +; VI-NEXT: [[CMP14:%.*]] = icmp slt i16 [[ELT15]], [[MIN14]] +; VI-NEXT: [[MIN15:%.*]] = select i1 [[CMP14]], i16 [[ELT15]], i16 [[MIN14]] +; VI-NEXT: ret i16 [[MIN15]] +; +entry: + %elt0 = extractelement <16 x i16> %vec16, i64 0 + %elt1 = extractelement <16 x i16> %vec16, i64 1 + %elt2 = extractelement <16 x i16> %vec16, i64 2 + %elt3 = extractelement <16 x i16> %vec16, i64 3 + %elt4 = extractelement <16 x i16> %vec16, i64 4 + %elt5 = extractelement <16 x i16> %vec16, i64 5 + %elt6 = extractelement <16 x i16> %vec16, i64 6 + %elt7 = extractelement <16 x i16> %vec16, i64 7 + + %elt8 = extractelement <16 x i16> %vec16, i64 8 + %elt9 = extractelement <16 x i16> %vec16, i64 9 + %elt10 = extractelement <16 x i16> %vec16, i64 10 + %elt11 = extractelement <16 x i16> %vec16, i64 11 + %elt12 = extractelement <16 x i16> %vec16, i64 12 + %elt13 = extractelement <16 x i16> %vec16, i64 13 + %elt14 = extractelement <16 x i16> %vec16, i64 14 + %elt15 = 
extractelement <16 x i16> %vec16, i64 15 + + %cmp0 = icmp slt i16 %elt1, %elt0 + %min1 = select i1 %cmp0, i16 %elt1, i16 %elt0 + %cmp1 = icmp slt i16 %elt2, %min1 + %min2 = select i1 %cmp1, i16 %elt2, i16 %min1 + %cmp2 = icmp slt i16 %elt3, %min2 + %min3 = select i1 %cmp2, i16 %elt3, i16 %min2 + + %cmp3 = icmp slt i16 %elt4, %min3 + %min4 = select i1 %cmp3, i16 %elt4, i16 %min3 + %cmp4 = icmp slt i16 %elt5, %min4 + %min5 = select i1 %cmp4, i16 %elt5, i16 %min4 + + %cmp5 = icmp slt i16 %elt6, %min5 + %min6 = select i1 %cmp5, i16 %elt6, i16 %min5 + %cmp6 = icmp slt i16 %elt7, %min6 + %min7 = select i1 %cmp6, i16 %elt7, i16 %min6 + + %cmp7 = icmp slt i16 %elt8, %min7 + %min8 = select i1 %cmp7, i16 %elt8, i16 %min7 + %cmp8 = icmp slt i16 %elt9, %min8 + %min9 = select i1 %cmp8, i16 %elt9, i16 %min8 + + %cmp9 = icmp slt i16 %elt10, %min9 + %min10 = select i1 %cmp9, i16 %elt10, i16 %min9 + %cmp10 = icmp slt i16 %elt11, %min10 + %min11 = select i1 %cmp10, i16 %elt11, i16 %min10 + + %cmp11 = icmp slt i16 %elt12, %min11 + %min12 = select i1 %cmp11, i16 %elt12, i16 %min11 + %cmp12 = icmp slt i16 %elt13, %min12 + %min13 = select i1 %cmp12, i16 %elt13, i16 %min12 + + %cmp13 = icmp slt i16 %elt14, %min13 + %min14 = select i1 %cmp13, i16 %elt14, i16 %min13 + %cmp14 = icmp slt i16 %elt15, %min14 + %min15 = select i1 %cmp14, i16 %elt15, i16 %min14 + + + ret i16 %min15 +} + +define i16 @reduction_umax_v4i16(<4 x i16> %vec4) { +; GFX9-LABEL: @reduction_umax_v4i16( +; GFX9-NEXT: entry: +; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[VEC4:%.*]], <4 x i16> undef, <4 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ugt <4 x i16> [[VEC4]], [[RDX_SHUF]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i16> [[VEC4]], <4 x i16> [[RDX_SHUF]] +; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> undef, <4 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp ugt <4 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] +; 
GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> [[RDX_SHUF1]] +; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[RDX_MINMAX_SELECT3]], i32 0 +; GFX9-NEXT: ret i16 [[TMP0]] +; +; VI-LABEL: @reduction_umax_v4i16( +; VI-NEXT: entry: +; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x i16> [[VEC4:%.*]], i64 0 +; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x i16> [[VEC4]], i64 1 +; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x i16> [[VEC4]], i64 2 +; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x i16> [[VEC4]], i64 3 +; VI-NEXT: [[CMP1:%.*]] = icmp ugt i16 [[ELT1]], [[ELT0]] +; VI-NEXT: [[MAX1:%.*]] = select i1 [[CMP1]], i16 [[ELT1]], i16 [[ELT0]] +; VI-NEXT: [[CMP2:%.*]] = icmp ugt i16 [[ELT2]], [[MAX1]] +; VI-NEXT: [[MAX2:%.*]] = select i1 [[CMP2]], i16 [[ELT2]], i16 [[MAX1]] +; VI-NEXT: [[CMP3:%.*]] = icmp ugt i16 [[ELT3]], [[MAX2]] +; VI-NEXT: [[MAX3:%.*]] = select i1 [[CMP3]], i16 [[ELT3]], i16 [[MAX2]] +; VI-NEXT: ret i16 [[MAX3]] +; +entry: + %elt0 = extractelement <4 x i16> %vec4, i64 0 + %elt1 = extractelement <4 x i16> %vec4, i64 1 + %elt2 = extractelement <4 x i16> %vec4, i64 2 + %elt3 = extractelement <4 x i16> %vec4, i64 3 + + %cmp1 = icmp ugt i16 %elt1, %elt0 + %max1 = select i1 %cmp1, i16 %elt1, i16 %elt0 + %cmp2 = icmp ugt i16 %elt2, %max1 + %max2 = select i1 %cmp2, i16 %elt2, i16 %max1 + %cmp3 = icmp ugt i16 %elt3, %max2 + %max3 = select i1 %cmp3, i16 %elt3, i16 %max2 + + ret i16 %max3 +} + +define i16 @reduction_smax_v4i16(<4 x i16> %vec4) { +; GFX9-LABEL: @reduction_smax_v4i16( +; GFX9-NEXT: entry: +; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[VEC4:%.*]], <4 x i16> undef, <4 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i16> [[VEC4]], [[RDX_SHUF]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i16> [[VEC4]], <4 x i16> [[RDX_SHUF]] +; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> 
undef, <4 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> [[RDX_SHUF1]] +; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[RDX_MINMAX_SELECT3]], i32 0 +; GFX9-NEXT: ret i16 [[TMP0]] +; +; VI-LABEL: @reduction_smax_v4i16( +; VI-NEXT: entry: +; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x i16> [[VEC4:%.*]], i64 0 +; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x i16> [[VEC4]], i64 1 +; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x i16> [[VEC4]], i64 2 +; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x i16> [[VEC4]], i64 3 +; VI-NEXT: [[CMP1:%.*]] = icmp sgt i16 [[ELT1]], [[ELT0]] +; VI-NEXT: [[MAX1:%.*]] = select i1 [[CMP1]], i16 [[ELT1]], i16 [[ELT0]] +; VI-NEXT: [[CMP2:%.*]] = icmp sgt i16 [[ELT2]], [[MAX1]] +; VI-NEXT: [[MAX2:%.*]] = select i1 [[CMP2]], i16 [[ELT2]], i16 [[MAX1]] +; VI-NEXT: [[CMP3:%.*]] = icmp sgt i16 [[ELT3]], [[MAX2]] +; VI-NEXT: [[MAX3:%.*]] = select i1 [[CMP3]], i16 [[ELT3]], i16 [[MAX2]] +; VI-NEXT: ret i16 [[MAX3]] +; +entry: + %elt0 = extractelement <4 x i16> %vec4, i64 0 + %elt1 = extractelement <4 x i16> %vec4, i64 1 + %elt2 = extractelement <4 x i16> %vec4, i64 2 + %elt3 = extractelement <4 x i16> %vec4, i64 3 + + %cmp1 = icmp sgt i16 %elt1, %elt0 + %max1 = select i1 %cmp1, i16 %elt1, i16 %elt0 + %cmp2 = icmp sgt i16 %elt2, %max1 + %max2 = select i1 %cmp2, i16 %elt2, i16 %max1 + %cmp3 = icmp sgt i16 %elt3, %max2 + %max3 = select i1 %cmp3, i16 %elt3, i16 %max2 + + ret i16 %max3 +} + +define half @reduction_fmax_v4half(<4 x half> %vec4) { +; GFX9-LABEL: @reduction_fmax_v4half( +; GFX9-NEXT: entry: +; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x half> [[VEC4:%.*]], <4 x half> undef, <4 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x half> [[VEC4]], [[RDX_SHUF]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x half> 
[[VEC4]], <4 x half> [[RDX_SHUF]] +; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> undef, <4 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x half> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> [[RDX_SHUF1]] +; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x half> [[RDX_MINMAX_SELECT3]], i32 0 +; GFX9-NEXT: ret half [[TMP0]] +; +; VI-LABEL: @reduction_fmax_v4half( +; VI-NEXT: entry: +; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x half> [[VEC4:%.*]], i64 0 +; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[VEC4]], i64 1 +; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x half> [[VEC4]], i64 2 +; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x half> [[VEC4]], i64 3 +; VI-NEXT: [[CMP1:%.*]] = fcmp fast ogt half [[ELT1]], [[ELT0]] +; VI-NEXT: [[MAX1:%.*]] = select i1 [[CMP1]], half [[ELT1]], half [[ELT0]] +; VI-NEXT: [[CMP2:%.*]] = fcmp fast ogt half [[ELT2]], [[MAX1]] +; VI-NEXT: [[MAX2:%.*]] = select i1 [[CMP2]], half [[ELT2]], half [[MAX1]] +; VI-NEXT: [[CMP3:%.*]] = fcmp fast ogt half [[ELT3]], [[MAX2]] +; VI-NEXT: [[MAX3:%.*]] = select i1 [[CMP3]], half [[ELT3]], half [[MAX2]] +; VI-NEXT: ret half [[MAX3]] +; +entry: + %elt0 = extractelement <4 x half> %vec4, i64 0 + %elt1 = extractelement <4 x half> %vec4, i64 1 + %elt2 = extractelement <4 x half> %vec4, i64 2 + %elt3 = extractelement <4 x half> %vec4, i64 3 + + %cmp1 = fcmp fast ogt half %elt1, %elt0 + %max1 = select i1 %cmp1, half %elt1, half %elt0 + %cmp2 = fcmp fast ogt half %elt2, %max1 + %max2 = select i1 %cmp2, half %elt2, half %max1 + %cmp3 = fcmp fast ogt half %elt3, %max2 + %max3 = select i1 %cmp3, half %elt3, half %max2 + + ret half %max3 +} + +define half @reduction_fmin_v4half(<4 x half> %vec4) { +; GFX9-LABEL: @reduction_fmin_v4half( +; GFX9-NEXT: entry: +; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x half> [[VEC4:%.*]], <4 x 
half> undef, <4 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x half> [[VEC4]], [[RDX_SHUF]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x half> [[VEC4]], <4 x half> [[RDX_SHUF]] +; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> undef, <4 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast olt <4 x half> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> [[RDX_SHUF1]] +; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x half> [[RDX_MINMAX_SELECT3]], i32 0 +; GFX9-NEXT: ret half [[TMP0]] +; +; VI-LABEL: @reduction_fmin_v4half( +; VI-NEXT: entry: +; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x half> [[VEC4:%.*]], i64 0 +; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[VEC4]], i64 1 +; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x half> [[VEC4]], i64 2 +; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x half> [[VEC4]], i64 3 +; VI-NEXT: [[CMP1:%.*]] = fcmp fast olt half [[ELT1]], [[ELT0]] +; VI-NEXT: [[MIN1:%.*]] = select i1 [[CMP1]], half [[ELT1]], half [[ELT0]] +; VI-NEXT: [[CMP2:%.*]] = fcmp fast olt half [[ELT2]], [[MIN1]] +; VI-NEXT: [[MIN2:%.*]] = select i1 [[CMP2]], half [[ELT2]], half [[MIN1]] +; VI-NEXT: [[CMP3:%.*]] = fcmp fast olt half [[ELT3]], [[MIN2]] +; VI-NEXT: [[MIN3:%.*]] = select i1 [[CMP3]], half [[ELT3]], half [[MIN2]] +; VI-NEXT: ret half [[MIN3]] +; +entry: + %elt0 = extractelement <4 x half> %vec4, i64 0 + %elt1 = extractelement <4 x half> %vec4, i64 1 + %elt2 = extractelement <4 x half> %vec4, i64 2 + %elt3 = extractelement <4 x half> %vec4, i64 3 + + %cmp1 = fcmp fast olt half %elt1, %elt0 + %min1 = select i1 %cmp1, half %elt1, half %elt0 + %cmp2 = fcmp fast olt half %elt2, %min1 + %min2 = select i1 %cmp2, half %elt2, half %min1 + %cmp3 = fcmp fast olt half %elt3, %min2 + %min3 = select i1 %cmp3, half %elt3, half %min2 + + ret half %min3 +} 
+ ; Tests to make sure reduction does not kick in. vega does not support packed math for types larger than 16 bits. define float @reduction_v4float(<4 x float> %a) { ; GCN-LABEL: @reduction_v4float(