Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -172,6 +172,10 @@
                        const Function *Callee) const;
 
   unsigned getInliningThresholdMultiplier() { return 9; }
+
+  int getArithmeticReductionCost(unsigned Opcode,
+                                 Type *Ty,
+                                 bool IsPairwise);
 };
 
 } // end namespace llvm
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -468,6 +468,22 @@
   }
 }
 
+int AMDGPUTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
+                                              bool IsPairwise) {
+  EVT OrigTy = TLI->getValueType(DL, Ty);
+
+  // Compute the cost on targets that have packed math instructions (which
+  // support 16-bit types only).
+  if (IsPairwise ||
+      !ST->hasVOP3PInsts() ||
+      OrigTy.getScalarSizeInBits() != 16)
+    return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise);
+
+  // The cost is one full-rate instruction per legalized piece of the vector.
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+  return LT.first * getFullRateInstrCost();
+}
+
 int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                       unsigned Index) {
   switch (Opcode) {
Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6616,6 +6616,32 @@
     return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt);
   }
 
+  // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
+  //  =>
+  // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
+  // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
+  // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
+  if (Vec.hasOneUse()) {
+    SDLoc SL(N);
+    EVT EltVT = N->getValueType(0);
+    SDValue Idx = N->getOperand(1);
+    unsigned Opc = Vec.getOpcode();
+
+    switch (Opc) {
+    default:
+      return SDValue();
+      // TODO: Support other binary operations.
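+      // With the one-use check above, the rewrite replaces the vector BINOP
+      // outright; the tail of a shuffle-based reduction becomes a scalar add.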
+    case ISD::FADD:
+    case ISD::ADD:
+      return DAG.getNode(Opc, SL, EltVT,
+                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+                                     Vec.getOperand(0), Idx),
+                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+                                     Vec.getOperand(1), Idx));
+    }
+  }
 
   return SDValue();
 }
Index: llvm/trunk/test/CodeGen/AMDGPU/reduction.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/reduction.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/reduction.ll
@@ -0,0 +1,128 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+
+; GCN-LABEL: {{^}}reduction_half4:
+; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], [[ADD]], v{{[0-9]+}}{{$}}
+; GFX9-NEXT: v_add_f16_sdwa [[ADD]], [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+; VI: v_add_f16_e32
+; VI-NEXT: v_add_f16_e32
+; VI-NEXT: v_add_f16_e32
+define half @reduction_half4(<4 x half> %vec4) {
+entry:
+  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx = fadd fast <4 x half> %vec4, %rdx.shuf
+  %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx2 = fadd fast <4 x half> %bin.rdx, %rdx.shuf1
+  %res = extractelement <4 x half> %bin.rdx2, i32 0
+  ret half %res
+}
+
+; GCN-LABEL: {{^}}reduction_v4i16:
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[ADD]], v{{[0-9]+}}{{$}}
+; GFX9-NEXT: v_add_u16_sdwa [[ADD]], [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+; VI: v_add_u16_e32
+; VI-NEXT: v_add_u16_e32
+; VI-NEXT: v_add_u16_e32
+define i16 @reduction_v4i16(<4 x i16> %vec4) {
+entry:
+  %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx = add <4 x i16> %vec4, %rdx.shuf
+  %rdx.shuf1 = shufflevector <4 x i16> %bin.rdx, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx2 = add <4 x i16> %bin.rdx, %rdx.shuf1
+  %res = extractelement <4 x i16> %bin.rdx2, i32 0
+  ret i16 %res
+}
+
+; GCN-LABEL: {{^}}reduction_half8:
+; GFX9: v_pk_add_f16 [[ADD1:v[0-9]+]], [[ADD1]], v{{[0-9]+}}{{$}}
+; GFX9-NEXT: v_pk_add_f16 [[ADD:v[0-9]+]], [[ADD]], v{{[0-9]+}}{{$}}
+; GFX9-NEXT: v_pk_add_f16 [[ADD]], [[ADD]], [[ADD1]]{{$}}
+; GFX9-NEXT: v_add_f16_sdwa [[ADD]], [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+; VI: v_add_f16_e32
+; VI-NEXT: v_add_f16_e32
+; VI-NEXT: v_add_f16_e32
+; VI-NEXT: v_add_f16_e32
+; VI-NEXT: v_add_f16_e32
+; VI-NEXT: v_add_f16_e32
+; VI-NEXT: v_add_f16_e32
+
+define half @reduction_half8(<8 x half> %vec8) {
+entry:
+  %rdx.shuf = shufflevector <8 x half> %vec8, <8 x half> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = fadd fast <8 x half> %vec8, %rdx.shuf
+  %rdx.shuf1 = shufflevector <8 x half> %bin.rdx, <8 x half> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx2 = fadd fast <8 x half> %bin.rdx, %rdx.shuf1
+  %rdx.shuf3 = shufflevector <8 x half> %bin.rdx2, <8 x half> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx4 = fadd fast <8 x half> %bin.rdx2, %rdx.shuf3
+  %res = extractelement <8 x half> %bin.rdx4, i32 0
+  ret half %res
+}
+
+; GCN-LABEL: {{^}}reduction_v8i16:
+; GFX9: v_pk_add_u16 [[ADD1:v[0-9]+]], [[ADD1]], v{{[0-9]+}}{{$}}
+; GFX9-NEXT: v_pk_add_u16 [[ADD:v[0-9]+]], [[ADD]], v{{[0-9]+}}{{$}}
+; GFX9-NEXT: v_pk_add_u16 [[ADD]], [[ADD]], [[ADD1]]{{$}}
+; GFX9-NEXT: v_add_u16_sdwa [[ADD]], [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+; VI: v_add_u16_e32
+; VI-NEXT: v_add_u16_e32
+; VI-NEXT: v_add_u16_e32
+; VI-NEXT: v_add_u16_e32
+; VI-NEXT: v_add_u16_e32
+; VI-NEXT: v_add_u16_e32
+; VI-NEXT: v_add_u16_e32
+
+define i16 @reduction_v8i16(<8 x i16> %vec8) {
+entry:
+  %rdx.shuf = shufflevector <8 x i16> %vec8, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i16> %vec8, %rdx.shuf
+  %rdx.shuf1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx2 = add <8 x i16> %bin.rdx, %rdx.shuf1
+  %rdx.shuf3 = shufflevector <8 x i16> %bin.rdx2, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx4 = add <8 x i16> %bin.rdx2, %rdx.shuf3
+  %res = extractelement <8 x i16> %bin.rdx4, i32 0
+  ret i16 %res
+}
+
+; GCN-LABEL: {{^}}reduction_half16:
+; GFX9: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+; GFX9-NEXT: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+; GFX9-NEXT: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+; GFX9: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+; GFX9-NEXT: v_pk_add_f16 [[ADD1:v[0-9]+]], [[ADD1]], v{{[0-9]+}}{{$}}
+; GFX9-NEXT: v_pk_add_f16 [[ADD:v[0-9]+]], [[ADD]], v{{[0-9]+}}{{$}}
+; GFX9-NEXT: v_pk_add_f16 [[ADD]], [[ADD]], [[ADD1]]{{$}}
+; GFX9-NEXT: v_add_f16_sdwa [[ADD]], [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+; VI: v_add_f16_e32
+; VI-NEXT: v_add_f16_e32
+; VI-NEXT: v_add_f16_e32
+; VI-NEXT: v_add_f16_e32
+; VI-NEXT: v_add_f16_e32
+; VI-NEXT: v_add_f16_e32
+; VI-NEXT: v_add_f16_e32
+; VI-NEXT: v_add_f16_e32
+; VI-NEXT: v_add_f16_e32
+; VI-NEXT: v_add_f16_e32
+; VI-NEXT: v_add_f16_e32
+; VI-NEXT: v_add_f16_e32
+; VI-NEXT: v_add_f16_e32
+; VI-NEXT: v_add_f16_e32
+; VI-NEXT: v_add_f16_e32
+
+define half @reduction_half16(<16 x half> %vec16) {
+entry:
+  %rdx.shuf = shufflevector <16 x half> %vec16, <16 x half> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = fadd fast <16 x half> %vec16, %rdx.shuf
+  %rdx.shuf1 = shufflevector <16 x half> %bin.rdx, <16 x half> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx2 = fadd fast <16 x half> %bin.rdx, %rdx.shuf1
+  %rdx.shuf3 = shufflevector <16 x half> %bin.rdx2, <16 x half> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = fadd fast <16 x half> %bin.rdx2, %rdx.shuf3
+  %rdx.shuf5 = shufflevector <16 x half> %bin.rdx4, <16 x half> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx6 = fadd fast <16 x half> %bin.rdx4, %rdx.shuf5
+  %res = extractelement <16 x half> %bin.rdx6, i32 0
+  ret half %res
+}
\ No newline at end of file
Index: llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
===================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
+++ llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
@@ -0,0 +1,348 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI %s
+
+define half @reduction_half4(<4 x half> %a) {
+; GFX9-LABEL: @reduction_half4(
+; GFX9-NEXT:  entry:
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x half> [[A:%.*]], <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x half> [[A]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x half> [[BIN_RDX]], <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x half> [[BIN_RDX]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[TMP0:%.*]] = extractelement <4 x half> [[BIN_RDX2]], i32 0
+; GFX9-NEXT:    ret half [[TMP0]]
+;
+; VI-LABEL: @reduction_half4(
+; VI-NEXT:  entry:
+; VI-NEXT:    [[ELT0:%.*]] = extractelement <4 x half> [[A:%.*]], i64 0
+; VI-NEXT:    [[ELT1:%.*]] = extractelement <4 x half> [[A]], i64 1
+; VI-NEXT:    [[ELT2:%.*]] = extractelement <4 x half> [[A]], i64 2
+; VI-NEXT:    [[ELT3:%.*]] = extractelement <4 x half> [[A]], i64 3
+; VI-NEXT:    [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
+; VI-NEXT:    [[ADD2:%.*]] = fadd fast half [[ELT2]], [[ADD1]]
+; VI-NEXT:    [[ADD3:%.*]] = fadd fast half [[ELT3]], [[ADD2]]
+; VI-NEXT:    ret half [[ADD3]]
+;
+entry:
+  %elt0 = extractelement <4 x half> %a, i64 0
+  %elt1 = extractelement <4 x half> %a, i64 1
+  %elt2 = extractelement <4 x half> %a, i64 2
+  %elt3 = extractelement <4 x half> %a, i64 3
+
+  %add1 = fadd fast half %elt1, %elt0
+  %add2 = fadd fast half %elt2, %add1
+  %add3 = fadd fast half %elt3, %add2
+
+  ret half %add3
+}
+
+define half @reduction_half8(<8 x half> %vec8) {
+; GFX9-LABEL: @reduction_half8(
+; GFX9-NEXT:  entry:
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x half> [[VEC8:%.*]], <8 x half> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX:%.*]] = fadd fast <8 x half> [[VEC8]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x half> [[BIN_RDX]], <8 x half> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <8 x half> [[BIN_RDX]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x half> [[BIN_RDX2]], <8 x half> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x half> [[BIN_RDX2]], [[RDX_SHUF3]]
+; GFX9-NEXT:    [[TMP0:%.*]] = extractelement <8 x half> [[BIN_RDX4]], i32 0
+; GFX9-NEXT:    ret half [[TMP0]]
+;
+; VI-LABEL: @reduction_half8(
+; VI-NEXT:  entry:
+; VI-NEXT:    [[ELT0:%.*]] = extractelement <8 x half> [[VEC8:%.*]], i64 0
+; VI-NEXT:    [[ELT1:%.*]] = extractelement <8 x half> [[VEC8]], i64 1
+; VI-NEXT:    [[ELT2:%.*]] = extractelement <8 x half> [[VEC8]], i64 2
+; VI-NEXT:    [[ELT3:%.*]] = extractelement <8 x half> [[VEC8]], i64 3
+; VI-NEXT:    [[ELT4:%.*]] = extractelement <8 x half> [[VEC8]], i64 4
+; VI-NEXT:    [[ELT5:%.*]] = extractelement <8 x half> [[VEC8]], i64 5
+; VI-NEXT:    [[ELT6:%.*]] = extractelement <8 x half> [[VEC8]], i64 6
+; VI-NEXT:    [[ELT7:%.*]] = extractelement <8 x half> [[VEC8]], i64 7
+; VI-NEXT:    [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
+; VI-NEXT:    [[ADD2:%.*]] = fadd fast half [[ELT2]], [[ADD1]]
+; VI-NEXT:    [[ADD3:%.*]] = fadd fast half [[ELT3]], [[ADD2]]
+; VI-NEXT:    [[ADD4:%.*]] = fadd fast half [[ELT4]], [[ADD3]]
+; VI-NEXT:    [[ADD5:%.*]] = fadd fast half [[ELT5]], [[ADD4]]
+; VI-NEXT:    [[ADD6:%.*]] = fadd fast half [[ELT6]], [[ADD5]]
+; VI-NEXT:    [[ADD7:%.*]] = fadd fast half [[ELT7]], [[ADD6]]
+; VI-NEXT:    ret half [[ADD7]]
+;
+entry:
+  %elt0 = extractelement <8 x half> %vec8, i64 0
+  %elt1 = extractelement <8 x half> %vec8, i64 1
+  %elt2 = extractelement <8 x half> %vec8, i64 2
+  %elt3 = extractelement <8 x half> %vec8, i64 3
+  %elt4 = extractelement <8 x half> %vec8, i64 4
+  %elt5 = extractelement <8 x half> %vec8, i64 5
+  %elt6 = extractelement <8 x half> %vec8, i64 6
+  %elt7 = extractelement <8 x half> %vec8, i64 7
+
+  %add1 = fadd fast half %elt1, %elt0
+  %add2 = fadd fast half %elt2, %add1
+  %add3 = fadd fast half %elt3, %add2
+  %add4 = fadd fast half %elt4, %add3
+  %add5 = fadd fast half %elt5, %add4
+  %add6 = fadd fast half %elt6, %add5
+  %add7 = fadd fast half %elt7, %add6
+
+  ret half %add7
+}
+
+define half @reduction_half16(<16 x half> %vec16) {
+; GFX9-LABEL: @reduction_half16(
+; GFX9-NEXT:  entry:
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <16 x half> [[VEC16:%.*]], <16 x half> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX:%.*]] = fadd fast <16 x half> [[VEC16]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <16 x half> [[BIN_RDX]], <16 x half> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <16 x half> [[BIN_RDX]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <16 x half> [[BIN_RDX2]], <16 x half> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <16 x half> [[BIN_RDX2]], [[RDX_SHUF3]]
+; GFX9-NEXT:    [[RDX_SHUF5:%.*]] = shufflevector <16 x half> [[BIN_RDX4]], <16 x half> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX6:%.*]] = fadd fast <16 x half> [[BIN_RDX4]], [[RDX_SHUF5]]
+; GFX9-NEXT:    [[TMP0:%.*]] = extractelement <16 x half> [[BIN_RDX6]], i32 0
+; GFX9-NEXT:    ret half [[TMP0]]
+;
+; VI-LABEL: @reduction_half16(
+; VI-NEXT:  entry:
+; VI-NEXT:    [[ELT0:%.*]] = extractelement <16 x half> [[VEC16:%.*]], i64 0
+; VI-NEXT:    [[ELT1:%.*]] = extractelement <16 x half> [[VEC16]], i64 1
+; VI-NEXT:    [[ELT2:%.*]] = extractelement <16 x half> [[VEC16]], i64 2
+; VI-NEXT:    [[ELT3:%.*]] = extractelement <16 x half> [[VEC16]], i64 3
+; VI-NEXT:    [[ELT4:%.*]] = extractelement <16 x half> [[VEC16]], i64 4
+; VI-NEXT:    [[ELT5:%.*]] = extractelement <16 x half> [[VEC16]], i64 5
+; VI-NEXT:    [[ELT6:%.*]] = extractelement <16 x half> [[VEC16]], i64 6
+; VI-NEXT:    [[ELT7:%.*]] = extractelement <16 x half> [[VEC16]], i64 7
+; VI-NEXT:    [[ELT8:%.*]] = extractelement <16 x half> [[VEC16]], i64 8
+; VI-NEXT:    [[ELT9:%.*]] = extractelement <16 x half> [[VEC16]], i64 9
+; VI-NEXT:    [[ELT10:%.*]] = extractelement <16 x half> [[VEC16]], i64 10
+; VI-NEXT:    [[ELT11:%.*]] = extractelement <16 x half> [[VEC16]], i64 11
+; VI-NEXT:    [[ELT12:%.*]] = extractelement <16 x half> [[VEC16]], i64 12
+; VI-NEXT:    [[ELT13:%.*]] = extractelement <16 x half> [[VEC16]], i64 13
+; VI-NEXT:    [[ELT14:%.*]] = extractelement <16 x half> [[VEC16]], i64 14
+; VI-NEXT:    [[ELT15:%.*]] = extractelement <16 x half> [[VEC16]], i64 15
+; VI-NEXT:    [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
+; VI-NEXT:    [[ADD2:%.*]] = fadd fast half [[ELT2]], [[ADD1]]
+; VI-NEXT:    [[ADD3:%.*]] = fadd fast half [[ELT3]], [[ADD2]]
+; VI-NEXT:    [[ADD4:%.*]] = fadd fast half [[ELT4]], [[ADD3]]
+; VI-NEXT:    [[ADD5:%.*]] = fadd fast half [[ELT5]], [[ADD4]]
+; VI-NEXT:    [[ADD6:%.*]] = fadd fast half [[ELT6]], [[ADD5]]
+; VI-NEXT:    [[ADD7:%.*]] = fadd fast half [[ELT7]], [[ADD6]]
+; VI-NEXT:    [[ADD8:%.*]] = fadd fast half [[ELT8]], [[ADD7]]
+; VI-NEXT:    [[ADD9:%.*]] = fadd fast half [[ELT9]], [[ADD8]]
+; VI-NEXT:    [[ADD10:%.*]] = fadd fast half [[ELT10]], [[ADD9]]
+; VI-NEXT:    [[ADD11:%.*]] = fadd fast half [[ELT11]], [[ADD10]]
+; VI-NEXT:    [[ADD12:%.*]] = fadd fast half [[ELT12]], [[ADD11]]
+; VI-NEXT:    [[ADD13:%.*]] = fadd fast half [[ELT13]], [[ADD12]]
+; VI-NEXT:    [[ADD14:%.*]] = fadd fast half [[ELT14]], [[ADD13]]
+; VI-NEXT:    [[ADD15:%.*]] = fadd fast half [[ELT15]], [[ADD14]]
+; VI-NEXT:    ret half [[ADD15]]
+;
+entry:
+  %elt0 = extractelement <16 x half> %vec16, i64 0
+  %elt1 = extractelement <16 x half> %vec16, i64 1
+  %elt2 = extractelement <16 x half> %vec16, i64 2
+  %elt3 = extractelement <16 x half> %vec16, i64 3
+  %elt4 = extractelement <16 x half> %vec16, i64 4
+  %elt5 = extractelement <16 x half> %vec16, i64 5
+  %elt6 = extractelement <16 x half> %vec16, i64 6
+  %elt7 = extractelement <16 x half> %vec16, i64 7
+  %elt8 = extractelement <16 x half> %vec16, i64 8
+  %elt9 = extractelement <16 x half> %vec16, i64 9
+  %elt10 = extractelement <16 x half> %vec16, i64 10
+  %elt11 = extractelement <16 x half> %vec16, i64 11
+  %elt12 = extractelement <16 x half> %vec16, i64 12
+  %elt13 = extractelement <16 x half> %vec16, i64 13
+  %elt14 = extractelement <16 x half> %vec16, i64 14
+  %elt15 = extractelement <16 x half> %vec16, i64 15
+
+  %add1 = fadd fast half %elt1, %elt0
+  %add2 = fadd fast half %elt2, %add1
+  %add3 = fadd fast half %elt3, %add2
+  %add4 = fadd fast half %elt4, %add3
+  %add5 = fadd fast half %elt5, %add4
+  %add6 = fadd fast half %elt6, %add5
+  %add7 = fadd fast half %elt7, %add6
+  %add8 = fadd fast half %elt8, %add7
+  %add9 = fadd fast half %elt9, %add8
+  %add10 = fadd fast half %elt10, %add9
+  %add11 = fadd fast half %elt11, %add10
+  %add12 = fadd fast half %elt12, %add11
+  %add13 = fadd fast half %elt13, %add12
+  %add14 = fadd fast half %elt14, %add13
+  %add15 = fadd fast half %elt15, %add14
+
+  ret half %add15
+}
+
+; FIXME: Support vectorization.
+define half @reduction_sub_half4(<4 x half> %a) {
+; GCN-LABEL: @reduction_sub_half4(
+; GCN-NEXT:  entry:
+; GCN-NEXT:    [[ELT0:%.*]] = extractelement <4 x half> [[A:%.*]], i64 0
+; GCN-NEXT:    [[ELT1:%.*]] = extractelement <4 x half> [[A]], i64 1
+; GCN-NEXT:    [[ELT2:%.*]] = extractelement <4 x half> [[A]], i64 2
+; GCN-NEXT:    [[ELT3:%.*]] = extractelement <4 x half> [[A]], i64 3
+; GCN-NEXT:    [[ADD1:%.*]] = fsub fast half [[ELT1]], [[ELT0]]
+; GCN-NEXT:    [[ADD2:%.*]] = fsub fast half [[ELT2]], [[ADD1]]
+; GCN-NEXT:    [[ADD3:%.*]] = fsub fast half [[ELT3]], [[ADD2]]
+; GCN-NEXT:    ret half [[ADD3]]
+;
+entry:
+  %elt0 = extractelement <4 x half> %a, i64 0
+  %elt1 = extractelement <4 x half> %a, i64 1
+  %elt2 = extractelement <4 x half> %a, i64 2
+  %elt3 = extractelement <4 x half> %a, i64 3
+
+  %add1 = fsub fast half %elt1, %elt0
+  %add2 = fsub fast half %elt2, %add1
+  %add3 = fsub fast half %elt3, %add2
+
+  ret half %add3
+}
+
+define i16 @reduction_v4i16(<4 x i16> %a) {
+; GFX9-LABEL: @reduction_v4i16(
+; GFX9-NEXT:  entry:
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[A:%.*]], <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX:%.*]] = add <4 x i16> [[A]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[BIN_RDX]], <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX2:%.*]] = add <4 x i16> [[BIN_RDX]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[TMP0:%.*]] = extractelement <4 x i16> [[BIN_RDX2]], i32 0
+; GFX9-NEXT:    ret i16 [[TMP0]]
+;
+; VI-LABEL: @reduction_v4i16(
+; VI-NEXT:  entry:
+; VI-NEXT:    [[ELT0:%.*]] = extractelement <4 x i16> [[A:%.*]], i64 0
+; VI-NEXT:    [[ELT1:%.*]] = extractelement <4 x i16> [[A]], i64 1
+; VI-NEXT:    [[ELT2:%.*]] = extractelement <4 x i16> [[A]], i64 2
+; VI-NEXT:    [[ELT3:%.*]] = extractelement <4 x i16> [[A]], i64 3
+; VI-NEXT:    [[ADD1:%.*]] = add i16 [[ELT1]], [[ELT0]]
+; VI-NEXT:    [[ADD2:%.*]] = add i16 [[ELT2]], [[ADD1]]
+; VI-NEXT:    [[ADD3:%.*]] = add i16 [[ELT3]], [[ADD2]]
+; VI-NEXT:    ret i16 [[ADD3]]
+;
+entry:
+  %elt0 = extractelement <4 x i16> %a, i64 0
+  %elt1 = extractelement <4 x i16> %a, i64 1
+  %elt2 = extractelement <4 x i16> %a, i64 2
+  %elt3 = extractelement <4 x i16> %a, i64 3
+
+  %add1 = add i16 %elt1, %elt0
+  %add2 = add i16 %elt2, %add1
+  %add3 = add i16 %elt3, %add2
+
+  ret i16 %add3
+}
+
+define i16 @reduction_v8i16(<8 x i16> %vec8) {
+; GFX9-LABEL: @reduction_v8i16(
+; GFX9-NEXT:  entry:
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[VEC8:%.*]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX:%.*]] = add <8 x i16> [[VEC8]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i16> [[BIN_RDX]], <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX2:%.*]] = add <8 x i16> [[BIN_RDX]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i16> [[BIN_RDX2]], <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX4:%.*]] = add <8 x i16> [[BIN_RDX2]], [[RDX_SHUF3]]
+; GFX9-NEXT:    [[TMP0:%.*]] = extractelement <8 x i16> [[BIN_RDX4]], i32 0
+; GFX9-NEXT:    ret i16 [[TMP0]]
+;
+; VI-LABEL: @reduction_v8i16(
+; VI-NEXT:  entry:
+; VI-NEXT:    [[ELT0:%.*]] = extractelement <8 x i16> [[VEC8:%.*]], i64 0
+; VI-NEXT:    [[ELT1:%.*]] = extractelement <8 x i16> [[VEC8]], i64 1
+; VI-NEXT:    [[ELT2:%.*]] = extractelement <8 x i16> [[VEC8]], i64 2
+; VI-NEXT:    [[ELT3:%.*]] = extractelement <8 x i16> [[VEC8]], i64 3
+; VI-NEXT:    [[ELT4:%.*]] = extractelement <8 x i16> [[VEC8]], i64 4
+; VI-NEXT:    [[ELT5:%.*]] = extractelement <8 x i16> [[VEC8]], i64 5
+; VI-NEXT:    [[ELT6:%.*]] = extractelement <8 x i16> [[VEC8]], i64 6
+; VI-NEXT:    [[ELT7:%.*]] = extractelement <8 x i16> [[VEC8]], i64 7
+; VI-NEXT:    [[ADD1:%.*]] = add i16 [[ELT1]], [[ELT0]]
+; VI-NEXT:    [[ADD2:%.*]] = add i16 [[ELT2]], [[ADD1]]
+; VI-NEXT:    [[ADD3:%.*]] = add i16 [[ELT3]], [[ADD2]]
+; VI-NEXT:    [[ADD4:%.*]] = add i16 [[ELT4]], [[ADD3]]
+; VI-NEXT:    [[ADD5:%.*]] = add i16 [[ELT5]], [[ADD4]]
+; VI-NEXT:    [[ADD6:%.*]] = add i16 [[ELT6]], [[ADD5]]
+; VI-NEXT:    [[ADD7:%.*]] = add i16 [[ELT7]], [[ADD6]]
+; VI-NEXT:    ret i16 [[ADD7]]
+;
+entry:
+  %elt0 = extractelement <8 x i16> %vec8, i64 0
+  %elt1 = extractelement <8 x i16> %vec8, i64 1
+  %elt2 = extractelement <8 x i16> %vec8, i64 2
+  %elt3 = extractelement <8 x i16> %vec8, i64 3
+  %elt4 = extractelement <8 x i16> %vec8, i64 4
+  %elt5 = extractelement <8 x i16> %vec8, i64 5
+  %elt6 = extractelement <8 x i16> %vec8, i64 6
+  %elt7 = extractelement <8 x i16> %vec8, i64 7
+
+  %add1 = add i16 %elt1, %elt0
+  %add2 = add i16 %elt2, %add1
+  %add3 = add i16 %elt3, %add2
+  %add4 = add i16 %elt4, %add3
+  %add5 = add i16 %elt5, %add4
+  %add6 = add i16 %elt6, %add5
+  %add7 = add i16 %elt7, %add6
+
+  ret i16 %add7
+}
+
+; FIXME: This should be vectorized on GFX9.
+
+define i16 @reduction_icmp_v4i16(<4 x i16> %vec4) {
+; GCN-LABEL: @reduction_icmp_v4i16(
+; GCN-NEXT:  entry:
+; GCN-NEXT:    [[ELT0:%.*]] = extractelement <4 x i16> [[VEC4:%.*]], i64 0
+; GCN-NEXT:    [[ELT1:%.*]] = extractelement <4 x i16> [[VEC4]], i64 1
+; GCN-NEXT:    [[ELT2:%.*]] = extractelement <4 x i16> [[VEC4]], i64 2
+; GCN-NEXT:    [[ELT3:%.*]] = extractelement <4 x i16> [[VEC4]], i64 3
+; GCN-NEXT:    [[CMP1:%.*]] = icmp ult i16 [[ELT1]], [[ELT0]]
+; GCN-NEXT:    [[MIN1:%.*]] = select i1 [[CMP1]], i16 [[ELT1]], i16 [[ELT0]]
+; GCN-NEXT:    [[CMP2:%.*]] = icmp ult i16 [[ELT2]], [[MIN1]]
+; GCN-NEXT:    [[MIN2:%.*]] = select i1 [[CMP2]], i16 [[ELT2]], i16 [[MIN1]]
+; GCN-NEXT:    [[CMP3:%.*]] = icmp ult i16 [[ELT3]], [[MIN2]]
+; GCN-NEXT:    [[MIN3:%.*]] = select i1 [[CMP3]], i16 [[ELT3]], i16 [[MIN2]]
+; GCN-NEXT:    ret i16 [[MIN3]]
+;
+entry:
+  %elt0 = extractelement <4 x i16> %vec4, i64 0
+  %elt1 = extractelement <4 x i16> %vec4, i64 1
+  %elt2 = extractelement <4 x i16> %vec4, i64 2
+  %elt3 = extractelement <4 x i16> %vec4, i64 3
+
+  %cmp1 = icmp ult i16 %elt1, %elt0
+  %min1 = select i1 %cmp1, i16 %elt1, i16 %elt0
+  %cmp2 = icmp ult i16 %elt2, %min1
+  %min2 = select i1 %cmp2, i16 %elt2, i16 %min1
+  %cmp3 = icmp ult i16 %elt3, %min2
+  %min3 = select i1 %cmp3, i16 %elt3, i16 %min2
+
+  ret i16 %min3
+}
+
+; Test to make sure reduction does not kick in: Vega does not support packed math for types larger than 16 bits.
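+; The packed-math cost model in getArithmeticReductionCost only applies to
+; 16-bit element types on subtargets with VOP3P instructions.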
+define float @reduction_v4float(<4 x float> %a) {
+; GCN-LABEL: @reduction_v4float(
+; GCN-NEXT:  entry:
+; GCN-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; GCN-NEXT:    [[ELT1:%.*]] = extractelement <4 x float> [[A]], i64 1
+; GCN-NEXT:    [[ELT2:%.*]] = extractelement <4 x float> [[A]], i64 2
+; GCN-NEXT:    [[ELT3:%.*]] = extractelement <4 x float> [[A]], i64 3
+; GCN-NEXT:    [[ADD1:%.*]] = fadd fast float [[ELT1]], [[ELT0]]
+; GCN-NEXT:    [[ADD2:%.*]] = fadd fast float [[ELT2]], [[ADD1]]
+; GCN-NEXT:    [[ADD3:%.*]] = fadd fast float [[ELT3]], [[ADD2]]
+; GCN-NEXT:    ret float [[ADD3]]
+;
+entry:
+  %elt0 = extractelement <4 x float> %a, i64 0
+  %elt1 = extractelement <4 x float> %a, i64 1
+  %elt2 = extractelement <4 x float> %a, i64 2
+  %elt3 = extractelement <4 x float> %a, i64 3
+
+  %add1 = fadd fast float %elt1, %elt0
+  %add2 = fadd fast float %elt2, %add1
+  %add3 = fadd fast float %elt3, %add2
+
+  ret float %add3
+}
\ No newline at end of file
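
A minimal usage sketch (illustrative only, not part of the patch; it assumes a
legacy-PM pass with Function &F and LLVMContext &Ctx in scope) of how the new
hook is reached through the TargetTransformInfo facade, which is how the SLP
vectorizer costs the reductions tested above:

  // Illustrative: cost a <4 x half> fadd reduction.
  TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  VectorType *VecTy = VectorType::get(Type::getHalfTy(Ctx), 4); // <4 x half>
  // On gfx900 (VOP3P) this returns LT.first * getFullRateInstrCost(); on
  // fiji it falls back to BaseT::getArithmeticReductionCost.
  int Cost = TTI.getArithmeticReductionCost(Instruction::FAdd, VecTy,
                                            /*IsPairwiseForm=*/false);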