diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -292,6 +292,19 @@ // TODO: Most of these cases will return getInvalid in generic code, and // must be implemented here. break; + case TTI::SK_InsertSubvector: + // Example sequence: + // vsetivli zero, 4, e8, mf2, tu, ma (ignored) + // vslideup.vi v8, v9, 2 + return LT.first * getLMULCost(LT.second); + case TTI::SK_Select:{ + // Example sequence: + // li a0, 90 + // vsetivli zero, 8, e8, mf2, ta, ma (ignored) + // vmv.s.x v0, a0 + // vmerge.vvm v8, v9, v8, v0 + return LT.first * 3 * getLMULCost(LT.second); + } case TTI::SK_Broadcast: { bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) == Instruction::InsertElement); diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll --- a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll @@ -37,7 +37,7 @@ define void @vector_insert_extract( %v0, %v1, <16 x i32> %v2) { ; CHECK-LABEL: 'vector_insert_extract' ; CHECK-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( %v0, i64 0) -; CHECK-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %insert_fixed_into_scalable = call @llvm.vector.insert.nxv4i32.v16i32( %v0, <16 x i32> %v2, i64 0) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %insert_fixed_into_scalable = call @llvm.vector.insert.nxv4i32.v16i32( %v0, <16 x i32> %v2, i64 0) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %extract_scalable_from_scalable = call @llvm.vector.extract.nxv4i32.nxv16i32( %v1, i64 0) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%insert_scalable_into_scalable = call @llvm.vector.insert.nxv16i32.nxv4i32( %v1, %v0, i64 0) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-insert.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-insert.ll --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-insert.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-insert.ll @@ -4,7 +4,7 @@ define <8 x i8> @insert_subvector_middle_v8i8(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: 'insert_subvector_middle_v8i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %res ; %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> @@ -13,7 +13,7 @@ define <8 x i8> @insert_subvector_end_v8i8(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: 'insert_subvector_end_v8i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %res ; %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> @@ -22,7 +22,7 @@ define <8 x i8> @insert_subvector_end_swapped_v8i8(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: 'insert_subvector_end_swapped_v8i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %res 
; %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> @@ -31,7 +31,7 @@ define <8 x i8> @insert_subvector_short_v8i8(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: 'insert_subvector_short_v8i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %res ; %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> @@ -40,7 +40,7 @@ define <8 x i8> @insert_subvector_offset_1_v8i8(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: 'insert_subvector_offset_1_v8i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %res ; %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> @@ -48,15 +48,34 @@ } define <8 x i64> @insert_subvector_offset_1_v8i64(<8 x i64> %v, <8 x i64> %w) { -; RV32-LABEL: 'insert_subvector_offset_1_v8i64' -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res -; -; RV64-LABEL: 'insert_subvector_offset_1_v8i64' -; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res +; CHECK-LABEL: 'insert_subvector_offset_1_v8i64' +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res ; %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> ret <8 x i64> %res } +; FIXME: This is expensive and involves vrgathers and vslideups +define <12 x i8> @insert_subvector_concat_v6i8(<6 x i8> %x, <6 x i8> %y) { +; CHECK-LABEL: 'insert_subvector_concat_v6i8' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = shufflevector <6 x i8> %x, <6 x i8> %y, <12 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <12 x i8> %a +; + %a = shufflevector <6 x i8> %x, <6 x i8> %y, <12 x i32> + ret <12 x i8> %a +} + +; FIXME: This is a concat that is emitted as one vslideup +define <8 x i8> @insert_subvector_concat_v8i8(<4 x i8> %x, <4 x i8> %y) { +; CHECK-LABEL: 'insert_subvector_concat_v8i8' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %a +; + %a = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32> + ret <8 x i8> %a +} + +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}} diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-interleave.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-interleave.ll --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-interleave.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-interleave.ll @@ -5,7 +5,7 @@ ; The mask here interleaves (%v1, %v0), not (%v0, %v1): it should still be cheap. 
define <4 x i8> @interleave2_v2i8(<2 x i8> %v0, <2 x i8> %v1) { ; CHECK-LABEL: 'interleave2_v2i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %concat = shufflevector <2 x i8> %v0, <2 x i8> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %concat = shufflevector <2 x i8> %v0, <2 x i8> %v1, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = shufflevector <4 x i8> %concat, <4 x i8> poison, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i8> %res ; @@ -16,7 +16,7 @@ define <8 x i8> @interleave2_v8i8(<4 x i8> %v0, <4 x i8> %v1) { ; CHECK-LABEL: 'interleave2_v8i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %concat = shufflevector <4 x i8> %v0, <4 x i8> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %concat = shufflevector <4 x i8> %v0, <4 x i8> %v1, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = shufflevector <8 x i8> %concat, <8 x i8> poison, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %res ; @@ -27,7 +27,7 @@ define <8 x i32> @interleave2_v8i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: 'interleave2_v8i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %concat = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %concat = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res = shufflevector <8 x i32> %concat, <8 x i32> poison, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %res ; @@ -39,12 +39,12 @@ ; Should be expensive on RV32 because it can't widen define <8 x i64> @interleave2_v8i64(<4 x i64> %v0, <4 x i64> %v1) { ; RV32-LABEL: 
'interleave2_v8i64' -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %concat = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %concat = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %res = shufflevector <8 x i64> %concat, <8 x i64> poison, <8 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res ; ; RV64-LABEL: 'interleave2_v8i64' -; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %concat = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %concat = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %res = shufflevector <8 x i64> %concat, <8 x i64> poison, <8 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res ; diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-select.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-select.ll --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-select.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-select.ll @@ -4,7 +4,7 @@ define <8 x i8> @select_start_v8i8(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: 'select_start_v8i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %res ; %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> @@ -13,7 +13,7 @@ define <8 x i8> @select_non_contiguous_v8i8(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: 'select_non_contiguous_v8i8' -; CHECK-NEXT: Cost Model: Found an 
estimated cost of 30 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %res ; %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> @@ -21,27 +21,22 @@ } define <8 x i64> @select_start_v8i64(<8 x i64> %v, <8 x i64> %w) { -; RV32-LABEL: 'select_start_v8i64' -; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res -; -; RV64-LABEL: 'select_start_v8i64' -; RV64-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res +; CHECK-LABEL: 'select_start_v8i64' +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res ; %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> ret <8 x i64> %res } define <8 x i64> @select_non_contiguous_v8i64(<8 x i64> %v, <8 x i64> %w) { -; RV32-LABEL: 'select_non_contiguous_v8i64' -; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res -; -; RV64-LABEL: 'select_non_contiguous_v8i64' -; RV64-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res +; CHECK-LABEL: 'select_non_contiguous_v8i64' +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res ; %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> ret <8 x i64> %res } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll --- a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll @@ -199,13 +199,11 @@ ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]]) ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]]) -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]]) -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[VECINS_31]] ; ; DEFAULT-LABEL: define <4 x float> @exp_4x ; 
DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { @@ -256,13 +254,11 @@ ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]]) ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]]) -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]]) -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[VECINS_31]] ; ; DEFAULT-LABEL: define <4 x float> @int_exp_4x ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { @@ -313,13 +309,11 @@ ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]]) ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]]) -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 
3 -; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]]) -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[VECINS_31]] ; ; DEFAULT-LABEL: define <4 x float> @log_4x ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { @@ -370,13 +364,11 @@ ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]]) ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]]) -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]]) -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; CHECK-NEXT: ret <4 x float> 
[[VECINS_31]] ; ; DEFAULT-LABEL: define <4 x float> @int_log_4x ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { @@ -427,13 +419,11 @@ ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]]) ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]]) -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]]) -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[VECINS_31]] ; ; DEFAULT-LABEL: define <4 x float> @sin_4x ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { @@ -484,13 +474,11 @@ ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]]) -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; CHECK-NEXT: 
[[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[VECINS_31]] ; ; DEFAULT-LABEL: define <4 x float> @int_sin_4x ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll @@ -826,58 +826,24 @@ declare i32 @llvm.abs.i32(i32, i1) -; FIXME: This horizontal reduction occurs because the cost model thinks it can -; vectorize the loads here. However, because -riscv-v-slp-max-vf is set to 1 by -; default, tryToVectorizeList fails and we end up with this very expensive -; scalarized load. 
-; -; This is the code the cost model thinks it's going to generate, which you can -; get by passing -riscv-v-slp-max-vf=0 -; -; define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) #0 { -; %p.2 = getelementptr inbounds i32, ptr %p, i64 %stride -; %q.2 = getelementptr inbounds i32, ptr %q, i64 %stride -; %p.3 = getelementptr inbounds i32, ptr %p.2, i64 1 -; %q.3 = getelementptr inbounds i32, ptr %q.2, i64 1 -; %1 = load <2 x i32>, ptr %p, align 4 -; %2 = load <2 x i32>, ptr %q, align 4 -; %x.2 = load i32, ptr %p.2, align 4 -; %y.2 = load i32, ptr %q.2, align 4 -; %x.3 = load i32, ptr %p.3, align 4 -; %y.3 = load i32, ptr %q.3, align 4 -; %3 = shufflevector <2 x i32> %1, <2 x i32> poison, <4 x i32> -; %4 = insertelement <4 x i32> %3, i32 %x.2, i32 2 -; %5 = insertelement <4 x i32> %4, i32 %x.3, i32 3 -; %6 = shufflevector <2 x i32> %2, <2 x i32> poison, <4 x i32> -; %7 = insertelement <4 x i32> %6, i32 %y.2, i32 2 -; %8 = insertelement <4 x i32> %7, i32 %y.3, i32 3 -; %9 = sub <4 x i32> %5, %8 -; %10 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %9, i1 true) -; %11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %10) -; ret i32 %11 -; } define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) { ; CHECK-LABEL: @stride_sum_abs_diff( ; CHECK-NEXT: [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 [[STRIDE:%.*]] ; CHECK-NEXT: [[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i64 [[STRIDE]] -; CHECK-NEXT: [[P_3:%.*]] = getelementptr inbounds i32, ptr [[P_2]], i64 1 -; CHECK-NEXT: [[Q_3:%.*]] = getelementptr inbounds i32, ptr [[Q_2]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[Q]], align 4 -; CHECK-NEXT: [[X_2:%.*]] = load i32, ptr [[P_2]], align 4 -; CHECK-NEXT: [[Y_2:%.*]] = load i32, ptr [[Q_2]], align 4 -; CHECK-NEXT: [[X_3:%.*]] = load i32, ptr [[P_3]], align 4 -; CHECK-NEXT: [[Y_3:%.*]] = load i32, ptr [[Q_3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = 
shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X_2]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[X_3]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[Y_2]], i32 2 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[Y_3]], i32 3 -; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP5]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP9]], i1 true) -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]]) -; CHECK-NEXT: ret i32 [[TMP11]] +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[P_2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[Q_2]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP11]], i1 true) +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]]) +; CHECK-NEXT: ret i32 [[TMP13]] ; %x.0 = load i32, ptr %p %y.0 = load i32, ptr %q @@ -912,41 +878,17 @@ ret i32 %sum.2 } -; FIXME: This could be horizontally reduced, as it is functionally equivalent to -; @reduce_sum_2arrays_b define i32 @reduce_sum_2arrays_a(ptr noalias %p, ptr noalias %q) { 
; CHECK-LABEL: @reduce_sum_2arrays_a( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X_0:%.*]] = load i8, ptr [[P:%.*]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[X_0]] to i32 -; CHECK-NEXT: [[Y_0:%.*]] = load i8, ptr [[Q:%.*]], align 1 -; CHECK-NEXT: [[CONV3:%.*]] = zext i8 [[Y_0]] to i32 -; CHECK-NEXT: [[ADD4:%.*]] = add nuw nsw i32 [[CONV]], [[CONV3]] -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 1 -; CHECK-NEXT: [[X_1:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1 -; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[X_1]] to i32 -; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 1 -; CHECK-NEXT: [[Y_1:%.*]] = load i8, ptr [[ARRAYIDX2_1]], align 1 -; CHECK-NEXT: [[CONV3_1:%.*]] = zext i8 [[Y_1]] to i32 -; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[ADD4]], [[CONV_1]] -; CHECK-NEXT: [[ADD4_1:%.*]] = add nuw nsw i32 [[ADD_1]], [[CONV3_1]] -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 2 -; CHECK-NEXT: [[X_2:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1 -; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[X_2]] to i32 -; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 2 -; CHECK-NEXT: [[Y_2:%.*]] = load i8, ptr [[ARRAYIDX2_2]], align 1 -; CHECK-NEXT: [[CONV3_2:%.*]] = zext i8 [[Y_2]] to i32 -; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i32 [[ADD4_1]], [[CONV_2]] -; CHECK-NEXT: [[ADD4_2:%.*]] = add nuw nsw i32 [[ADD_2]], [[CONV3_2]] -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 3 -; CHECK-NEXT: [[X_3:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1 -; CHECK-NEXT: [[CONV_3:%.*]] = zext i8 [[X_3]] to i32 -; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 3 -; CHECK-NEXT: [[Y_3:%.*]] = load i8, ptr [[ARRAYIDX2_3]], align 1 -; CHECK-NEXT: [[CONV3_3:%.*]] = zext i8 [[Y_3]] to i32 -; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i32 [[ADD4_2]], [[CONV_3]] -; CHECK-NEXT: [[ADD4_3:%.*]] = add nuw nsw i32 [[ADD_3]], [[CONV3_3]] -; 
CHECK-NEXT: ret i32 [[ADD4_3]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[Q:%.*]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) +; CHECK-NEXT: ret i32 [[TMP6]] ; entry: %x.0 = load i8, ptr %p, align 1 @@ -989,13 +931,13 @@ ; CHECK-LABEL: @reduce_sum_2arrays_b( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[X:%.*]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[Y:%.*]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]]) -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP4]], [[TMP5]] -; CHECK-NEXT: ret i32 [[OP_RDX]] +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[Y:%.*]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) +; CHECK-NEXT: ret i32 [[TMP6]] ; entry: %0 = load i8, ptr %x, align 1