diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -292,6 +292,18 @@ // TODO: Most of these cases will return getInvalid in generic code, and // must be implemented here. break; + case TTI::SK_Select: + case TTI::SK_InsertSubvector: { + // Example sequence: + // vsetivli zero, 8, e8, mf2, ta, ma (ignored) + // li a0, 225 + // vmv.s.x v0, a0 + // vslideup.vi v10, v9, 1 (only needed if index != 0) + // vmerge.vvm v8, v10, v8, v0 + // ret + int SlideCost = (Kind == TTI::SK_InsertSubvector && Index == 0) ? 0 : 1; + return LT.first * (3 + SlideCost) * getLMULCost(LT.second); + } case TTI::SK_Broadcast: { bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) == Instruction::InsertElement); diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll --- a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll @@ -37,7 +37,7 @@ define void @vector_insert_extract( %v0, %v1, <16 x i32> %v2) { ; CHECK-LABEL: 'vector_insert_extract' ; CHECK-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( %v0, i64 0) -; CHECK-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %insert_fixed_into_scalable = call @llvm.vector.insert.nxv4i32.v16i32( %v0, <16 x i32> %v2, i64 0) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %insert_fixed_into_scalable = call @llvm.vector.insert.nxv4i32.v16i32( %v0, <16 x i32> %v2, i64 0) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %extract_scalable_from_scalable = call @llvm.vector.extract.nxv4i32.nxv16i32( %v1, i64 0) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%insert_scalable_into_scalable = call @llvm.vector.insert.nxv16i32.nxv4i32( %v1, %v0, i64 0) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-insert.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-insert.ll --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-insert.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-insert.ll @@ -4,7 +4,7 @@ define <8 x i8> @insert_subvector_middle_v8i8(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: 'insert_subvector_middle_v8i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %res ; %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> @@ -13,7 +13,7 @@ define <8 x i8> @insert_subvector_end_v8i8(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: 'insert_subvector_end_v8i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %res ; %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> @@ -22,7 +22,7 @@ define <8 x i8> @insert_subvector_end_swapped_v8i8(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: 'insert_subvector_end_swapped_v8i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %res 
; %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> @@ -31,7 +31,7 @@ define <8 x i8> @insert_subvector_short_v8i8(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: 'insert_subvector_short_v8i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %res ; %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> @@ -40,7 +40,7 @@ define <8 x i8> @insert_subvector_offset_1_v8i8(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: 'insert_subvector_offset_1_v8i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %res ; %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> @@ -48,15 +48,34 @@ } define <8 x i64> @insert_subvector_offset_1_v8i64(<8 x i64> %v, <8 x i64> %w) { -; RV32-LABEL: 'insert_subvector_offset_1_v8i64' -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res -; -; RV64-LABEL: 'insert_subvector_offset_1_v8i64' -; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res +; CHECK-LABEL: 'insert_subvector_offset_1_v8i64' +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res ; %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> ret <8 x i64> %res } +; FIXME: This is expensive and involves vrgathers and vslideups +define <12 x i8> @insert_subvector_concat_v6i8(<6 x i8> %x, <6 x i8> %y) { +; CHECK-LABEL: 'insert_subvector_concat_v6i8' +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a = shufflevector <6 x i8> %x, <6 x i8> %y, <12 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <12 x i8> %a +; + %a = shufflevector <6 x i8> %x, <6 x i8> %y, <12 x i32> + ret <12 x i8> %a +} + +; FIXME: This is a concat that is emitted as one vslideup +define <8 x i8> @insert_subvector_concat_v8i8(<4 x i8> %x, <4 x i8> %y) { +; CHECK-LABEL: 'insert_subvector_concat_v8i8' +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %a +; + %a = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32> + ret <8 x i8> %a +} + +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}} diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-interleave.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-interleave.ll --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-interleave.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-interleave.ll @@ -5,7 +5,7 @@ ; The mask here interleaves (%v1, %v0), not (%v0, %v1): it should still be cheap. 
define <4 x i8> @interleave2_v2i8(<2 x i8> %v0, <2 x i8> %v1) { ; CHECK-LABEL: 'interleave2_v2i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %concat = shufflevector <2 x i8> %v0, <2 x i8> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %concat = shufflevector <2 x i8> %v0, <2 x i8> %v1, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = shufflevector <4 x i8> %concat, <4 x i8> poison, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i8> %res ; @@ -16,7 +16,7 @@ define <8 x i8> @interleave2_v8i8(<4 x i8> %v0, <4 x i8> %v1) { ; CHECK-LABEL: 'interleave2_v8i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %concat = shufflevector <4 x i8> %v0, <4 x i8> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %concat = shufflevector <4 x i8> %v0, <4 x i8> %v1, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = shufflevector <8 x i8> %concat, <8 x i8> poison, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %res ; @@ -27,7 +27,7 @@ define <8 x i32> @interleave2_v8i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: 'interleave2_v8i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %concat = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %concat = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res = shufflevector <8 x i32> %concat, <8 x i32> poison, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %res ; @@ -39,12 +39,12 @@ ; Should be expensive on RV32 because it can't widen define <8 x i64> @interleave2_v8i64(<4 x i64> %v0, <4 x i64> %v1) { ; RV32-LABEL: 
'interleave2_v8i64' -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %concat = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %concat = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %res = shufflevector <8 x i64> %concat, <8 x i64> poison, <8 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res ; ; RV64-LABEL: 'interleave2_v8i64' -; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %concat = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %concat = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %res = shufflevector <8 x i64> %concat, <8 x i64> poison, <8 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res ; diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-select.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-select.ll --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-select.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-select.ll @@ -4,7 +4,7 @@ define <8 x i8> @select_start_v8i8(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: 'select_start_v8i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %res ; %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> @@ -13,7 +13,7 @@ define <8 x i8> @select_non_contiguous_v8i8(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: 'select_non_contiguous_v8i8' -; CHECK-NEXT: Cost Model: Found an 
estimated cost of 30 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %res ; %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> @@ -21,27 +21,22 @@ } define <8 x i64> @select_start_v8i64(<8 x i64> %v, <8 x i64> %w) { -; RV32-LABEL: 'select_start_v8i64' -; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res -; -; RV64-LABEL: 'select_start_v8i64' -; RV64-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res +; CHECK-LABEL: 'select_start_v8i64' +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res ; %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> ret <8 x i64> %res } define <8 x i64> @select_non_contiguous_v8i64(<8 x i64> %v, <8 x i64> %w) { -; RV32-LABEL: 'select_non_contiguous_v8i64' -; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res -; -; RV64-LABEL: 'select_non_contiguous_v8i64' -; RV64-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res +; CHECK-LABEL: 'select_non_contiguous_v8i64' +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res ; %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> ret <8 x i64> %res } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll @@ -882,24 +882,20 @@ ; CHECK-LABEL: @stride_sum_abs_diff( ; CHECK-NEXT: [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 [[STRIDE:%.*]] ; CHECK-NEXT: [[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i64 [[STRIDE]] -; CHECK-NEXT: [[P_3:%.*]] = getelementptr inbounds i32, ptr [[P_2]], i64 1 -; CHECK-NEXT: [[Q_3:%.*]] = getelementptr inbounds i32, ptr [[Q_2]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[Q]], align 4 -; CHECK-NEXT: [[X_2:%.*]] = load i32, ptr [[P_2]], align 4 -; CHECK-NEXT: [[Y_2:%.*]] = load i32, ptr [[Q_2]], align 4 -; CHECK-NEXT: [[X_3:%.*]] = load i32, ptr [[P_3]], align 4 -; CHECK-NEXT: [[Y_3:%.*]] = load i32, ptr [[Q_3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X_2]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[X_3]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[Y_2]], i32 2 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[Y_3]], i32 3 -; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> 
[[TMP5]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP9]], i1 true) -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]]) -; CHECK-NEXT: ret i32 [[TMP11]] +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[P_2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[Q_2]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP11]], i1 true) +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]]) +; CHECK-NEXT: ret i32 [[TMP13]] ; %x.0 = load i32, ptr %p %y.0 = load i32, ptr %q @@ -939,36 +935,14 @@ define i32 @reduce_sum_2arrays_a(ptr noalias %p, ptr noalias %q) { ; CHECK-LABEL: @reduce_sum_2arrays_a( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X_0:%.*]] = load i8, ptr [[P:%.*]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[X_0]] to i32 -; CHECK-NEXT: [[Y_0:%.*]] = load i8, ptr [[Q:%.*]], align 1 -; CHECK-NEXT: [[CONV3:%.*]] = zext i8 [[Y_0]] to i32 -; CHECK-NEXT: [[ADD4:%.*]] = add nuw nsw i32 [[CONV]], [[CONV3]] -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 1 -; CHECK-NEXT: [[X_1:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1 -; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[X_1]] to i32 -; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 1 -; CHECK-NEXT: 
[[Y_1:%.*]] = load i8, ptr [[ARRAYIDX2_1]], align 1 -; CHECK-NEXT: [[CONV3_1:%.*]] = zext i8 [[Y_1]] to i32 -; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[ADD4]], [[CONV_1]] -; CHECK-NEXT: [[ADD4_1:%.*]] = add nuw nsw i32 [[ADD_1]], [[CONV3_1]] -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 2 -; CHECK-NEXT: [[X_2:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1 -; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[X_2]] to i32 -; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 2 -; CHECK-NEXT: [[Y_2:%.*]] = load i8, ptr [[ARRAYIDX2_2]], align 1 -; CHECK-NEXT: [[CONV3_2:%.*]] = zext i8 [[Y_2]] to i32 -; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i32 [[ADD4_1]], [[CONV_2]] -; CHECK-NEXT: [[ADD4_2:%.*]] = add nuw nsw i32 [[ADD_2]], [[CONV3_2]] -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 3 -; CHECK-NEXT: [[X_3:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1 -; CHECK-NEXT: [[CONV_3:%.*]] = zext i8 [[X_3]] to i32 -; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 3 -; CHECK-NEXT: [[Y_3:%.*]] = load i8, ptr [[ARRAYIDX2_3]], align 1 -; CHECK-NEXT: [[CONV3_3:%.*]] = zext i8 [[Y_3]] to i32 -; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i32 [[ADD4_2]], [[CONV_3]] -; CHECK-NEXT: [[ADD4_3:%.*]] = add nuw nsw i32 [[ADD_3]], [[CONV3_3]] -; CHECK-NEXT: ret i32 [[ADD4_3]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[Q:%.*]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) +; CHECK-NEXT: ret i32 [[TMP6]] ; entry: %x.0 = load i8, ptr 
%p, align 1 @@ -1011,13 +985,13 @@ ; CHECK-LABEL: @reduce_sum_2arrays_b( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[X:%.*]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[Y:%.*]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]]) -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP4]], [[TMP5]] -; CHECK-NEXT: ret i32 [[OP_RDX]] +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[Y:%.*]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) +; CHECK-NEXT: ret i32 [[TMP6]] ; entry: %0 = load i8, ptr %x, align 1