diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
@@ -821,4 +821,96 @@
   ret i64 %add.15
 }
+declare i32 @llvm.abs.i32(i32, i1)
+; FIXME: This horizontal reduction occurs because the cost model thinks it can
+; vectorize the loads here. However, because -riscv-v-slp-max-vf is set to 1 by
+; default, tryToVectorizeList fails and we end up with this very expensive
+; scalarized load.
+;
+; This is the code the cost model thinks it's going to generate, which you can
+; get by passing -riscv-v-slp-max-vf=0:
+;
+; define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) #0 {
+;   %p.2 = getelementptr inbounds i32, ptr %p, i64 %stride
+;   %q.2 = getelementptr inbounds i32, ptr %q, i64 %stride
+;   %p.3 = getelementptr inbounds i32, ptr %p.2, i64 1
+;   %q.3 = getelementptr inbounds i32, ptr %q.2, i64 1
+;   %1 = load <2 x i32>, ptr %p, align 4
+;   %2 = load <2 x i32>, ptr %q, align 4
+;   %x.2 = load i32, ptr %p.2, align 4
+;   %y.2 = load i32, ptr %q.2, align 4
+;   %x.3 = load i32, ptr %p.3, align 4
+;   %y.3 = load i32, ptr %q.3, align 4
+;   %3 = shufflevector <2 x i32> %1, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+;   %4 = insertelement <4 x i32> %3, i32 %x.2, i32 2
+;   %5 = insertelement <4 x i32> %4, i32 %x.3, i32 3
+;   %6 = shufflevector <2 x i32> %2, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+;   %7 = insertelement <4 x i32> %6, i32 %y.2, i32 2
+;   %8 = insertelement <4 x i32> %7, i32 %y.3, i32 3
+;   %9 = sub <4 x i32> %5, %8
+;   %10 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %9, i1 true)
+;   %11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %10)
+;   ret i32 %11
+; }
+define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) {
+; CHECK-LABEL: @stride_sum_abs_diff(
+; CHECK-NEXT:    [[P_1:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1
+; CHECK-NEXT:    [[Q_1:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i64 1
+; CHECK-NEXT:    [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[STRIDE:%.*]]
+; CHECK-NEXT:    [[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q]], i64 [[STRIDE]]
+; CHECK-NEXT:    [[P_3:%.*]] = getelementptr inbounds i32, ptr [[P_2]], i64 1
+; CHECK-NEXT:    [[Q_3:%.*]] = getelementptr inbounds i32, ptr [[Q_2]], i64 1
+; CHECK-NEXT:    [[X_0:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    [[Y_0:%.*]] = load i32, ptr [[Q]], align 4
+; CHECK-NEXT:    [[X_1:%.*]] = load i32, ptr [[P_1]], align 4
+; CHECK-NEXT:    [[Y_1:%.*]] = load i32, ptr [[Q_1]], align 4
+; CHECK-NEXT:    [[X_2:%.*]] = load i32, ptr [[P_2]], align 4
+; CHECK-NEXT:    [[Y_2:%.*]] = load i32, ptr [[Q_2]], align 4
+; CHECK-NEXT:    [[X_3:%.*]] = load i32, ptr [[P_3]], align 4
+; CHECK-NEXT:    [[Y_3:%.*]] = load i32, ptr [[Q_3]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X_0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X_1]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X_2]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X_3]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[Y_0]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[Y_1]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[Y_2]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[Y_3]], i32 3
+; CHECK-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[TMP4]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP9]], i1 true)
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
+; CHECK-NEXT:    ret i32 [[TMP11]]
+;
+  %x.0 = load i32, ptr %p
+  %y.0 = load i32, ptr %q
+  %sub.0 = sub i32 %x.0, %y.0
+  %abs.0 = tail call i32 @llvm.abs.i32(i32 %sub.0, i1 true)
+  ; Element 1: the i32 immediately after element 0 in both %p and %q.
+  %p.1 = getelementptr inbounds i32, ptr %p, i64 1
+  %x.1 = load i32, ptr %p.1
+  %q.1 = getelementptr inbounds i32, ptr %q, i64 1
+  %y.1 = load i32, ptr %q.1
+  %sub.1 = sub i32 %x.1, %y.1
+  %abs.1 = tail call i32 @llvm.abs.i32(i32 %sub.1, i1 true)
+  %sum.0 = add i32 %abs.0, %abs.1
+  ; Elements 2 and 3 are addressed from a base %stride i32 elements past %p / %q.
+  %p.2 = getelementptr inbounds i32, ptr %p, i64 %stride
+  %q.2 = getelementptr inbounds i32, ptr %q, i64 %stride
+
+  %x.2 = load i32, ptr %p.2
+  %y.2 = load i32, ptr %q.2
+  %sub.2 = sub i32 %x.2, %y.2
+  %abs.2 = tail call i32 @llvm.abs.i32(i32 %sub.2, i1 true)
+  %sum.1 = add i32 %sum.0, %abs.2
+  ; Element 3: the i32 immediately after element 2.
+  %p.3 = getelementptr inbounds i32, ptr %p.2, i64 1
+  %x.3 = load i32, ptr %p.3
+  %q.3 = getelementptr inbounds i32, ptr %q.2, i64 1
+  %y.3 = load i32, ptr %q.3
+  %sub.3 = sub i32 %x.3, %y.3
+  %abs.3 = tail call i32 @llvm.abs.i32(i32 %sub.3, i1 true)
+  %sum.2 = add i32 %sum.1, %abs.3
+
+  ret i32 %sum.2
+}