diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll @@ -0,0 +1,234 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mtriple=x86_64 -slp-vectorizer -S -mcpu=skylake-avx512 | FileCheck %s + +; This test covers a case with multiple vectorization possibilities, +; where the most effective strategy is to match all four 8-way reductions +; feeding the insertelement build-vector sequence. + +; Function Attrs: nocallback nofree nosync nounwind willreturn writeonly +declare void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double>, <4 x double*>, i32 immarg, <4 x i1>) #0 + +define void @test(double* nocapture readonly %arg, double* nocapture readonly %arg1, double* nocapture %arg2) { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[I:%.*]] = getelementptr inbounds double, double* [[ARG2:%.*]], <4 x i64> +; CHECK-NEXT: [[I3:%.*]] = getelementptr inbounds double, double* [[ARG:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double*> poison, double* [[ARG1:%.*]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double*> [[TMP0]], <4 x double*> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, <4 x double*> [[SHUFFLE]], <4 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, <4 x double*> [[SHUFFLE]], <4 x i64> +; CHECK-NEXT: [[I16:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 3 +; CHECK-NEXT: [[I34:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 5 +; CHECK-NEXT: [[I35:%.*]] = load double, double* [[I34]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr double, <4 x double*> [[SHUFFLE]], <4 x i64> +; CHECK-NEXT: [[I52:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 7 +; CHECK-NEXT: [[I53:%.*]] = load double, 
double* [[I52]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr double, <4 x double*> [[SHUFFLE]], <4 x i64> +; CHECK-NEXT: [[I70:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 9 +; CHECK-NEXT: [[I71:%.*]] = load double, double* [[I70]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr double, <4 x double*> [[SHUFFLE]], <4 x i64> +; CHECK-NEXT: [[I88:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 11 +; CHECK-NEXT: [[I89:%.*]] = load double, double* [[I88]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr double, <4 x double*> [[SHUFFLE]], <4 x i64> +; CHECK-NEXT: [[I106:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 13 +; CHECK-NEXT: [[I107:%.*]] = load double, double* [[I106]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr double, <4 x double*> [[SHUFFLE]], <4 x i64> +; CHECK-NEXT: [[I124:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 15 +; CHECK-NEXT: [[I125:%.*]] = load double, double* [[I124]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr double, <4 x double*> [[SHUFFLE]], <4 x i64> +; CHECK-NEXT: [[I4:%.*]] = load double, double* [[I3]], align 8 +; CHECK-NEXT: [[I17:%.*]] = load double, double* [[I16]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> [[TMP1]], i32 8, <4 x i1> , <4 x double> undef) +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x double> poison, double [[I17]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> [[TMP10]], double [[I4]], i32 1 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <4 x double> [[TMP9]], [[SHUFFLE1]] +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> [[TMP2]], i32 8, <4 x i1> , <4 x double> undef) +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x double> poison, double [[I4]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x 
double> [[TMP14]], double [[I17]], i32 1 +; CHECK-NEXT: [[SHUFFLE3:%.*]] = shufflevector <4 x double> [[TMP15]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = fmul fast <4 x double> [[TMP13]], [[SHUFFLE3]] +; CHECK-NEXT: [[TMP17:%.*]] = fadd fast <4 x double> [[TMP12]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> [[TMP3]], i32 8, <4 x i1> , <4 x double> undef) +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x double> poison, double [[I35]], i32 0 +; CHECK-NEXT: [[SHUFFLE5:%.*]] = shufflevector <4 x double> [[TMP19]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = fmul fast <4 x double> [[TMP18]], [[SHUFFLE5]] +; CHECK-NEXT: [[TMP21:%.*]] = fadd fast <4 x double> [[TMP17]], [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> [[TMP4]], i32 8, <4 x i1> , <4 x double> undef) +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x double> poison, double [[I53]], i32 0 +; CHECK-NEXT: [[SHUFFLE7:%.*]] = shufflevector <4 x double> [[TMP23]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = fmul fast <4 x double> [[TMP22]], [[SHUFFLE7]] +; CHECK-NEXT: [[TMP25:%.*]] = fadd fast <4 x double> [[TMP21]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> [[TMP5]], i32 8, <4 x i1> , <4 x double> undef) +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x double> poison, double [[I71]], i32 0 +; CHECK-NEXT: [[SHUFFLE9:%.*]] = shufflevector <4 x double> [[TMP27]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = fmul fast <4 x double> [[TMP26]], [[SHUFFLE9]] +; CHECK-NEXT: [[TMP29:%.*]] = fadd fast <4 x double> [[TMP25]], [[TMP28]] +; CHECK-NEXT: [[TMP30:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> [[TMP6]], i32 8, <4 x i1> , <4 x double> undef) +; CHECK-NEXT: [[TMP31:%.*]] = 
insertelement <4 x double> poison, double [[I89]], i32 0 +; CHECK-NEXT: [[SHUFFLE11:%.*]] = shufflevector <4 x double> [[TMP31]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP32:%.*]] = fmul fast <4 x double> [[TMP30]], [[SHUFFLE11]] +; CHECK-NEXT: [[TMP33:%.*]] = fadd fast <4 x double> [[TMP29]], [[TMP32]] +; CHECK-NEXT: [[TMP34:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> [[TMP7]], i32 8, <4 x i1> , <4 x double> undef) +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <4 x double> poison, double [[I107]], i32 0 +; CHECK-NEXT: [[SHUFFLE13:%.*]] = shufflevector <4 x double> [[TMP35]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP36:%.*]] = fmul fast <4 x double> [[TMP34]], [[SHUFFLE13]] +; CHECK-NEXT: [[TMP37:%.*]] = fadd fast <4 x double> [[TMP33]], [[TMP36]] +; CHECK-NEXT: [[TMP38:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> [[TMP8]], i32 8, <4 x i1> , <4 x double> undef) +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x double> poison, double [[I125]], i32 0 +; CHECK-NEXT: [[SHUFFLE15:%.*]] = shufflevector <4 x double> [[TMP39]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP40:%.*]] = fmul fast <4 x double> [[TMP38]], [[SHUFFLE15]] +; CHECK-NEXT: [[TMP41:%.*]] = fadd fast <4 x double> [[TMP37]], [[TMP40]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> [[TMP41]], <4 x double*> [[I]], i32 8, <4 x i1> ) +; CHECK-NEXT: ret void +; +entry: + %i = getelementptr inbounds double, double* %arg2, <4 x i64> + %i3 = getelementptr inbounds double, double* %arg, i64 1 + %i4 = load double, double* %i3, align 8 + %i5 = load double, double* %arg1, align 8 + %i6 = fmul fast double %i5, %i4 + %i7 = getelementptr inbounds double, double* %arg1, i64 16 + %i8 = load double, double* %i7, align 8 + %i9 = fmul fast double %i8, %i4 + %i10 = getelementptr inbounds double, double* %arg1, i64 32 + %i11 = load double, double* %i10, align 8 + %i12 = 
fmul fast double %i11, %i4 + %i13 = getelementptr inbounds double, double* %arg1, i64 48 + %i14 = load double, double* %i13, align 8 + %i15 = fmul fast double %i14, %i4 + %i16 = getelementptr inbounds double, double* %arg, i64 3 + %i17 = load double, double* %i16, align 8 + %i18 = getelementptr inbounds double, double* %arg1, i64 1 + %i19 = load double, double* %i18, align 8 + %i20 = fmul fast double %i19, %i17 + %i21 = fadd fast double %i6, %i20 + %i22 = getelementptr inbounds double, double* %arg1, i64 17 + %i23 = load double, double* %i22, align 8 + %i24 = fmul fast double %i23, %i17 + %i25 = fadd fast double %i9, %i24 + %i26 = getelementptr inbounds double, double* %arg1, i64 33 + %i27 = load double, double* %i26, align 8 + %i28 = fmul fast double %i27, %i17 + %i29 = fadd fast double %i12, %i28 + %i30 = getelementptr inbounds double, double* %arg1, i64 49 + %i31 = load double, double* %i30, align 8 + %i32 = fmul fast double %i31, %i17 + %i33 = fadd fast double %i15, %i32 + %i34 = getelementptr inbounds double, double* %arg, i64 5 + %i35 = load double, double* %i34, align 8 + %i36 = getelementptr inbounds double, double* %arg1, i64 2 + %i37 = load double, double* %i36, align 8 + %i38 = fmul fast double %i37, %i35 + %i39 = fadd fast double %i21, %i38 + %i40 = getelementptr inbounds double, double* %arg1, i64 18 + %i41 = load double, double* %i40, align 8 + %i42 = fmul fast double %i41, %i35 + %i43 = fadd fast double %i25, %i42 + %i44 = getelementptr inbounds double, double* %arg1, i64 34 + %i45 = load double, double* %i44, align 8 + %i46 = fmul fast double %i45, %i35 + %i47 = fadd fast double %i29, %i46 + %i48 = getelementptr inbounds double, double* %arg1, i64 50 + %i49 = load double, double* %i48, align 8 + %i50 = fmul fast double %i49, %i35 + %i51 = fadd fast double %i33, %i50 + %i52 = getelementptr inbounds double, double* %arg, i64 7 + %i53 = load double, double* %i52, align 8 + %i54 = getelementptr inbounds double, double* %arg1, i64 3 + %i55 = load double, 
double* %i54, align 8 + %i56 = fmul fast double %i55, %i53 + %i57 = fadd fast double %i39, %i56 + %i58 = getelementptr inbounds double, double* %arg1, i64 19 + %i59 = load double, double* %i58, align 8 + %i60 = fmul fast double %i59, %i53 + %i61 = fadd fast double %i43, %i60 + %i62 = getelementptr inbounds double, double* %arg1, i64 35 + %i63 = load double, double* %i62, align 8 + %i64 = fmul fast double %i63, %i53 + %i65 = fadd fast double %i47, %i64 + %i66 = getelementptr inbounds double, double* %arg1, i64 51 + %i67 = load double, double* %i66, align 8 + %i68 = fmul fast double %i67, %i53 + %i69 = fadd fast double %i51, %i68 + %i70 = getelementptr inbounds double, double* %arg, i64 9 + %i71 = load double, double* %i70, align 8 + %i72 = getelementptr inbounds double, double* %arg1, i64 4 + %i73 = load double, double* %i72, align 8 + %i74 = fmul fast double %i73, %i71 + %i75 = fadd fast double %i57, %i74 + %i76 = getelementptr inbounds double, double* %arg1, i64 20 + %i77 = load double, double* %i76, align 8 + %i78 = fmul fast double %i77, %i71 + %i79 = fadd fast double %i61, %i78 + %i80 = getelementptr inbounds double, double* %arg1, i64 36 + %i81 = load double, double* %i80, align 8 + %i82 = fmul fast double %i81, %i71 + %i83 = fadd fast double %i65, %i82 + %i84 = getelementptr inbounds double, double* %arg1, i64 52 + %i85 = load double, double* %i84, align 8 + %i86 = fmul fast double %i85, %i71 + %i87 = fadd fast double %i69, %i86 + %i88 = getelementptr inbounds double, double* %arg, i64 11 + %i89 = load double, double* %i88, align 8 + %i90 = getelementptr inbounds double, double* %arg1, i64 5 + %i91 = load double, double* %i90, align 8 + %i92 = fmul fast double %i91, %i89 + %i93 = fadd fast double %i75, %i92 + %i94 = getelementptr inbounds double, double* %arg1, i64 21 + %i95 = load double, double* %i94, align 8 + %i96 = fmul fast double %i95, %i89 + %i97 = fadd fast double %i79, %i96 + %i98 = getelementptr inbounds double, double* %arg1, i64 37 + %i99 = load 
double, double* %i98, align 8 + %i100 = fmul fast double %i99, %i89 + %i101 = fadd fast double %i83, %i100 + %i102 = getelementptr inbounds double, double* %arg1, i64 53 + %i103 = load double, double* %i102, align 8 + %i104 = fmul fast double %i103, %i89 + %i105 = fadd fast double %i87, %i104 + %i106 = getelementptr inbounds double, double* %arg, i64 13 + %i107 = load double, double* %i106, align 8 + %i108 = getelementptr inbounds double, double* %arg1, i64 6 + %i109 = load double, double* %i108, align 8 + %i110 = fmul fast double %i109, %i107 + %i111 = fadd fast double %i93, %i110 + %i112 = getelementptr inbounds double, double* %arg1, i64 22 + %i113 = load double, double* %i112, align 8 + %i114 = fmul fast double %i113, %i107 + %i115 = fadd fast double %i97, %i114 + %i116 = getelementptr inbounds double, double* %arg1, i64 38 + %i117 = load double, double* %i116, align 8 + %i118 = fmul fast double %i117, %i107 + %i119 = fadd fast double %i101, %i118 + %i120 = getelementptr inbounds double, double* %arg1, i64 54 + %i121 = load double, double* %i120, align 8 + %i122 = fmul fast double %i121, %i107 + %i123 = fadd fast double %i105, %i122 + %i124 = getelementptr inbounds double, double* %arg, i64 15 + %i125 = load double, double* %i124, align 8 + %i126 = getelementptr inbounds double, double* %arg1, i64 7 + %i127 = load double, double* %i126, align 8 + %i128 = fmul fast double %i127, %i125 + %rdx1 = fadd fast double %i111, %i128 + %i130 = getelementptr inbounds double, double* %arg1, i64 23 + %i131 = load double, double* %i130, align 8 + %i132 = fmul fast double %i131, %i125 + %rdx2 = fadd fast double %i115, %i132 + %i134 = getelementptr inbounds double, double* %arg1, i64 39 + %i135 = load double, double* %i134, align 8 + %i136 = fmul fast double %i135, %i125 + %rdx3 = fadd fast double %i119, %i136 + %i138 = getelementptr inbounds double, double* %arg1, i64 55 + %i139 = load double, double* %i138, align 8 + %i140 = fmul fast double %i139, %i125 + %rdx4 = fadd fast 
double %i123, %i140 + %i142 = insertelement <4 x double> poison, double %rdx1, i64 0 + %i143 = insertelement <4 x double> %i142, double %rdx2, i64 1 + %i144 = insertelement <4 x double> %i143, double %rdx3, i64 2 + %i145 = insertelement <4 x double> %i144, double %rdx4, i64 3 + call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %i145, <4 x double*> %i, i32 8, <4 x i1> ) + ret void +} + +attributes #0 = { nocallback nofree nosync nounwind willreturn writeonly }