diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -148,6 +148,8 @@
 
   unsigned getMaxInterleaveFactor(unsigned VF);
 
+  bool prefersVectorizedAddressing() const;
+
   InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                         Align Alignment, unsigned AddressSpace,
                                         TTI::TargetCostKind CostKind);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1980,6 +1980,10 @@
   return Options;
 }
 
+bool AArch64TTIImpl::prefersVectorizedAddressing() const {
+  return ST->hasSVE();
+}
+
 InstructionCost
 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                       Align Alignment, unsigned AddressSpace,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll
@@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -mtriple=aarch64--linux-gnu -mattr=+neon -force-vector-interleave=1 -S -o - | FileCheck %s
+; RUN: opt < %s -loop-vectorize -mtriple=aarch64--linux-gnu -mattr=+sve -force-vector-interleave=1 -S -o - | FileCheck --check-prefix=SVE %s
+
+%struct.stu = type { [128 x double], [128 x double], [128 x double], [128 x double] }
+
+define dso_local double @test(double* nocapture readonly %data, i32* nocapture readonly %offset, %struct.stu* nocapture readonly %param) local_unnamed_addr {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[OFFSET:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[OFFSET]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = sext i32 [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds double, double* [[DATA:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds double, double* [[DATA]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load double, double* [[TMP8]], align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load double, double* [[TMP9]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x double> poison, double [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> [[TMP12]], double [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_STU:%.*]], %struct.stu* [[PARAM:%.*]], i64 0, i32 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds double, double* [[TMP14]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast double* [[TMP15]] to <2 x double>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP16]], align 8
+;
+; SVE-LABEL: @test(
+; SVE-NEXT:  entry:
+; SVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 128, [[TMP1]]
+; SVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SVE:       vector.ph:
+; SVE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 128, [[TMP3]]
+; SVE-NEXT:    [[N_VEC:%.*]] = sub i64 128, [[N_MOD_VF]]
+; SVE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; SVE:       vector.body:
+; SVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ]
+; SVE-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; SVE-NEXT:    [[VEC_PHI2:%.*]] = phi <vscale x 2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; SVE-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; SVE-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; SVE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[OFFSET:%.*]], i64 [[TMP4]]
+; SVE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
+; SVE-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <vscale x 2 x i32>*
+; SVE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, <vscale x 2 x i32>* [[TMP7]], align 4
+; SVE-NEXT:    [[TMP8:%.*]] = sext <vscale x 2 x i32> [[WIDE_LOAD]] to <vscale x 2 x i64>
+; SVE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds double, double* [[DATA:%.*]], <vscale x 2 x i64> [[TMP8]]
+; SVE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0f64(<vscale x 2 x double*> [[TMP9]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x double> undef)
+; SVE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_STU:%.*]], %struct.stu* [[PARAM:%.*]], i64 0, i32 0, i64 [[TMP4]]
+; SVE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
+; SVE-NEXT:    [[TMP12:%.*]] = bitcast double* [[TMP11]] to <vscale x 2 x double>*
+; SVE-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[TMP12]], align 8
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  %add.lcssa = phi double [ %add, %for.body ]
+  %add8.lcssa = phi double [ %add8, %for.body ]
+  %add12.lcssa = phi double [ %add12, %for.body ]
+  %add16.lcssa = phi double [ %add16, %for.body ]
+  %add17 = fadd fast double %add8.lcssa, %add.lcssa
+  %add18 = fadd fast double %add17, %add12.lcssa
+  %add19 = fadd fast double %add18, %add16.lcssa
+  ret double %add19
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %val4.046 = phi double [ 0.000000e+00, %entry ], [ %add16, %for.body ]
+  %val3.045 = phi double [ 0.000000e+00, %entry ], [ %add12, %for.body ]
+  %val2.044 = phi double [ 0.000000e+00, %entry ], [ %add8, %for.body ]
+  %val1.043 = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %offset, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %idxprom1 = sext i32 %0 to i64
+  %arrayidx2 = getelementptr inbounds double, double* %data, i64 %idxprom1
+  %1 = load double, double* %arrayidx2, align 8
+  %arrayidx4 = getelementptr inbounds %struct.stu, %struct.stu* %param, i64 0, i32 0, i64 %indvars.iv
+  %2 = load double, double* %arrayidx4, align 8
+  %mul = fmul fast double %2, %1
+  %add = fadd fast double %mul, %val1.043
+  %arrayidx6 = getelementptr inbounds %struct.stu, %struct.stu* %param, i64 0, i32 1, i64 %indvars.iv
+  %3 = load double, double* %arrayidx6, align 8
+  %mul7 = fmul fast double %3, %1
+  %add8 = fadd fast double %mul7, %val2.044
+  %arrayidx10 = getelementptr inbounds %struct.stu, %struct.stu* %param, i64 0, i32 2, i64 %indvars.iv
+  %4 = load double, double* %arrayidx10, align 8
+  %mul11 = fmul fast double %4, %1
+  %add12 = fadd fast double %mul11, %val3.045
+  %arrayidx14 = getelementptr inbounds %struct.stu, %struct.stu* %param, i64 0, i32 3, i64 %indvars.iv
+  %5 = load double, double* %arrayidx14, align 8
+  %mul15 = fmul fast double %5, %1
+  %add16 = fadd fast double %mul15, %val4.046
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 128
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
@@ -7,10 +7,10 @@
 %pair = type { i8, i8 }
 
 ; CHECK-LABEL: test
-; CHECK: Found an estimated cost of 17 for VF 2 For instruction: {{.*}} load i8
+; CHECK: Found an estimated cost of 14 for VF 2 For instruction: {{.*}} load i8
 ; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8
 ; CHECK: vector.body
-; CHECK: load <4 x i8>
+; CHECK: load i8
 ; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
 
 define void @test(%pair* %p, i64 %n) {