diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -121,6 +121,7 @@
   bool IsLittle;
 
+  bool StreamingModeDisabled;
   unsigned MinSVEVectorSizeInBits;
   unsigned MaxSVEVectorSizeInBits;
 
   unsigned VScaleForTuning = 2;
@@ -158,7 +159,8 @@
                   const std::string &TuneCPU, const std::string &FS,
                   const TargetMachine &TM, bool LittleEndian,
                   unsigned MinSVEVectorSizeInBitsOverride = 0,
-                  unsigned MaxSVEVectorSizeInBitsOverride = 0);
+                  unsigned MaxSVEVectorSizeInBitsOverride = 0,
+                  bool StreamingModeDisabled = true);
 
   // Getters for SubtargetFeatures defined in tablegen
 #define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
@@ -198,6 +200,9 @@
   bool isXRaySupported() const override { return true; }
 
   unsigned getMinVectorRegisterBitWidth() const {
+    // Don't assume any minimum vector size when PSTATE.SM may not be 0.
+    if (!isStreamingModeDisabled())
+      return 0;
     return MinVectorRegisterBitWidth;
   }
 
@@ -385,6 +390,7 @@
     return "__security_check_cookie";
   }
 
+  bool isStreamingModeDisabled() const { return StreamingModeDisabled; }
 };
 
 } // End llvm namespace
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -274,12 +274,14 @@
                                    const std::string &FS,
                                    const TargetMachine &TM, bool LittleEndian,
                                    unsigned MinSVEVectorSizeInBitsOverride,
-                                   unsigned MaxSVEVectorSizeInBitsOverride)
+                                   unsigned MaxSVEVectorSizeInBitsOverride,
+                                   bool StreamingModeDisabled)
     : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
       ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
       ReserveXRegisterForRA(AArch64::GPR64commonRegClass.getNumRegs()),
       CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
       IsLittle(LittleEndian),
+      StreamingModeDisabled(StreamingModeDisabled),
       MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
       MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
       InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)),
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -388,6 +388,11 @@
   SmallString<512> Key;
 
+  // The function is in streaming(-compatible) mode if any of the SME PSTATE
+  // attributes is present; the subtarget must then be conservative about
+  // vector widths, so the flag becomes part of the per-function subtarget.
+  bool StreamingModeDisabled =
+      !F.hasFnAttribute("aarch64_pstate_sm_enabled") &&
+      !F.hasFnAttribute("aarch64_pstate_sm_compatible") &&
+      !F.hasFnAttribute("aarch64_pstate_sm_body");
+
   unsigned MinSVEVectorSize = 0;
   unsigned MaxSVEVectorSize = 0;
   Attribute VScaleRangeAttr = F.getFnAttribute(Attribute::VScaleRange);
@@ -420,6 +425,7 @@
   Key += "SVEMin";
   Key += std::to_string(MinSVEVectorSize);
   Key += "SVEMax";
   Key += std::to_string(MaxSVEVectorSize);
+  Key += "StreamingModeDisabled=" + std::to_string(StreamingModeDisabled);
   Key += CPU;
   Key += TuneCPU;
@@ -431,9 +437,9 @@
     // creation will depend on the TM and the code generation flags on the
     // function that reside in TargetOptions.
     resetTargetOptions(F);
-    I = std::make_unique<AArch64Subtarget>(TargetTriple, CPU, TuneCPU, FS,
-                                           *this, isLittle, MinSVEVectorSize,
-                                           MaxSVEVectorSize);
+    I = std::make_unique<AArch64Subtarget>(
+        TargetTriple, CPU, TuneCPU, FS, *this, isLittle, MinSVEVectorSize,
+        MaxSVEVectorSize, StreamingModeDisabled);
   }
   return I.get();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -120,20 +120,7 @@
       std::function<void(Instruction *, unsigned, APInt, APInt &)>
           SimplifyAndSetOp) const;
 
-  TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
-    switch (K) {
-    case TargetTransformInfo::RGK_Scalar:
-      return TypeSize::getFixed(64);
-    case TargetTransformInfo::RGK_FixedWidthVector:
-      if (ST->hasSVE())
-        return TypeSize::getFixed(
-            std::max(ST->getMinSVEVectorSizeInBits(), 128u));
-      return TypeSize::getFixed(ST->hasNEON() ? 128 : 0);
-    case TargetTransformInfo::RGK_ScalableVector:
-      return TypeSize::getScalable(ST->hasSVE() ? 128 : 0);
-    }
-    llvm_unreachable("Unsupported register kind");
-  }
+  TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
 
   unsigned getMinVectorRegisterBitWidth() const {
     return ST->getMinVectorRegisterBitWidth();
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -106,6 +106,18 @@
                                           "recurrences"),
                  cl::location(TailFoldingKindLoc));
 
+// Experimental option that will only be fully functional when the
+// code-generator is changed to use SVE instead of NEON for all fixed-width
+// operations.
+static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
+    "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
+
+// Experimental option that will only be fully functional when the cost-model
+// and code-generator have been changed to avoid using scalable vector
+// instructions that are not legal in streaming SVE mode.
+static cl::opt<bool> EnableScalableAutovecInStreamingMode(
+    "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
+
 bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                          const Function *Callee) const {
   SMEAttrs CallerAttrs(*Caller);
@@ -1487,6 +1499,30 @@
   return None;
 }
 
+TypeSize
+AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
+  switch (K) {
+  case TargetTransformInfo::RGK_Scalar:
+    return TypeSize::getFixed(64);
+  case TargetTransformInfo::RGK_FixedWidthVector:
+    if (!ST->isStreamingModeDisabled() &&
+        !EnableFixedwidthAutovecInStreamingMode)
+      return TypeSize::getFixed(0);
+
+    if (ST->hasSVE())
+      return TypeSize::getFixed(
+          std::max(ST->getMinSVEVectorSizeInBits(), 128u));
+
+    return TypeSize::getFixed(ST->hasNEON() ? 128 : 0);
+  case TargetTransformInfo::RGK_ScalableVector:
+    if (!ST->isStreamingModeDisabled() && !EnableScalableAutovecInStreamingMode)
+      return TypeSize::getScalable(0);
+
+    return TypeSize::getScalable(ST->hasSVE() ? 128 : 0);
+  }
+  llvm_unreachable("Unsupported register kind");
+}
+
 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
                                            ArrayRef<const Value *> Args) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sme-vectorize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sme-vectorize.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sme-vectorize.ll
@@ -0,0 +1,114 @@
+; RUN: sed -e s/REPLACE_PSTATE_MACRO/aarch64_pstate_sm_enabled/ %s | opt -loop-vectorize -slp-vectorizer -S - | FileCheck %s --check-prefix=CHECK
+; RUN: sed -e s/REPLACE_PSTATE_MACRO/aarch64_pstate_sm_enabled/ %s | opt -loop-vectorize -slp-vectorizer -S -enable-scalable-autovec-in-streaming-mode - | FileCheck %s --check-prefix=CHECK-FORCE-SCALABLE
+; RUN: sed -e s/REPLACE_PSTATE_MACRO/aarch64_pstate_sm_enabled/ %s | opt -loop-vectorize -slp-vectorizer -S -enable-fixedwidth-autovec-in-streaming-mode - | FileCheck %s --check-prefix=CHECK-FORCE-FIXEDWIDTH
+
+; RUN: sed -e s/REPLACE_PSTATE_MACRO/aarch64_pstate_sm_compatible/ %s | opt -loop-vectorize -slp-vectorizer -S - | FileCheck %s --check-prefix=CHECK
+; RUN: sed -e s/REPLACE_PSTATE_MACRO/aarch64_pstate_sm_compatible/ %s | opt -loop-vectorize -slp-vectorizer -S -enable-scalable-autovec-in-streaming-mode - | FileCheck %s --check-prefix=CHECK-FORCE-SCALABLE
+; RUN: sed -e s/REPLACE_PSTATE_MACRO/aarch64_pstate_sm_compatible/ %s | opt -loop-vectorize -slp-vectorizer -S -enable-fixedwidth-autovec-in-streaming-mode - | FileCheck %s --check-prefix=CHECK-FORCE-FIXEDWIDTH
+
+; RUN: sed -e s/REPLACE_PSTATE_MACRO/aarch64_pstate_sm_body/ %s | opt -loop-vectorize -slp-vectorizer -S - | FileCheck %s --check-prefix=CHECK
+; RUN: sed -e s/REPLACE_PSTATE_MACRO/aarch64_pstate_sm_body/ %s | opt -loop-vectorize -slp-vectorizer -S -enable-scalable-autovec-in-streaming-mode - | FileCheck %s --check-prefix=CHECK-FORCE-SCALABLE
+; RUN: sed -e s/REPLACE_PSTATE_MACRO/aarch64_pstate_sm_body/ %s | opt -loop-vectorize -slp-vectorizer -S -enable-fixedwidth-autovec-in-streaming-mode - | FileCheck %s --check-prefix=CHECK-FORCE-FIXEDWIDTH
+
+target triple = "aarch64-unknown-linux-gnu"
+
+attributes #0 = { vscale_range(1,16) "target-features"="+neon,+sme,+sve2" "REPLACE_PSTATE_MACRO" }
+
+define void @test_fixedwidth_loopvec(ptr noalias %dst, ptr readonly %src, i32 %N) #0 {
+; CHECK-LABEL: @test_fixedwidth_loopvec
+; CHECK-NOT: <{{[1-9]+}} x i32>
+; CHECK-FORCE-FIXEDWIDTH-LABEL: @test_fixedwidth_loopvec
+; CHECK-FORCE-FIXEDWIDTH: <{{[1-9]+}} x i32>
+entry:
+  %cmp6 = icmp sgt i32 %N, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %src, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %add = add nsw i32 %0, 42
+  %arrayidx2 = getelementptr inbounds i32, ptr %dst, i64 %indvars.iv
+  store i32 %add, ptr %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
+}
+
+!0 = distinct !{!0, !1, !2, !3}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.interleave.count", i32 1}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
+
+define void @test_scalable_loopvec(ptr noalias %dst, ptr readonly %src, i32 %N) #0 {
+; CHECK-LABEL: @test_scalable_loopvec
+; CHECK-NOT: <vscale x
+; CHECK-FORCE-SCALABLE-LABEL: @test_scalable_loopvec
+; CHECK-FORCE-SCALABLE: <vscale x
+entry:
+  %cmp6 = icmp sgt i32 %N, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %src, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %add = add nsw i32 %0, 42
+  %arrayidx2 = getelementptr inbounds i32, ptr %dst, i64 %indvars.iv
+  store i32 %add, ptr %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !4
+}
+
+!4 = distinct !{!4, !5, !6, !7}
+!5 = !{!"llvm.loop.mustprogress"}
+!6 = !{!"llvm.loop.interleave.count", i32 1}
+!7 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+define void @test_slp(ptr noalias %dst, ptr readonly %src, i32 %N) #0 {
+; CHECK-LABEL: @test_slp
+; CHECK-NOT: <{{[1-9]+}} x i32>
+; CHECK-FORCE-FIXEDWIDTH-LABEL: @test_slp
+; CHECK-FORCE-FIXEDWIDTH: <{{[1-9]+}} x i32>
+entry:
+  %cmp6 = icmp sgt i32 %N, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %src, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %add = add nsw i32 %0, 42
+  %arrayidx2 = getelementptr inbounds i32, ptr %dst, i64 %indvars.iv
+  store i32 %add, ptr %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !8
+}
+
+!8 = distinct !{!8, !9, !10, !11}
+!9 = !{!"llvm.loop.mustprogress"}
+!10 = !{!"llvm.loop.interleave.count", i32 4}
+!11 = !{!"llvm.loop.vectorize.width", i32 1}