diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -124,7 +124,8 @@ bool IsLittle; - bool StreamingSVEModeDisabled; + bool StreamingSVEMode; + bool StreamingCompatibleSVEMode; unsigned MinSVEVectorSizeInBits; unsigned MaxSVEVectorSizeInBits; unsigned VScaleForTuning = 2; @@ -163,7 +164,8 @@ StringRef FS, const TargetMachine &TM, bool LittleEndian, unsigned MinSVEVectorSizeInBitsOverride = 0, unsigned MaxSVEVectorSizeInBitsOverride = 0, - bool StreamingSVEModeDisabled = true); + bool StreamingSVEMode = false, + bool StreamingCompatibleSVEMode = false); // Getters for SubtargetFeatures defined in tablegen #define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ @@ -202,6 +204,12 @@ bool isXRaySupported() const override { return true; } + /// Returns true if the function has the streaming attribute. + bool isStreaming() const { return StreamingSVEMode; } + + /// Returns true if the function has the streaming-compatible attribute. + bool isStreamingCompatible() const { return StreamingCompatibleSVEMode; } + /// Returns true if the target has NEON and the function at runtime is known /// to have NEON enabled (e.g. the function is known not to be in streaming-SVE /// mode, which disables NEON instructions). @@ -209,7 +217,7 @@ unsigned getMinVectorRegisterBitWidth() const { // Don't assume any minimum vector size when PSTATE.SM may not be 0. 
- if (!isStreamingSVEModeDisabled()) + if (StreamingSVEMode || StreamingCompatibleSVEMode) return 0; return MinVectorRegisterBitWidth; } @@ -416,8 +424,6 @@ return "__security_check_cookie_arm64ec"; return "__security_check_cookie"; } - - bool isStreamingSVEModeDisabled() const { return StreamingSVEModeDisabled; } }; } // End llvm namespace diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -292,13 +292,15 @@ const TargetMachine &TM, bool LittleEndian, unsigned MinSVEVectorSizeInBitsOverride, unsigned MaxSVEVectorSizeInBitsOverride, - bool StreamingSVEModeDisabled) + bool StreamingSVEMode, + bool StreamingCompatibleSVEMode) : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS), ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()), ReserveXRegisterForRA(AArch64::GPR64commonRegClass.getNumRegs()), CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()), IsLittle(LittleEndian), - StreamingSVEModeDisabled(StreamingSVEModeDisabled), + StreamingSVEMode(StreamingSVEMode), + StreamingCompatibleSVEMode(StreamingCompatibleSVEMode), MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride), MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)), @@ -479,5 +481,10 @@ if (!hasNEON()) return false; - return !ForceStreamingCompatibleSVE; + // The 'force-streaming-compatible-sve' flag overrides the streaming + // function attributes. 
+ if (ForceStreamingCompatibleSVE.getNumOccurrences() > 0) + return !ForceStreamingCompatibleSVE; + + return !isStreaming() && !isStreamingCompatible(); } diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -391,10 +391,10 @@ StringRef TuneCPU = TuneAttr.isValid() ? TuneAttr.getValueAsString() : CPU; StringRef FS = FSAttr.isValid() ? FSAttr.getValueAsString() : TargetFS; - bool StreamingSVEModeDisabled = - !F.hasFnAttribute("aarch64_pstate_sm_enabled") && - !F.hasFnAttribute("aarch64_pstate_sm_compatible") && - !F.hasFnAttribute("aarch64_pstate_sm_body"); + bool StreamingSVEMode = F.hasFnAttribute("aarch64_pstate_sm_enabled") || + F.hasFnAttribute("aarch64_pstate_sm_body"); + bool StreamingCompatibleSVEMode = + F.hasFnAttribute("aarch64_pstate_sm_compatible"); unsigned MinSVEVectorSize = 0; unsigned MaxSVEVectorSize = 0; @@ -427,8 +427,11 @@ SmallString<512> Key; raw_svector_ostream(Key) << "SVEMin" << MinSVEVectorSize << "SVEMax" - << MaxSVEVectorSize << "StreamingSVEModeDisabled=" - << StreamingSVEModeDisabled << CPU << TuneCPU << FS; + << MaxSVEVectorSize + << "StreamingSVEMode=" << StreamingSVEMode + << "StreamingCompatibleSVEMode=" + << StreamingCompatibleSVEMode << CPU << TuneCPU + << FS; auto &I = SubtargetMap[Key]; if (!I) { @@ -438,8 +441,14 @@ resetTargetOptions(F); I = std::make_unique( TargetTriple, CPU, TuneCPU, FS, *this, isLittle, MinSVEVectorSize, - MaxSVEVectorSize, StreamingSVEModeDisabled); + MaxSVEVectorSize, StreamingSVEMode, StreamingCompatibleSVEMode); } + + assert((!StreamingSVEMode || I->hasSME()) && + "Expected SME to be available"); + assert((!StreamingCompatibleSVEMode || I->hasSVEorSME()) && + "Expected SVE or SME to be available"); + return I.get(); } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp 
b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1925,8 +1925,7 @@ case TargetTransformInfo::RGK_Scalar: return TypeSize::getFixed(64); case TargetTransformInfo::RGK_FixedWidthVector: - if (!ST->isStreamingSVEModeDisabled() && - !EnableFixedwidthAutovecInStreamingMode) + if (!ST->isNeonAvailable() && !EnableFixedwidthAutovecInStreamingMode) return TypeSize::getFixed(0); if (ST->hasSVE()) @@ -1935,7 +1934,8 @@ return TypeSize::getFixed(ST->hasNEON() ? 128 : 0); case TargetTransformInfo::RGK_ScalableVector: - if (!ST->isStreamingSVEModeDisabled() && !EnableScalableAutovecInStreamingMode) + if ((ST->isStreaming() || ST->isStreamingCompatible()) && + !EnableScalableAutovecInStreamingMode) return TypeSize::getScalable(0); return TypeSize::getScalable(ST->hasSVE() ? 128 : 0); diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -123,34 +123,42 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "aarch64_pstate_sm_compatible" nounwind #0 { ; CHECK-LABEL: streaming_compatible_with_neon_vectors: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x19, [sp, #96] // 16-byte Folded Spill -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 ; CHECK-NEXT: tbz x19, #0, .LBB4_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB4_2: -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: bl normal_callee_vec_arg -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: tbz x19, #0, .LBB4_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB4_4: -; CHECK-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload -; CHECK-NEXT: ldp x30, x19, [sp, #96] // 16-byte Folded Reload -; CHECK-NEXT: fadd v0.2d, v1.2d, v0.2d -; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #112 +; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z1, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp 
d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %res = call <2 x double> @normal_callee_vec_arg(<2 x double> %arg) %fadd = fadd <2 x double> %res, %arg diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/streaming-compatible-sve-no-maximize-bandwidth.ll b/llvm/test/Transforms/LoopVectorize/AArch64/streaming-compatible-sve-no-maximize-bandwidth.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/streaming-compatible-sve-no-maximize-bandwidth.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/streaming-compatible-sve-no-maximize-bandwidth.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=loop-vectorize -force-streaming-compatible-sve -mattr=+sve -force-target-instruction-cost=1 -scalable-vectorization=off -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=SC_SVE +; RUN: opt < %s -passes=loop-vectorize -force-streaming-compatible-sve -enable-fixedwidth-autovec-in-streaming-mode -mattr=+sve -force-target-instruction-cost=1 -scalable-vectorization=off -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=SC_SVE ; RUN: opt < %s -passes=loop-vectorize -mattr=+sve -force-target-instruction-cost=1 -scalable-vectorization=off -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=NO_SC_SVE target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"