diff --git a/clang/test/CodeGen/aarch64-sve-vector-bits-codegen.c b/clang/test/CodeGen/aarch64-sve-vector-bits-codegen.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve-vector-bits-codegen.c
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -O2 -S -o - %s -msve-vector-bits=256 | FileCheck %s --check-prefixes=CHECK,CHECK256
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -O2 -S -o - %s -msve-vector-bits=512 | FileCheck %s --check-prefixes=CHECK,CHECK512
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -O2 -S -o - %s -msve-vector-bits=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -O2 -S -o - %s -msve-vector-bits=2048 | FileCheck %s --check-prefixes=CHECK,CHECK2048
+
+#include <arm_sve.h>
+
+void func(int *restrict a, int *restrict b) {
+// CHECK-LABEL: func
+// CHECK256-COUNT-8: st1w
+// CHECK512-COUNT-4: st1w
+// CHECK1024-COUNT-2: st1w
+// CHECK2048-COUNT-1: st1w
+#pragma clang loop vectorize(enable)
+  for (int i = 0; i < 64; ++i)
+    a[i] += b[i];
+}
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -261,6 +261,9 @@
 
   bool IsLittle;
 
+  unsigned MinSVEVectorSizeInBits;
+  unsigned MaxSVEVectorSizeInBits;
+
   /// TargetTriple - What processor and OS we're targeting.
   Triple TargetTriple;
 
@@ -291,7 +294,9 @@
   /// of the specified triple.
   AArch64Subtarget(const Triple &TT, const std::string &CPU,
                    const std::string &FS, const TargetMachine &TM,
-                   bool LittleEndian);
+                   bool LittleEndian,
+                   unsigned MinSVEVectorSizeInBitsOverride = 0,
+                   unsigned MaxSVEVectorSizeInBitsOverride = 0);
 
   const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override {
     return &TSInfo;
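A quick sanity check of the FileCheck counts in the clang test above: the loop touches 64 x i32 = 2048 bits of data, so if the vectorizer emits one full-width st1w per SVE register, the expected store count is 2048 divided by the register width. The stand-alone C++ sketch below is not part of the patch; it just encodes that arithmetic, assuming exactly one st1w per register-sized chunk.

// Stand-alone sketch (not part of the patch): expected number of full-width
// SVE stores for the 64 x i32 loop, assuming one st1w per vector register.
constexpr unsigned expectedSt1wCount(unsigned SVEBits) {
  return (64 * 32) / SVEBits; // 2048 bits of data / register width in bits
}
static_assert(expectedSt1wCount(256) == 8, "CHECK256-COUNT-8");
static_assert(expectedSt1wCount(512) == 4, "CHECK512-COUNT-4");
static_assert(expectedSt1wCount(1024) == 2, "CHECK1024-COUNT-2");
static_assert(expectedSt1wCount(2048) == 1, "CHECK2048-COUNT-1");
int main() { return 0; }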
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -47,18 +47,6 @@
     cl::desc("Call nonlazybind functions via direct GOT load"),
     cl::init(false), cl::Hidden);
 
-static cl::opt<unsigned> SVEVectorBitsMax(
-    "aarch64-sve-vector-bits-max",
-    cl::desc("Assume SVE vector registers are at most this big, "
-             "with zero meaning no maximum size is assumed."),
-    cl::init(0), cl::Hidden);
-
-static cl::opt<unsigned> SVEVectorBitsMin(
-    "aarch64-sve-vector-bits-min",
-    cl::desc("Assume SVE vector registers are at least this big, "
-             "with zero meaning no minimum size is assumed."),
-    cl::init(0), cl::Hidden);
-
 static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                            cl::desc("Enable the use of AA during codegen."));
 
@@ -210,14 +198,17 @@
 
 AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                    const std::string &FS,
-                                   const TargetMachine &TM, bool LittleEndian)
+                                   const TargetMachine &TM, bool LittleEndian,
+                                   unsigned MinSVEVectorSizeInBitsOverride,
+                                   unsigned MaxSVEVectorSizeInBitsOverride)
     : AArch64GenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian),
-      TargetTriple(TT), FrameLowering(),
-      InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(),
-      TLInfo(TM, *this) {
+      MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
+      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
+      FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS, CPU)),
+      TSInfo(), TLInfo(TM, *this) {
   if (AArch64::isX18ReservedByDefault(TT))
     ReserveXRegister.set(18);
 
@@ -358,24 +349,12 @@
 
 unsigned AArch64Subtarget::getMaxSVEVectorSizeInBits() const {
   assert(HasSVE && "Tried to get SVE vector length without SVE support!");
-  assert(SVEVectorBitsMax % 128 == 0 &&
-         "SVE requires vector length in multiples of 128!");
-  assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) &&
-         "Minimum SVE vector size should not be larger than its maximum!");
-  if (SVEVectorBitsMax == 0)
-    return 0;
-  return (std::max(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
+  return MaxSVEVectorSizeInBits;
 }
 
 unsigned AArch64Subtarget::getMinSVEVectorSizeInBits() const {
   assert(HasSVE && "Tried to get SVE vector length without SVE support!");
-  assert(SVEVectorBitsMin % 128 == 0 &&
-         "SVE requires vector length in multiples of 128!");
-  assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) &&
-         "Minimum SVE vector size should not be larger than its maximum!");
-  if (SVEVectorBitsMax == 0)
-    return (SVEVectorBitsMin / 128) * 128;
-  return (std::min(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
+  return MinSVEVectorSizeInBits;
 }
 
 bool AArch64Subtarget::useSVEForFixedLengthVectors() const {
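The getters above now simply return values fixed at construction time; all validation and clamping moves into AArch64TargetMachine::getSubtargetImpl below, so two functions with different vscale_range attributes can get distinct subtargets instead of sharing one set of cl::opt-derived globals. A minimal stand-alone mock of that cache-at-construction pattern (a hypothetical class, for illustration only, not the real AArch64Subtarget):

#include <cassert>

// Hypothetical mock: min/max SVE vector sizes are baked in at construction
// (already sanitized by the TargetMachine) and the getters return them
// unchanged, with no re-derivation from global command-line flags.
class MockSubtarget {
  unsigned MinSVEVectorSizeInBits;
  unsigned MaxSVEVectorSizeInBits;

public:
  MockSubtarget(unsigned MinOverride = 0, unsigned MaxOverride = 0)
      : MinSVEVectorSizeInBits(MinOverride),
        MaxSVEVectorSizeInBits(MaxOverride) {}
  unsigned getMinSVEVectorSizeInBits() const { return MinSVEVectorSizeInBits; }
  unsigned getMaxSVEVectorSizeInBits() const { return MaxSVEVectorSizeInBits; }
};

int main() {
  MockSubtarget ST(256, 512); // e.g. bounds derived from vscale_range(2,4)
  assert(ST.getMinSVEVectorSizeInBits() == 256);
  assert(ST.getMaxSVEVectorSizeInBits() == 512);
  return 0;
}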
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -161,6 +161,18 @@
     cl::desc("Enable the AArch64 branch target pass"),
     cl::init(true));
 
+static cl::opt<unsigned> SVEVectorBitsMaxOpt(
+    "aarch64-sve-vector-bits-max",
+    cl::desc("Assume SVE vector registers are at most this big, "
+             "with zero meaning no maximum size is assumed."),
+    cl::init(0), cl::Hidden);
+
+static cl::opt<unsigned> SVEVectorBitsMinOpt(
+    "aarch64-sve-vector-bits-min",
+    cl::desc("Assume SVE vector registers are at least this big, "
+             "with zero meaning no minimum size is assumed."),
+    cl::init(0), cl::Hidden);
+
 extern cl::opt<bool> EnableHomogeneousPrologEpilog;
 
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
@@ -349,14 +361,54 @@
   std::string FS =
       FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS;
 
-  auto &I = SubtargetMap[CPU + FS];
+  SmallString<512> Key;
+
+  unsigned MinSVEVectorSize = 0;
+  unsigned MaxSVEVectorSize = 0;
+  Attribute VScaleRangeAttr = F.getFnAttribute(Attribute::VScaleRange);
+  if (VScaleRangeAttr.isValid()) {
+    std::tie(MinSVEVectorSize, MaxSVEVectorSize) =
+        VScaleRangeAttr.getVScaleRangeArgs();
+    MinSVEVectorSize *= 128;
+    MaxSVEVectorSize *= 128;
+  } else {
+    MinSVEVectorSize = SVEVectorBitsMinOpt;
+    MaxSVEVectorSize = SVEVectorBitsMaxOpt;
+  }
+
+  assert(MinSVEVectorSize % 128 == 0 &&
+         "SVE requires vector length in multiples of 128!");
+  assert(MaxSVEVectorSize % 128 == 0 &&
+         "SVE requires vector length in multiples of 128!");
+  assert((MaxSVEVectorSize >= MinSVEVectorSize || MaxSVEVectorSize == 0) &&
+         "Minimum SVE vector size should not be larger than its maximum!");
+
+  // Sanitize user input in case of no asserts
+  if (MaxSVEVectorSize == 0)
+    MinSVEVectorSize = (MinSVEVectorSize / 128) * 128;
+  else {
+    MinSVEVectorSize =
+        (std::min(MinSVEVectorSize, MaxSVEVectorSize) / 128) * 128;
+    MaxSVEVectorSize =
+        (std::max(MinSVEVectorSize, MaxSVEVectorSize) / 128) * 128;
+  }
+
+  Key += "SVEMin";
+  Key += std::to_string(MinSVEVectorSize);
+  Key += "SVEMax";
+  Key += std::to_string(MaxSVEVectorSize);
+  Key += CPU;
+  Key += FS;
+
+  auto &I = SubtargetMap[Key];
   if (!I) {
     // This needs to be done before we create a new subtarget since any
     // creation will depend on the TM and the code generation flags on the
     // function that reside in TargetOptions.
     resetTargetOptions(F);
     I = std::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this,
-                                           isLittle);
+                                           isLittle, MinSVEVectorSize,
+                                           MaxSVEVectorSize);
   }
   return I.get();
 }
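For reference, the sanitization above can be exercised in isolation. The sketch below uses a hypothetical helper name and simply mirrors the committed logic, including the detail that the Max computation observes the already-clamped Min; it rounds both bounds down to multiples of 128 and orders them. Folding the resulting bounds into the SubtargetMap key is what keeps functions with different vscale_range attributes from sharing a cached subtarget.

#include <algorithm>
#include <cassert>
#include <utility>

// Hypothetical stand-alone mirror of the "Sanitize user input" block in
// getSubtargetImpl; not part of the patch itself.
static std::pair<unsigned, unsigned> sanitizeSVEVectorBits(unsigned Min,
                                                           unsigned Max) {
  if (Max == 0) {
    Min = (Min / 128) * 128; // no maximum: just round Min down
  } else {
    Min = (std::min(Min, Max) / 128) * 128;
    Max = (std::max(Min, Max) / 128) * 128; // note: uses the updated Min
  }
  return {Min, Max};
}

int main() {
  // vscale_range(2,4) arrives as 256..512 bits and passes through untouched.
  assert(sanitizeSVEVectorBits(256, 512) == std::make_pair(256u, 512u));
  // Unaligned input with no maximum is rounded down to a 128-bit multiple.
  assert(sanitizeSVEVectorBits(300, 0) == std::make_pair(256u, 0u));
  return 0;
}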
diff --git a/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll b/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll
@@ -0,0 +1,144 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOARG
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-ARG
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @func_vscale_none(<16 x i32>* %a, <16 x i32>* %b) #0 {
+; CHECK-NOARG-LABEL: func_vscale_none:
+; CHECK-NOARG:       // %bb.0:
+; CHECK-NOARG-NEXT:    ldp q0, q1, [x0]
+; CHECK-NOARG-NEXT:    ldp q2, q3, [x1]
+; CHECK-NOARG-NEXT:    ldp q4, q5, [x0, #32]
+; CHECK-NOARG-NEXT:    ldp q7, q6, [x1, #32]
+; CHECK-NOARG-NEXT:    add v1.4s, v1.4s, v3.4s
+; CHECK-NOARG-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NOARG-NEXT:    add v2.4s, v5.4s, v6.4s
+; CHECK-NOARG-NEXT:    add v3.4s, v4.4s, v7.4s
+; CHECK-NOARG-NEXT:    stp q3, q2, [x0, #32]
+; CHECK-NOARG-NEXT:    stp q0, q1, [x0]
+; CHECK-NOARG-NEXT:    ret
+;
+; CHECK-ARG-LABEL: func_vscale_none:
+; CHECK-ARG:       // %bb.0:
+; CHECK-ARG-NEXT:    ptrue p0.s, vl16
+; CHECK-ARG-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-ARG-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-ARG-NEXT:    add z0.s, p0/m, z0.s, z1.s
+; CHECK-ARG-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-ARG-NEXT:    ret
+  %op1 = load <16 x i32>, <16 x i32>* %a
+  %op2 = load <16 x i32>, <16 x i32>* %b
+  %res = add <16 x i32> %op1, %op2
+  store <16 x i32> %res, <16 x i32>* %a
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
+
+define void @func_vscale1_1(<16 x i32>* %a, <16 x i32>* %b) #1 {
+; CHECK-LABEL: func_vscale1_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q2, q3, [x1]
+; CHECK-NEXT:    ldp q4, q5, [x0, #32]
+; CHECK-NEXT:    ldp q7, q6, [x1, #32]
+; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    add v2.4s, v5.4s, v6.4s
+; CHECK-NEXT:    add v3.4s, v4.4s, v7.4s
+; CHECK-NEXT:    stp q3, q2, [x0, #32]
+; CHECK-NEXT:    stp q0, q1, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x i32>, <16 x i32>* %a
+  %op2 = load <16 x i32>, <16 x i32>* %b
+  %res = add <16 x i32> %op1, %op2
+  store <16 x i32> %res, <16 x i32>* %a
+  ret void
+}
+
+attributes #1 = { "target-features"="+sve" vscale_range(1,1) }
+
+define void @func_vscale2_2(<16 x i32>* %a, <16 x i32>* %b) #2 {
+; CHECK-LABEL: func_vscale2_2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    add x8, x0, #32 // =32
+; CHECK-NEXT:    add x9, x1, #32 // =32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x8]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x9]
+; CHECK-NEXT:    add z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    add z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x i32>, <16 x i32>* %a
+  %op2 = load <16 x i32>, <16 x i32>* %b
+  %res = add <16 x i32> %op1, %op2
+  store <16 x i32> %res, <16 x i32>* %a
+  ret void
+}
+
+attributes #2 = { "target-features"="+sve" vscale_range(2,2) }
+
+define void @func_vscale2_4(<16 x i32>* %a, <16 x i32>* %b) #3 {
+; CHECK-LABEL: func_vscale2_4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    add x8, x0, #32 // =32
+; CHECK-NEXT:    add x9, x1, #32 // =32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x8]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x9]
+; CHECK-NEXT:    add z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    add z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x i32>, <16 x i32>* %a
+  %op2 = load <16 x i32>, <16 x i32>* %b
+  %res = add <16 x i32> %op1, %op2
+  store <16 x i32> %res, <16 x i32>* %a
+  ret void
+}
+
+attributes #3 = { "target-features"="+sve" vscale_range(2,4) }
+
+define void @func_vscale4_4(<16 x i32>* %a, <16 x i32>* %b) #4 {
+; CHECK-LABEL: func_vscale4_4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    add z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x i32>, <16 x i32>* %a
+  %op2 = load <16 x i32>, <16 x i32>* %b
+  %res = add <16 x i32> %op1, %op2
+  store <16 x i32> %res, <16 x i32>* %a
+  ret void
+}
+
+attributes #4 = { "target-features"="+sve" vscale_range(4,4) }
+
+define void @func_vscale8_8(<16 x i32>* %a, <16 x i32>* %b) #5 {
+; CHECK-LABEL: func_vscale8_8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    add z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x i32>, <16 x i32>* %a
+  %op2 = load <16 x i32>, <16 x i32>* %b
+  %res = add <16 x i32> %op1, %op2
+  store <16 x i32> %res, <16 x i32>* %a
+  ret void
+}
+
+attributes #5 = { "target-features"="+sve" vscale_range(8,8) }
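The check lines in this test follow directly from lanes-per-register arithmetic: a 256-bit register holds 8 x i32, so vscale_range(2,2) splits the <16 x i32> operation in two under "ptrue p0.s, vl8", while 512 bits and up hold all 16 lanes in one register, giving a single operation under "ptrue p0.s, vl16". A small stand-alone sketch of that arithmetic, for illustration only:

#include <cassert>

// Lanes of i32 per SVE register at a given width; explains the vl8/vl16
// predicates and the two-versus-one ld1w/st1w split in the test above.
constexpr unsigned i32LanesPerRegister(unsigned SVEBits) {
  return SVEBits / 32;
}

int main() {
  assert(i32LanesPerRegister(256) == 8);  // vscale_range(2,2): ptrue ... vl8
  assert(i32LanesPerRegister(512) == 16); // vscale_range(4,4): ptrue ... vl16
  return 0;
}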