diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -356,10 +356,7 @@
     return MinSVEVectorSizeInBits;
   }
 
-  bool useSVEForFixedLengthVectors() const {
-    // Prefer NEON unless larger SVE registers are available.
-    return hasSVE() && getMinSVEVectorSizeInBits() >= 256;
-  }
+  bool useSVEForFixedLengthVectors() const;
 
   unsigned getVScaleForTuning() const { return VScaleForTuning; }
 
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -65,6 +65,9 @@
                     "Should only be used for testing register allocator."),
     cl::CommaSeparated, cl::Hidden);
 
+static cl::opt<bool> ForceSVEFor128bitVectors("force-sve-128bit-vector",
+                                              cl::init(false), cl::Hidden);
+
 unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const {
   if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0)
     return OverrideVectorInsertExtractBaseCost;
@@ -428,3 +431,11 @@
 }
 
 bool AArch64Subtarget::useAA() const { return UseAA; }
+
+bool AArch64Subtarget::useSVEForFixedLengthVectors() const {
+  if (ForceSVEFor128bitVectors)
+    return hasSVE();
+
+  // Prefer NEON unless larger SVE registers are available.
+  return hasSVE() && getMinSVEVectorSizeInBits() >= 256;
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-load-store.ll b/llvm/test/CodeGen/AArch64/sve-masked-load-store.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-load-store.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --force-sve-128bit-vector < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; load v16i8
+define <16 x i8> @masked_load_v16i8(<16 x i8>* %src, <16 x i1> %mask) vscale_range(1,16) #0 {
+; CHECK-LABEL: masked_load_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v0.16b, v0.16b, #7
+; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %load = call <16 x i8> @llvm.masked.load.v16i8(<16 x i8>* %src, i32 8, <16 x i1> %mask, <16 x i8> zeroinitializer)
+  ret <16 x i8> %load
+}
+; store v16i8
+define void @masked_store_v16i8(<16 x i8>* %dst, <16 x i1> %mask) vscale_range(1,16) #0 {
+; CHECK-LABEL: masked_store_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v0.16b, v0.16b, #7
+; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT:    st1b { z1.b }, p0, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.masked.store.v16i8(<16 x i8> zeroinitializer, <16 x i8>* %dst, i32 8, <16 x i1> %mask)
+  ret void
+}
+
+; load 4xfloat
+define <4 x float> @masked_load_v4f32(<4 x float>* %src, <4 x i1> %mask) vscale_range(1,16) #0 {
+; CHECK-LABEL: masked_load_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    shl v0.4s, v0.4s, #31
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %load = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %src, i32 8, <4 x i1> %mask, <4 x float> zeroinitializer)
+  ret <4 x float> %load
+}
+
+; store v4f32
+define void @masked_store_v4f32(<4 x float>* %dst, <4 x i1> %mask) vscale_range(1,16) #0 {
+; CHECK-LABEL: masked_store_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    shl v0.4s, v0.4s, #31
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT:    st1w { z1.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.masked.store.v4f32(<4 x float> zeroinitializer, <4 x float>* %dst, i32 8, <4 x i1> %mask)
+  ret void
+}
+
+; load v2f64
+define <2 x double> @masked_load_v2f64(<2 x double>* %src, <2 x i1> %mask) vscale_range(1,16) #0 {
+; CHECK-LABEL: masked_load_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    shl v0.2d, v0.2d, #63
+; CHECK-NEXT:    cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %load = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %src, i32 8, <2 x i1> %mask, <2 x double> zeroinitializer)
+  ret <2 x double> %load
+}
+
+; store v2f64
+define void @masked_store_v2f64(<2 x double>* %dst, <2 x i1> %mask) vscale_range(1,16) #0 {
+; CHECK-LABEL: masked_store_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    shl v0.2d, v0.2d, #63
+; CHECK-NEXT:    cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT:    st1d { z1.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.masked.store.v2f64(<2 x double> zeroinitializer, <2 x double>* %dst, i32 8, <2 x i1> %mask)
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
+
+declare <16 x i8> @llvm.masked.load.v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
+declare void @llvm.masked.store.v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
+
+declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
+declare void @llvm.masked.store.v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
+
+declare <2 x double> @llvm.masked.load.v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
+declare void @llvm.masked.store.v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)