diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -148,6 +148,9 @@ def FeatureUseScalarIncVL : SubtargetFeature<"use-scalar-inc-vl", "UseScalarIncVL", "true", "Prefer inc/dec over add+cnt">; +def FeatureNoSVEFPLD1R : SubtargetFeature<"no-sve-fp-ld1r", + "NoSVEFPLD1R", "true", "Avoid using LD1RX instructions for FP">; + def FeatureSVE2 : SubtargetFeature<"sve2", "HasSVE2", "true", "Enable Scalable Vector Extension 2 (SVE2) instructions (FEAT_SVE2)", [FeatureSVE, FeatureUseScalarIncVL]>; @@ -1137,7 +1140,8 @@ FeatureLSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, - FeaturePredictableSelectIsExpensive]>; + FeaturePredictableSelectIsExpensive, + FeatureNoSVEFPLD1R]>; def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2", "Neoverse V2 ARM processors", [ diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -262,6 +262,8 @@ def UseScalarIncVL : Predicate<"Subtarget->useScalarIncVL()">; +def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">; + def IsNeonAvailable : Predicate<"Subtarget->isNeonAvailable()">; def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER", diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2355,13 +2355,15 @@ // LDR1 of 64-bit data defm : LD1RPat; - // LD1R of FP data - defm : LD1RPat; - defm : LD1RPat; - defm : LD1RPat; - defm : LD1RPat; - defm : LD1RPat; - defm : LD1RPat; + let Predicates = [HasSVEorSME, UseSVEFPLD1R] in { + // LD1R of FP data + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + } // LD1R of 128-bit masked data multiclass ld1rq_pat{ diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -7203,6 +7203,10 @@ def : Pat<(nxv8f16 (op nxv8i1:$pg, f16:$splat, nxv8f16:$passthru)), (!cast(NAME # _H) $passthru, $pg, $splat)>; + def : Pat<(nxv4f16 (op nxv4i1:$pg, f16:$splat, nxv4f16:$passthru)), + (!cast(NAME # _H) $passthru, $pg, $splat)>; + def : Pat<(nxv2f16 (op nxv2i1:$pg, f16:$splat, nxv2f16:$passthru)), + (!cast(NAME # _H) $passthru, $pg, $splat)>; def : Pat<(nxv2f32 (op nxv2i1:$pg, f32:$splat, nxv2f32:$passthru)), (!cast(NAME # _S) $passthru, $pg, $splat)>; def : Pat<(nxv4f32 (op nxv4i1:$pg, f32:$splat, nxv4f32:$passthru)), diff --git a/llvm/test/CodeGen/AArch64/sve-ld1r.ll b/llvm/test/CodeGen/AArch64/sve-ld1r.ll --- a/llvm/test/CodeGen/AArch64/sve-ld1r.ll +++ b/llvm/test/CodeGen/AArch64/sve-ld1r.ll @@ -1,5 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,CHECK-LD1R +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+no-sve-fp-ld1r < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NO-LD1R +; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=neoverse-v1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NEOVERSE-V1 ; ; Check that ldr1* instruction is generated to splat scalar during load, ; rather than mov from scalar to vector register (which would require the vector unit). @@ -63,12 +65,26 @@ } define @ld1rb_gep_out_of_range_up(ptr %valp) { -; CHECK-LABEL: ld1rb_gep_out_of_range_up: -; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #64 -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x8] -; CHECK-NEXT: ret +; CHECK-LD1R-LABEL: ld1rb_gep_out_of_range_up: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: add x8, x0, #64 +; CHECK-LD1R-NEXT: ptrue p0.b +; CHECK-LD1R-NEXT: ld1rb { z0.b }, p0/z, [x8] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rb_gep_out_of_range_up: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: add x8, x0, #64 +; CHECK-NO-LD1R-NEXT: ptrue p0.b +; CHECK-NO-LD1R-NEXT: ld1rb { z0.b }, p0/z, [x8] +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rb_gep_out_of_range_up: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ptrue p0.b +; CHECK-NEOVERSE-V1-NEXT: add x8, x0, #64 +; CHECK-NEOVERSE-V1-NEXT: ld1rb { z0.b }, p0/z, [x8] +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr i8, ptr %valp, i32 64 %val = load i8, ptr %valp2 %ins = insertelement undef, i8 %val, i32 0 @@ -77,12 +93,26 @@ } define @ld1rb_gep_out_of_range_down(ptr %valp) { -; CHECK-LABEL: ld1rb_gep_out_of_range_down: -; CHECK: // %bb.0: -; CHECK-NEXT: sub x8, x0, #1 -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x8] -; CHECK-NEXT: ret +; CHECK-LD1R-LABEL: ld1rb_gep_out_of_range_down: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: sub x8, x0, #1 +; CHECK-LD1R-NEXT: ptrue p0.b +; CHECK-LD1R-NEXT: ld1rb { z0.b }, p0/z, [x8] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rb_gep_out_of_range_down: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: sub x8, x0, #1 +; CHECK-NO-LD1R-NEXT: ptrue p0.b +; CHECK-NO-LD1R-NEXT: ld1rb { z0.b }, p0/z, [x8] +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rb_gep_out_of_range_down: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ptrue p0.b +; CHECK-NEOVERSE-V1-NEXT: sub x8, x0, #1 +; CHECK-NEOVERSE-V1-NEXT: ld1rb { z0.b }, p0/z, [x8] +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr i8, ptr %valp, i32 -1 %val = load i8, ptr %valp2 %ins = insertelement undef, i8 %val, i32 0 @@ -194,12 +224,26 @@ } define @ld1rh_gep_out_of_range_up(ptr %valp) { -; CHECK-LABEL: ld1rh_gep_out_of_range_up: -; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #128 -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x8] -; CHECK-NEXT: ret +; CHECK-LD1R-LABEL: ld1rh_gep_out_of_range_up: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: add x8, x0, #128 +; CHECK-LD1R-NEXT: ptrue p0.h +; CHECK-LD1R-NEXT: ld1rh { z0.h }, p0/z, [x8] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rh_gep_out_of_range_up: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: add x8, x0, #128 +; CHECK-NO-LD1R-NEXT: ptrue p0.h +; CHECK-NO-LD1R-NEXT: ld1rh { z0.h }, p0/z, [x8] +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rh_gep_out_of_range_up: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ptrue p0.h +; CHECK-NEOVERSE-V1-NEXT: add x8, x0, #128 +; CHECK-NEOVERSE-V1-NEXT: ld1rh { z0.h }, p0/z, [x8] +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr i16, ptr %valp, i32 64 %val = load i16, ptr %valp2 %ins = insertelement undef, i16 %val, i32 0 @@ -208,12 +252,26 @@ } define @ld1rh_gep_out_of_range_down(ptr %valp) { -; CHECK-LABEL: ld1rh_gep_out_of_range_down: -; CHECK: // %bb.0: -; CHECK-NEXT: sub x8, x0, #2 -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x8] -; CHECK-NEXT: ret +; CHECK-LD1R-LABEL: ld1rh_gep_out_of_range_down: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: sub x8, x0, #2 +; CHECK-LD1R-NEXT: ptrue p0.h +; CHECK-LD1R-NEXT: ld1rh { z0.h }, p0/z, [x8] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rh_gep_out_of_range_down: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: sub x8, x0, #2 +; CHECK-NO-LD1R-NEXT: ptrue p0.h +; CHECK-NO-LD1R-NEXT: ld1rh { z0.h }, p0/z, [x8] +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rh_gep_out_of_range_down: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ptrue p0.h +; CHECK-NEOVERSE-V1-NEXT: sub x8, x0, #2 +; CHECK-NEOVERSE-V1-NEXT: ld1rh { z0.h }, p0/z, [x8] +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr i16, ptr %valp, i32 -1 %val = load i16, ptr %valp2 %ins = insertelement undef, i16 %val, i32 0 @@ -299,12 +357,26 @@ } define @ld1rw_gep_out_of_range_up(ptr %valp) { -; CHECK-LABEL: ld1rw_gep_out_of_range_up: -; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #256 -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x8] -; CHECK-NEXT: ret +; CHECK-LD1R-LABEL: ld1rw_gep_out_of_range_up: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: add x8, x0, #256 +; CHECK-LD1R-NEXT: ptrue p0.s +; CHECK-LD1R-NEXT: ld1rw { z0.s }, p0/z, [x8] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rw_gep_out_of_range_up: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: add x8, x0, #256 +; CHECK-NO-LD1R-NEXT: ptrue p0.s +; CHECK-NO-LD1R-NEXT: ld1rw { z0.s }, p0/z, [x8] +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rw_gep_out_of_range_up: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ptrue p0.s +; CHECK-NEOVERSE-V1-NEXT: add x8, x0, #256 +; CHECK-NEOVERSE-V1-NEXT: ld1rw { z0.s }, p0/z, [x8] +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr i32, ptr %valp, i32 64 %val = load i32, ptr %valp2 %ins = insertelement undef, i32 %val, i32 0 @@ -313,12 +385,26 @@ } define @ld1rw_gep_out_of_range_down(ptr %valp) { -; CHECK-LABEL: ld1rw_gep_out_of_range_down: -; CHECK: // %bb.0: -; CHECK-NEXT: sub x8, x0, #4 -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x8] -; CHECK-NEXT: ret +; CHECK-LD1R-LABEL: ld1rw_gep_out_of_range_down: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: sub x8, x0, #4 +; CHECK-LD1R-NEXT: ptrue p0.s +; CHECK-LD1R-NEXT: ld1rw { z0.s }, p0/z, [x8] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rw_gep_out_of_range_down: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: sub x8, x0, #4 +; CHECK-NO-LD1R-NEXT: ptrue p0.s +; CHECK-NO-LD1R-NEXT: ld1rw { z0.s }, p0/z, [x8] +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rw_gep_out_of_range_down: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ptrue p0.s +; CHECK-NEOVERSE-V1-NEXT: sub x8, x0, #4 +; CHECK-NEOVERSE-V1-NEXT: ld1rw { z0.s }, p0/z, [x8] +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr i32, ptr %valp, i32 -1 %val = load i32, ptr %valp2 %ins = insertelement undef, i32 %val, i32 0 @@ -378,12 +464,26 @@ } define @ld1rd_gep_out_of_range_up(ptr %valp) { -; CHECK-LABEL: ld1rd_gep_out_of_range_up: -; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #512 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x8] -; CHECK-NEXT: ret +; CHECK-LD1R-LABEL: ld1rd_gep_out_of_range_up: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: add x8, x0, #512 +; CHECK-LD1R-NEXT: ptrue p0.d +; CHECK-LD1R-NEXT: ld1rd { z0.d }, p0/z, [x8] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rd_gep_out_of_range_up: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: add x8, x0, #512 +; CHECK-NO-LD1R-NEXT: ptrue p0.d +; CHECK-NO-LD1R-NEXT: ld1rd { z0.d }, p0/z, [x8] +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rd_gep_out_of_range_up: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ptrue p0.d +; CHECK-NEOVERSE-V1-NEXT: add x8, x0, #512 +; CHECK-NEOVERSE-V1-NEXT: ld1rd { z0.d }, p0/z, [x8] +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr i64, ptr %valp, i32 64 %val = load i64, ptr %valp2 %ins = insertelement undef, i64 %val, i32 0 @@ -392,12 +492,26 @@ } define @ld1rd_gep_out_of_range_down(ptr %valp) { -; CHECK-LABEL: ld1rd_gep_out_of_range_down: -; CHECK: // %bb.0: -; CHECK-NEXT: sub x8, x0, #8 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x8] -; CHECK-NEXT: ret +; CHECK-LD1R-LABEL: ld1rd_gep_out_of_range_down: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: sub x8, x0, #8 +; CHECK-LD1R-NEXT: ptrue p0.d +; CHECK-LD1R-NEXT: ld1rd { z0.d }, p0/z, [x8] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rd_gep_out_of_range_down: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: sub x8, x0, #8 +; CHECK-NO-LD1R-NEXT: ptrue p0.d +; CHECK-NO-LD1R-NEXT: ld1rd { z0.d }, p0/z, [x8] +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rd_gep_out_of_range_down: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ptrue p0.d +; CHECK-NEOVERSE-V1-NEXT: sub x8, x0, #8 +; CHECK-NEOVERSE-V1-NEXT: ld1rd { z0.d }, p0/z, [x8] +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr i64, ptr %valp, i32 -1 %val = load i64, ptr %valp2 %ins = insertelement undef, i64 %val, i32 0 @@ -406,11 +520,24 @@ } define @ld1rh_half(ptr %valp) { -; CHECK-LABEL: ld1rh_half: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: ld1rh_half: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: ptrue p0.h +; CHECK-LD1R-NEXT: ld1rh { z0.h }, p0/z, [x0] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rh_half: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr h0, [x0] +; CHECK-NO-LD1R-NEXT: mov z0.h, h0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rh_half: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr h0, [x0] +; CHECK-NEOVERSE-V1-NEXT: mov z0.h, h0 +; CHECK-NEOVERSE-V1-NEXT: ret %val = load half, ptr %valp %ins = insertelement undef, half %val, i32 0 %shf = shufflevector %ins, undef, zeroinitializer @@ -418,11 +545,24 @@ } define @ld1rh_half_gep(ptr %valp) { -; CHECK-LABEL: ld1rh_half_gep: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0, #126] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: ld1rh_half_gep: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: ptrue p0.h +; CHECK-LD1R-NEXT: ld1rh { z0.h }, p0/z, [x0, #126] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rh_half_gep: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr h0, [x0, #126] +; CHECK-NO-LD1R-NEXT: mov z0.h, h0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rh_half_gep: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr h0, [x0, #126] +; CHECK-NEOVERSE-V1-NEXT: mov z0.h, h0 +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr half, ptr %valp, i32 63 %val = load half, ptr %valp2 %ins = insertelement undef, half %val, i32 0 @@ -431,12 +571,25 @@ } define @ld1rh_half_gep_out_of_range_up(ptr %valp) { -; CHECK-LABEL: ld1rh_half_gep_out_of_range_up: -; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #128 -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x8] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: ld1rh_half_gep_out_of_range_up: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: add x8, x0, #128 +; CHECK-LD1R-NEXT: ptrue p0.h +; CHECK-LD1R-NEXT: ld1rh { z0.h }, p0/z, [x8] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rh_half_gep_out_of_range_up: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr h0, [x0, #128] +; CHECK-NO-LD1R-NEXT: mov z0.h, h0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rh_half_gep_out_of_range_up: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr h0, [x0, #128] +; CHECK-NEOVERSE-V1-NEXT: mov z0.h, h0 +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr half, ptr %valp, i32 64 %val = load half, ptr %valp2 %ins = insertelement undef, half %val, i32 0 @@ -445,12 +598,25 @@ } define @ld1rh_half_gep_out_of_range_down(ptr %valp) { -; CHECK-LABEL: ld1rh_half_gep_out_of_range_down: -; CHECK: // %bb.0: -; CHECK-NEXT: sub x8, x0, #2 -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x8] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: ld1rh_half_gep_out_of_range_down: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: sub x8, x0, #2 +; CHECK-LD1R-NEXT: ptrue p0.h +; CHECK-LD1R-NEXT: ld1rh { z0.h }, p0/z, [x8] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rh_half_gep_out_of_range_down: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldur h0, [x0, #-2] +; CHECK-NO-LD1R-NEXT: mov z0.h, h0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rh_half_gep_out_of_range_down: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldur h0, [x0, #-2] +; CHECK-NEOVERSE-V1-NEXT: mov z0.h, h0 +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr half, ptr %valp, i32 -1 %val = load half, ptr %valp2 %ins = insertelement undef, half %val, i32 0 @@ -459,11 +625,24 @@ } define @ld1rh_half_unpacked4(ptr %valp) { -; CHECK-LABEL: ld1rh_half_unpacked4: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1rh { z0.s }, p0/z, [x0] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: ld1rh_half_unpacked4: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: ptrue p0.s +; CHECK-LD1R-NEXT: ld1rh { z0.s }, p0/z, [x0] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked4: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr h0, [x0] +; CHECK-NO-LD1R-NEXT: mov z0.h, h0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rh_half_unpacked4: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr h0, [x0] +; CHECK-NEOVERSE-V1-NEXT: mov z0.h, h0 +; CHECK-NEOVERSE-V1-NEXT: ret %val = load half, ptr %valp %ins = insertelement undef, half %val, i32 0 %shf = shufflevector %ins, undef, zeroinitializer @@ -471,11 +650,24 @@ } define @ld1rh_half_unpacked4_gep(ptr %valp) { -; CHECK-LABEL: ld1rh_half_unpacked4_gep: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1rh { z0.s }, p0/z, [x0, #126] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: ld1rh_half_unpacked4_gep: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: ptrue p0.s +; CHECK-LD1R-NEXT: ld1rh { z0.s }, p0/z, [x0, #126] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked4_gep: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr h0, [x0, #126] +; CHECK-NO-LD1R-NEXT: mov z0.h, h0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rh_half_unpacked4_gep: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr h0, [x0, #126] +; CHECK-NEOVERSE-V1-NEXT: mov z0.h, h0 +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr half, ptr %valp, i32 63 %val = load half, ptr %valp2 %ins = insertelement undef, half %val, i32 0 @@ -484,12 +676,25 @@ } define @ld1rh_half_unpacked4_gep_out_of_range_up(ptr %valp) { -; CHECK-LABEL: ld1rh_half_unpacked4_gep_out_of_range_up: -; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #128 -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1rh { z0.s }, p0/z, [x8] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: ld1rh_half_unpacked4_gep_out_of_range_up: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: add x8, x0, #128 +; CHECK-LD1R-NEXT: ptrue p0.s +; CHECK-LD1R-NEXT: ld1rh { z0.s }, p0/z, [x8] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked4_gep_out_of_range_up: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr h0, [x0, #128] +; CHECK-NO-LD1R-NEXT: mov z0.h, h0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rh_half_unpacked4_gep_out_of_range_up: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr h0, [x0, #128] +; CHECK-NEOVERSE-V1-NEXT: mov z0.h, h0 +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr half, ptr %valp, i32 64 %val = load half, ptr %valp2 %ins = insertelement undef, half %val, i32 0 @@ -498,12 +703,25 @@ } define @ld1rh_half_unpacked4_gep_out_of_range_down(ptr %valp) { -; CHECK-LABEL: ld1rh_half_unpacked4_gep_out_of_range_down: -; CHECK: // %bb.0: -; CHECK-NEXT: sub x8, x0, #2 -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1rh { z0.s }, p0/z, [x8] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: ld1rh_half_unpacked4_gep_out_of_range_down: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: sub x8, x0, #2 +; CHECK-LD1R-NEXT: ptrue p0.s +; CHECK-LD1R-NEXT: ld1rh { z0.s }, p0/z, [x8] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked4_gep_out_of_range_down: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldur h0, [x0, #-2] +; CHECK-NO-LD1R-NEXT: mov z0.h, h0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rh_half_unpacked4_gep_out_of_range_down: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldur h0, [x0, #-2] +; CHECK-NEOVERSE-V1-NEXT: mov z0.h, h0 +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr half, ptr %valp, i32 -1 %val = load half, ptr %valp2 %ins = insertelement undef, half %val, i32 0 @@ -512,11 +730,24 @@ } define @ld1rh_half_unpacked2(ptr %valp) { -; CHECK-LABEL: ld1rh_half_unpacked2: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1rh { z0.d }, p0/z, [x0] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: ld1rh_half_unpacked2: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: ptrue p0.d +; CHECK-LD1R-NEXT: ld1rh { z0.d }, p0/z, [x0] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked2: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr h0, [x0] +; CHECK-NO-LD1R-NEXT: mov z0.h, h0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rh_half_unpacked2: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr h0, [x0] +; CHECK-NEOVERSE-V1-NEXT: mov z0.h, h0 +; CHECK-NEOVERSE-V1-NEXT: ret %val = load half, ptr %valp %ins = insertelement undef, half %val, i32 0 %shf = shufflevector %ins, undef, zeroinitializer @@ -524,11 +755,24 @@ } define @ld1rh_half_unpacked2_gep(ptr %valp) { -; CHECK-LABEL: ld1rh_half_unpacked2_gep: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1rh { z0.d }, p0/z, [x0, #126] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: ld1rh_half_unpacked2_gep: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: ptrue p0.d +; CHECK-LD1R-NEXT: ld1rh { z0.d }, p0/z, [x0, #126] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked2_gep: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr h0, [x0, #126] +; CHECK-NO-LD1R-NEXT: mov z0.h, h0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rh_half_unpacked2_gep: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr h0, [x0, #126] +; CHECK-NEOVERSE-V1-NEXT: mov z0.h, h0 +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr half, ptr %valp, i32 63 %val = load half, ptr %valp2 %ins = insertelement undef, half %val, i32 0 @@ -537,12 +781,25 @@ } define @ld1rh_half_unpacked2_gep_out_of_range_up(ptr %valp) { -; CHECK-LABEL: ld1rh_half_unpacked2_gep_out_of_range_up: -; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #128 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1rh { z0.d }, p0/z, [x8] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: ld1rh_half_unpacked2_gep_out_of_range_up: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: add x8, x0, #128 +; CHECK-LD1R-NEXT: ptrue p0.d +; CHECK-LD1R-NEXT: ld1rh { z0.d }, p0/z, [x8] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked2_gep_out_of_range_up: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr h0, [x0, #128] +; CHECK-NO-LD1R-NEXT: mov z0.h, h0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rh_half_unpacked2_gep_out_of_range_up: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr h0, [x0, #128] +; CHECK-NEOVERSE-V1-NEXT: mov z0.h, h0 +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr half, ptr %valp, i32 64 %val = load half, ptr %valp2 %ins = insertelement undef, half %val, i32 0 @@ -551,12 +808,25 @@ } define @ld1rh_half_unpacked2_gep_out_of_range_down(ptr %valp) { -; CHECK-LABEL: ld1rh_half_unpacked2_gep_out_of_range_down: -; CHECK: // %bb.0: -; CHECK-NEXT: sub x8, x0, #2 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1rh { z0.d }, p0/z, [x8] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: ld1rh_half_unpacked2_gep_out_of_range_down: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: sub x8, x0, #2 +; CHECK-LD1R-NEXT: ptrue p0.d +; CHECK-LD1R-NEXT: ld1rh { z0.d }, p0/z, [x8] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked2_gep_out_of_range_down: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldur h0, [x0, #-2] +; CHECK-NO-LD1R-NEXT: mov z0.h, h0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rh_half_unpacked2_gep_out_of_range_down: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldur h0, [x0, #-2] +; CHECK-NEOVERSE-V1-NEXT: mov z0.h, h0 +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr half, ptr %valp, i32 -1 %val = load half, ptr %valp2 %ins = insertelement undef, half %val, i32 0 @@ -565,11 +835,24 @@ } define @ld1rw_float(ptr %valp) { -; CHECK-LABEL: ld1rw_float: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: ld1rw_float: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: ptrue p0.s +; CHECK-LD1R-NEXT: ld1rw { z0.s }, p0/z, [x0] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rw_float: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr s0, [x0] +; CHECK-NO-LD1R-NEXT: mov z0.s, s0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rw_float: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr s0, [x0] +; CHECK-NEOVERSE-V1-NEXT: mov z0.s, s0 +; CHECK-NEOVERSE-V1-NEXT: ret %val = load float, ptr %valp %ins = insertelement undef, float %val, i32 0 %shf = shufflevector %ins, undef, zeroinitializer @@ -577,11 +860,24 @@ } define @ld1rw_float_gep(ptr %valp) { -; CHECK-LABEL: ld1rw_float_gep: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0, #252] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: ld1rw_float_gep: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: ptrue p0.s +; CHECK-LD1R-NEXT: ld1rw { z0.s }, p0/z, [x0, #252] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rw_float_gep: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr s0, [x0, #252] +; CHECK-NO-LD1R-NEXT: mov z0.s, s0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rw_float_gep: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr s0, [x0, #252] +; CHECK-NEOVERSE-V1-NEXT: mov z0.s, s0 +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr float, ptr %valp, i32 63 %val = load float, ptr %valp2 %ins = insertelement undef, float %val, i32 0 @@ -590,12 +886,25 @@ } define @ld1rw_float_gep_out_of_range_up(ptr %valp) { -; CHECK-LABEL: ld1rw_float_gep_out_of_range_up: -; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #256 -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x8] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: ld1rw_float_gep_out_of_range_up: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: add x8, x0, #256 +; CHECK-LD1R-NEXT: ptrue p0.s +; CHECK-LD1R-NEXT: ld1rw { z0.s }, p0/z, [x8] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rw_float_gep_out_of_range_up: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr s0, [x0, #256] +; CHECK-NO-LD1R-NEXT: mov z0.s, s0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rw_float_gep_out_of_range_up: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr s0, [x0, #256] +; CHECK-NEOVERSE-V1-NEXT: mov z0.s, s0 +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr float, ptr %valp, i32 64 %val = load float, ptr %valp2 %ins = insertelement undef, float %val, i32 0 @@ -604,12 +913,25 @@ } define @ld1rw_float_gep_out_of_range_down(ptr %valp) { -; CHECK-LABEL: ld1rw_float_gep_out_of_range_down: -; CHECK: // %bb.0: -; CHECK-NEXT: sub x8, x0, #4 -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x8] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: ld1rw_float_gep_out_of_range_down: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: sub x8, x0, #4 +; CHECK-LD1R-NEXT: ptrue p0.s +; CHECK-LD1R-NEXT: ld1rw { z0.s }, p0/z, [x8] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rw_float_gep_out_of_range_down: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldur s0, [x0, #-4] +; CHECK-NO-LD1R-NEXT: mov z0.s, s0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rw_float_gep_out_of_range_down: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldur s0, [x0, #-4] +; CHECK-NEOVERSE-V1-NEXT: mov z0.s, s0 +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr float, ptr %valp, i32 -1 %val = load float, ptr %valp2 %ins = insertelement undef, float %val, i32 0 @@ -618,11 +940,24 @@ } define @ld1rw_float_unpacked2(ptr %valp) { -; CHECK-LABEL: ld1rw_float_unpacked2: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1rw { z0.d }, p0/z, [x0] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: ld1rw_float_unpacked2: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: ptrue p0.d +; CHECK-LD1R-NEXT: ld1rw { z0.d }, p0/z, [x0] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rw_float_unpacked2: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr s0, [x0] +; CHECK-NO-LD1R-NEXT: mov z0.s, s0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rw_float_unpacked2: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr s0, [x0] +; CHECK-NEOVERSE-V1-NEXT: mov z0.s, s0 +; CHECK-NEOVERSE-V1-NEXT: ret %val = load float, ptr %valp %ins = insertelement undef, float %val, i32 0 %shf = shufflevector %ins, undef, zeroinitializer @@ -630,11 +965,24 @@ } define @ld1rw_float_unpacked2_gep(ptr %valp) { -; CHECK-LABEL: ld1rw_float_unpacked2_gep: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1rw { z0.d }, p0/z, [x0, #252] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: ld1rw_float_unpacked2_gep: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: ptrue p0.d +; CHECK-LD1R-NEXT: ld1rw { z0.d }, p0/z, [x0, #252] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rw_float_unpacked2_gep: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr s0, [x0, #252] +; CHECK-NO-LD1R-NEXT: mov z0.s, s0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rw_float_unpacked2_gep: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr s0, [x0, #252] +; CHECK-NEOVERSE-V1-NEXT: mov z0.s, s0 +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr float, ptr %valp, i32 63 %val = load float, ptr %valp2 %ins = insertelement undef, float %val, i32 0 @@ -643,12 +991,25 @@ } define @ld1rw_float_unpacked2_gep_out_of_range_up(ptr %valp) { -; CHECK-LABEL: ld1rw_float_unpacked2_gep_out_of_range_up: -; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #256 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1rw { z0.d }, p0/z, [x8] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: ld1rw_float_unpacked2_gep_out_of_range_up: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: add x8, x0, #256 +; CHECK-LD1R-NEXT: ptrue p0.d +; CHECK-LD1R-NEXT: ld1rw { z0.d }, p0/z, [x8] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rw_float_unpacked2_gep_out_of_range_up: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr s0, [x0, #256] +; CHECK-NO-LD1R-NEXT: mov z0.s, s0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rw_float_unpacked2_gep_out_of_range_up: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr s0, [x0, #256] +; CHECK-NEOVERSE-V1-NEXT: mov z0.s, s0 +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr float, ptr %valp, i32 64 %val = load float, ptr %valp2 %ins = insertelement undef, float %val, i32 0 @@ -657,12 +1018,25 @@ } define @ld1rw_float_unpacked2_gep_out_of_range_down(ptr %valp) { -; CHECK-LABEL: ld1rw_float_unpacked2_gep_out_of_range_down: -; CHECK: // %bb.0: -; CHECK-NEXT: sub x8, x0, #4 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1rw { z0.d }, p0/z, [x8] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: ld1rw_float_unpacked2_gep_out_of_range_down: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: sub x8, x0, #4 +; CHECK-LD1R-NEXT: ptrue p0.d +; CHECK-LD1R-NEXT: ld1rw { z0.d }, p0/z, [x8] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rw_float_unpacked2_gep_out_of_range_down: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldur s0, [x0, #-4] +; CHECK-NO-LD1R-NEXT: mov z0.s, s0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rw_float_unpacked2_gep_out_of_range_down: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldur s0, [x0, #-4] +; CHECK-NEOVERSE-V1-NEXT: mov z0.s, s0 +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr float, ptr %valp, i32 -1 %val = load float, ptr %valp2 %ins = insertelement undef, float %val, i32 0 @@ -671,11 +1045,24 @@ } define @ld1rd_double(ptr %valp) { -; CHECK-LABEL: ld1rd_double: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: ld1rd_double: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: ptrue p0.d +; CHECK-LD1R-NEXT: ld1rd { z0.d }, p0/z, [x0] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rd_double: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr d0, [x0] +; CHECK-NO-LD1R-NEXT: mov z0.d, d0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rd_double: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr d0, [x0] +; CHECK-NEOVERSE-V1-NEXT: mov z0.d, d0 +; CHECK-NEOVERSE-V1-NEXT: ret %val = load double, ptr %valp %ins = insertelement undef, double %val, i32 0 %shf = shufflevector %ins, undef, zeroinitializer @@ -683,11 +1070,24 @@ } define @ld1rd_double_gep(ptr %valp) { -; CHECK-LABEL: ld1rd_double_gep: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0, #504] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: ld1rd_double_gep: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: ptrue p0.d +; CHECK-LD1R-NEXT: ld1rd { z0.d }, p0/z, [x0, #504] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rd_double_gep: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr d0, [x0, #504] +; CHECK-NO-LD1R-NEXT: mov z0.d, d0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rd_double_gep: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr d0, [x0, #504] +; CHECK-NEOVERSE-V1-NEXT: mov z0.d, d0 +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr double, ptr %valp, i32 63 %val = load double, ptr %valp2 %ins = insertelement undef, double %val, i32 0 @@ -696,12 +1096,25 @@ } define @ld1rd_double_gep_out_of_range_up(ptr %valp) { -; CHECK-LABEL: ld1rd_double_gep_out_of_range_up: -; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #512 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x8] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: ld1rd_double_gep_out_of_range_up: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: add x8, x0, #512 +; CHECK-LD1R-NEXT: ptrue p0.d +; CHECK-LD1R-NEXT: ld1rd { z0.d }, p0/z, [x8] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rd_double_gep_out_of_range_up: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr d0, [x0, #512] +; CHECK-NO-LD1R-NEXT: mov z0.d, d0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rd_double_gep_out_of_range_up: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr d0, [x0, #512] +; CHECK-NEOVERSE-V1-NEXT: mov z0.d, d0 +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr double, ptr %valp, i32 64 %val = load double, ptr %valp2 %ins = insertelement undef, double %val, i32 0 @@ -710,12 +1123,25 @@ } define @ld1rd_double_gep_out_of_range_down(ptr %valp) { -; CHECK-LABEL: ld1rd_double_gep_out_of_range_down: -; CHECK: // %bb.0: -; CHECK-NEXT: sub x8, x0, #8 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x8] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: ld1rd_double_gep_out_of_range_down: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: sub x8, x0, #8 +; CHECK-LD1R-NEXT: ptrue p0.d +; CHECK-LD1R-NEXT: ld1rd { z0.d }, p0/z, [x8] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: ld1rd_double_gep_out_of_range_down: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldur d0, [x0, #-8] +; CHECK-NO-LD1R-NEXT: mov z0.d, d0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: ld1rd_double_gep_out_of_range_down: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldur d0, [x0, #-8] +; CHECK-NEOVERSE-V1-NEXT: mov z0.d, d0 +; CHECK-NEOVERSE-V1-NEXT: ret %valp2 = getelementptr double, ptr %valp, i32 -1 %val = load double, ptr %valp2 %ins = insertelement undef, double %val, i32 0 @@ -990,37 +1416,89 @@ ret %res } define @dup_ld1rh_half_passthruundef_nxv8f16( %pg, ptr %addr) { -; CHECK-LABEL: dup_ld1rh_half_passthruundef_nxv8f16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: dup_ld1rh_half_passthruundef_nxv8f16: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: ld1rh { z0.h }, p0/z, [x0] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: dup_ld1rh_half_passthruundef_nxv8f16: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr h0, [x0] +; CHECK-NO-LD1R-NEXT: mov z0.h, p0/m, h0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: dup_ld1rh_half_passthruundef_nxv8f16: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr h0, [x0] +; CHECK-NEOVERSE-V1-NEXT: mov z0.h, p0/m, h0 +; CHECK-NEOVERSE-V1-NEXT: ret %ld = load half, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv8f16( undef, %pg, half %ld) ret %res } define @dup_ld1rs_float_passthruundef_nxv4f32( %pg, ptr %addr) { -; CHECK-LABEL: dup_ld1rs_float_passthruundef_nxv4f32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: dup_ld1rs_float_passthruundef_nxv4f32: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: ld1rw { z0.s }, p0/z, [x0] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: dup_ld1rs_float_passthruundef_nxv4f32: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr s0, [x0] +; CHECK-NO-LD1R-NEXT: mov z0.s, p0/m, s0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: dup_ld1rs_float_passthruundef_nxv4f32: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr s0, [x0] +; CHECK-NEOVERSE-V1-NEXT: mov z0.s, p0/m, s0 +; CHECK-NEOVERSE-V1-NEXT: ret %ld = load float, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv4f32( undef, %pg, float %ld) ret %res } define @dup_ld1rd_double_passthruundef_nxv2f64( %pg, ptr %addr) { -; CHECK-LABEL: dup_ld1rd_double_passthruundef_nxv2f64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: dup_ld1rd_double_passthruundef_nxv2f64: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: ld1rd { z0.d }, p0/z, [x0] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: dup_ld1rd_double_passthruundef_nxv2f64: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr d0, [x0] +; CHECK-NO-LD1R-NEXT: mov z0.d, p0/m, d0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: dup_ld1rd_double_passthruundef_nxv2f64: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr d0, [x0] +; CHECK-NEOVERSE-V1-NEXT: mov z0.d, p0/m, d0 +; CHECK-NEOVERSE-V1-NEXT: ret %ld = load double, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv2f64( undef, %pg, double %ld) ret %res } define @dup_ld1rh_half_passthruundef_nxv4f16( %pg, ptr %addr) { -; CHECK-LABEL: dup_ld1rh_half_passthruundef_nxv4f16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld1rh { z0.s }, p0/z, [x0] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: dup_ld1rh_half_passthruundef_nxv4f16: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: ld1rh { z0.s }, p0/z, [x0] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: dup_ld1rh_half_passthruundef_nxv4f16: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr h0, [x0] +; CHECK-NO-LD1R-NEXT: mov z0.h, p0/m, h0 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: dup_ld1rh_half_passthruundef_nxv4f16: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr h0, [x0] +; CHECK-NEOVERSE-V1-NEXT: mov z0.h, p0/m, h0 +; CHECK-NEOVERSE-V1-NEXT: ret %ld = load half, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv4f16( undef, %pg, half %ld) ret %res @@ -1062,55 +1540,145 @@ ret %res } define @dup_ld1rh_half_passthruzero_nxv8f16( %pg, ptr %addr) { -; CHECK-LABEL: dup_ld1rh_half_passthruzero_nxv8f16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv8f16: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: ld1rh { z0.h }, p0/z, [x0] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv8f16: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr h1, [x0] +; CHECK-NO-LD1R-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NO-LD1R-NEXT: mov z0.h, p0/m, h1 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: dup_ld1rh_half_passthruzero_nxv8f16: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr h1, [x0] +; CHECK-NEOVERSE-V1-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEOVERSE-V1-NEXT: mov z0.h, p0/m, h1 +; CHECK-NEOVERSE-V1-NEXT: ret %ld = load half, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv8f16( zeroinitializer, %pg, half %ld) ret %res } define @dup_ld1rs_float_passthruzero_nxv4f32( %pg, ptr %addr) { -; CHECK-LABEL: dup_ld1rs_float_passthruzero_nxv4f32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: dup_ld1rs_float_passthruzero_nxv4f32: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: ld1rw { z0.s }, p0/z, [x0] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: dup_ld1rs_float_passthruzero_nxv4f32: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr s1, [x0] +; CHECK-NO-LD1R-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NO-LD1R-NEXT: mov z0.s, p0/m, s1 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: dup_ld1rs_float_passthruzero_nxv4f32: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr s1, [x0] +; CHECK-NEOVERSE-V1-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEOVERSE-V1-NEXT: mov z0.s, p0/m, s1 +; CHECK-NEOVERSE-V1-NEXT: ret %ld = load float, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv4f32( zeroinitializer, %pg, float %ld) ret %res } define @dup_ld1rd_double_passthruzero_nxv2f64( %pg, ptr %addr) { -; CHECK-LABEL: dup_ld1rd_double_passthruzero_nxv2f64: -; CHECK: // %bb.0: -; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: dup_ld1rd_double_passthruzero_nxv2f64: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: ld1rd { z0.d }, p0/z, [x0] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: dup_ld1rd_double_passthruzero_nxv2f64: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr d1, [x0] +; CHECK-NO-LD1R-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NO-LD1R-NEXT: mov z0.d, p0/m, d1 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: dup_ld1rd_double_passthruzero_nxv2f64: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr d1, [x0] +; CHECK-NEOVERSE-V1-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEOVERSE-V1-NEXT: mov z0.d, p0/m, d1 +; CHECK-NEOVERSE-V1-NEXT: ret %ld = load double, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv2f64( zeroinitializer, %pg, double %ld) ret %res } define @dup_ld1rh_half_passthruzero_nxv4f16( %pg, ptr %addr) { -; CHECK-LABEL: dup_ld1rh_half_passthruzero_nxv4f16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld1rh { z0.s }, p0/z, [x0] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv4f16: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: ld1rh { z0.s }, p0/z, [x0] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv4f16: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr h1, [x0] +; CHECK-NO-LD1R-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NO-LD1R-NEXT: mov z0.h, p0/m, h1 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: dup_ld1rh_half_passthruzero_nxv4f16: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr h1, [x0] +; CHECK-NEOVERSE-V1-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEOVERSE-V1-NEXT: mov z0.h, p0/m, h1 +; CHECK-NEOVERSE-V1-NEXT: ret %ld = load half, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv4f16( zeroinitializer, %pg, half %ld) ret %res } define @dup_ld1rh_half_passthruzero_nxv2f16( %pg, ptr %addr) { -; CHECK-LABEL: dup_ld1rh_half_passthruzero_nxv2f16: -; CHECK: // %bb.0: -; CHECK-NEXT: ld1rh { z0.d }, p0/z, [x0] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv2f16: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: ld1rh { z0.d }, p0/z, [x0] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv2f16: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr h1, [x0] +; CHECK-NO-LD1R-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NO-LD1R-NEXT: mov z0.h, p0/m, h1 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: dup_ld1rh_half_passthruzero_nxv2f16: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr h1, [x0] +; CHECK-NEOVERSE-V1-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEOVERSE-V1-NEXT: mov z0.h, p0/m, h1 +; CHECK-NEOVERSE-V1-NEXT: ret %ld = load half, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv2f16( zeroinitializer, %pg, half %ld) ret %res } define @dup_ld1rs_float_passthruzero_nxv2f32( %pg, ptr %addr) { -; CHECK-LABEL: dup_ld1rs_float_passthruzero_nxv2f32: -; CHECK: // %bb.0: -; CHECK-NEXT: ld1rw { z0.d }, p0/z, [x0] -; CHECK-NEXT: ret +; +; CHECK-LD1R-LABEL: dup_ld1rs_float_passthruzero_nxv2f32: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: ld1rw { z0.d }, p0/z, [x0] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: dup_ld1rs_float_passthruzero_nxv2f32: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldr s1, [x0] +; CHECK-NO-LD1R-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NO-LD1R-NEXT: mov z0.s, p0/m, s1 +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: dup_ld1rs_float_passthruzero_nxv2f32: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldr s1, [x0] +; CHECK-NEOVERSE-V1-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEOVERSE-V1-NEXT: mov z0.s, p0/m, s1 +; CHECK-NEOVERSE-V1-NEXT: ret %ld = load float, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv2f32( zeroinitializer, %pg, float %ld) ret %res @@ -1209,14 +1777,32 @@ ; Check that a load consumed by a scalable splat prefers a replicating ; load over a pre-indexed load. define i8* @avoid_preindex_load_dup(i8* %src, %pg, * %out) { -; CHECK-LABEL: avoid_preindex_load_dup: -; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #1 -; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1] -; CHECK-NEXT: mov x0, x8 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: st1d { z0.d }, p0, [x1] -; CHECK-NEXT: ret +; CHECK-LD1R-LABEL: avoid_preindex_load_dup: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: add x8, x0, #1 +; CHECK-LD1R-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1] +; CHECK-LD1R-NEXT: mov x0, x8 +; CHECK-LD1R-NEXT: ptrue p0.d +; CHECK-LD1R-NEXT: st1d { z0.d }, p0, [x1] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: avoid_preindex_load_dup: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: add x8, x0, #1 +; CHECK-NO-LD1R-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1] +; CHECK-NO-LD1R-NEXT: mov x0, x8 +; CHECK-NO-LD1R-NEXT: ptrue p0.d +; CHECK-NO-LD1R-NEXT: st1d { z0.d }, p0, [x1] +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: avoid_preindex_load_dup: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1] +; CHECK-NEOVERSE-V1-NEXT: add x8, x0, #1 +; CHECK-NEOVERSE-V1-NEXT: ptrue p0.d +; CHECK-NEOVERSE-V1-NEXT: mov x0, x8 +; CHECK-NEOVERSE-V1-NEXT: st1d { z0.d }, p0, [x1] +; CHECK-NEOVERSE-V1-NEXT: ret %ptr = getelementptr inbounds i8, i8* %src, i64 1 %tmp = load i8, i8* %ptr, align 4 %ext = sext i8 %tmp to i64 @@ -1227,14 +1813,32 @@ ; Same as avoid_preindex_load_dup, but with zero passthru. define i8* @avoid_preindex_load_dup_passthru_zero(i8* %src, %pg, * %out) { -; CHECK-LABEL: avoid_preindex_load_dup_passthru_zero: -; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #1 -; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1] -; CHECK-NEXT: mov x0, x8 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: st1d { z0.d }, p0, [x1] -; CHECK-NEXT: ret +; CHECK-LD1R-LABEL: avoid_preindex_load_dup_passthru_zero: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: add x8, x0, #1 +; CHECK-LD1R-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1] +; CHECK-LD1R-NEXT: mov x0, x8 +; CHECK-LD1R-NEXT: ptrue p0.d +; CHECK-LD1R-NEXT: st1d { z0.d }, p0, [x1] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: avoid_preindex_load_dup_passthru_zero: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: add x8, x0, #1 +; CHECK-NO-LD1R-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1] +; CHECK-NO-LD1R-NEXT: mov x0, x8 +; CHECK-NO-LD1R-NEXT: ptrue p0.d +; CHECK-NO-LD1R-NEXT: st1d { z0.d }, p0, [x1] +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: avoid_preindex_load_dup_passthru_zero: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1] +; CHECK-NEOVERSE-V1-NEXT: add x8, x0, #1 +; CHECK-NEOVERSE-V1-NEXT: ptrue p0.d +; CHECK-NEOVERSE-V1-NEXT: mov x0, x8 +; CHECK-NEOVERSE-V1-NEXT: st1d { z0.d }, p0, [x1] +; CHECK-NEOVERSE-V1-NEXT: ret %ptr = getelementptr inbounds i8, i8* %src, i64 1 %tmp = load i8, i8* %ptr, align 4 %ext = sext i8 %tmp to i64 @@ -1245,13 +1849,29 @@ ; If a dup has a non-undef passthru, stick with the pre-indexed load. define i8* @preindex_load_dup_passthru( %passthru, i8* %src, %pg, * %out) { -; CHECK-LABEL: preindex_load_dup_passthru: -; CHECK: // %bb.0: -; CHECK-NEXT: ldrsb x8, [x0, #1]! -; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: mov z0.d, p0/m, x8 -; CHECK-NEXT: st1d { z0.d }, p1, [x1] -; CHECK-NEXT: ret +; CHECK-LD1R-LABEL: preindex_load_dup_passthru: +; CHECK-LD1R: // %bb.0: +; CHECK-LD1R-NEXT: ldrsb x8, [x0, #1]! +; CHECK-LD1R-NEXT: ptrue p1.d +; CHECK-LD1R-NEXT: mov z0.d, p0/m, x8 +; CHECK-LD1R-NEXT: st1d { z0.d }, p1, [x1] +; CHECK-LD1R-NEXT: ret +; +; CHECK-NO-LD1R-LABEL: preindex_load_dup_passthru: +; CHECK-NO-LD1R: // %bb.0: +; CHECK-NO-LD1R-NEXT: ldrsb x8, [x0, #1]! +; CHECK-NO-LD1R-NEXT: ptrue p1.d +; CHECK-NO-LD1R-NEXT: mov z0.d, p0/m, x8 +; CHECK-NO-LD1R-NEXT: st1d { z0.d }, p1, [x1] +; CHECK-NO-LD1R-NEXT: ret +; +; CHECK-NEOVERSE-V1-LABEL: preindex_load_dup_passthru: +; CHECK-NEOVERSE-V1: // %bb.0: +; CHECK-NEOVERSE-V1-NEXT: ldrsb x8, [x0, #1]! +; CHECK-NEOVERSE-V1-NEXT: mov z0.d, p0/m, x8 +; CHECK-NEOVERSE-V1-NEXT: ptrue p0.d +; CHECK-NEOVERSE-V1-NEXT: st1d { z0.d }, p0, [x1] +; CHECK-NEOVERSE-V1-NEXT: ret %ptr = getelementptr inbounds i8, i8* %src, i64 1 %tmp = load i8, i8* %ptr, align 4 %ext = sext i8 %tmp to i64