diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -148,6 +148,9 @@ def FeatureUseScalarIncVL : SubtargetFeature<"use-scalar-inc-vl", "UseScalarIncVL", "true", "Prefer inc/dec over add+cnt">; +def FeatureAvoidLD1R : SubtargetFeature<"avoid-ld1r", + "AvoidLD1R", "true", "Prefer LDR(LDP)+MOV over LD1RX">; + def FeatureSVE2 : SubtargetFeature<"sve2", "HasSVE2", "true", "Enable Scalable Vector Extension 2 (SVE2) instructions (FEAT_SVE2)", [FeatureSVE, FeatureUseScalarIncVL]>; @@ -1137,7 +1140,8 @@ FeatureLSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, - FeaturePredictableSelectIsExpensive]>; + FeaturePredictableSelectIsExpensive, + FeatureAvoidLD1R]>; def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2", "Neoverse V2 ARM processors", [ diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -262,6 +262,8 @@ def UseScalarIncVL : Predicate<"Subtarget->useScalarIncVL()">; +def UseLD1R : Predicate<"!Subtarget->avoidLD1R()">; + def IsNeonAvailable : Predicate<"Subtarget->isNeonAvailable()">; def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER", diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2331,37 +2331,39 @@ } } - // LDR1 of 8-bit data - defm : LD1RPat; - defm : LD1RPat; - defm : LD1RPat; - defm : LD1RPat; - defm : LD1RPat; - defm : LD1RPat; - defm : LD1RPat; - - // LDR1 of 16-bit data - defm : LD1RPat; - defm : LD1RPat; - defm : LD1RPat; - defm : LD1RPat; - defm : LD1RPat; - - // LDR1 of 32-bit data - defm : LD1RPat; - defm : LD1RPat; - defm : LD1RPat; - - // LDR1 of 64-bit data - defm : LD1RPat; 
- - // LD1R of FP data - defm : LD1RPat; - defm : LD1RPat; - defm : LD1RPat; - defm : LD1RPat; - defm : LD1RPat; - defm : LD1RPat; + let Predicates = [UseLD1R] in { + // LDR1 of 8-bit data + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + + // LDR1 of 16-bit data + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + + // LDR1 of 32-bit data + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + + // LDR1 of 64-bit data + defm : LD1RPat; + + // LD1R of FP data + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + } // LD1R of 128-bit masked data multiclass ld1rq_pat{ diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -7203,6 +7203,10 @@ def : Pat<(nxv8f16 (op nxv8i1:$pg, f16:$splat, nxv8f16:$passthru)), (!cast(NAME # _H) $passthru, $pg, $splat)>; + def : Pat<(nxv4f16 (op nxv4i1:$pg, f16:$splat, nxv4f16:$passthru)), + (!cast(NAME # _H) $passthru, $pg, $splat)>; + def : Pat<(nxv2f16 (op nxv2i1:$pg, f16:$splat, nxv2f16:$passthru)), + (!cast(NAME # _H) $passthru, $pg, $splat)>; def : Pat<(nxv2f32 (op nxv2i1:$pg, f32:$splat, nxv2f32:$passthru)), (!cast(NAME # _S) $passthru, $pg, $splat)>; def : Pat<(nxv4f32 (op nxv4i1:$pg, f32:$splat, nxv4f32:$passthru)), diff --git a/llvm/test/CodeGen/AArch64/sve-ld1r.ll b/llvm/test/CodeGen/AArch64/sve-ld1r.ll --- a/llvm/test/CodeGen/AArch64/sve-ld1r.ll +++ b/llvm/test/CodeGen/AArch64/sve-ld1r.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+avoid-ld1r < %s | FileCheck %s --check-prefixes=CHECK-AVOID-LD1R ; ; Check that ldr1* instruction is generated 
to splat scalar during load, ; rather than mov from scalar to vector register (which would require the vector unit). @@ -27,6 +28,19 @@ ; CHECK-NEXT: ld1rb { z0.b }, p0/z, [sp, #14] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1r_stack: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: sub sp, sp, #16 +; CHECK-AVOID-LD1R-NEXT: .cfi_def_cfa_offset 16 +; CHECK-AVOID-LD1R-NEXT: adrp x8, :got:g8 +; CHECK-AVOID-LD1R-NEXT: ldr x8, [x8, :got_lo12:g8] +; CHECK-AVOID-LD1R-NEXT: ldrb w8, [x8] +; CHECK-AVOID-LD1R-NEXT: strb w8, [sp, #12] +; CHECK-AVOID-LD1R-NEXT: ldrb w8, [sp, #14] +; CHECK-AVOID-LD1R-NEXT: mov z0.b, w8 +; CHECK-AVOID-LD1R-NEXT: add sp, sp, #16 +; CHECK-AVOID-LD1R-NEXT: ret %valp = alloca i8 %valp2 = load volatile i8, ptr @g8 store volatile i8 %valp2, ptr %valp @@ -43,6 +57,12 @@ ; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rb: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrb w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.b, w8 +; CHECK-AVOID-LD1R-NEXT: ret %val = load i8, ptr %valp %ins = insertelement undef, i8 %val, i32 0 %shf = shufflevector %ins, undef, zeroinitializer @@ -55,6 +75,12 @@ ; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0, #63] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rb_gep: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrb w8, [x0, #63] +; CHECK-AVOID-LD1R-NEXT: mov z0.b, w8 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr i8, ptr %valp, i32 63 %val = load i8, ptr %valp2 %ins = insertelement undef, i8 %val, i32 0 @@ -69,6 +95,12 @@ ; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x8] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rb_gep_out_of_range_up: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrb w8, [x0, #64] +; CHECK-AVOID-LD1R-NEXT: mov z0.b, w8 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr i8, ptr %valp, i32 64 %val = load i8, ptr 
%valp2 %ins = insertelement undef, i8 %val, i32 0 @@ -83,6 +115,12 @@ ; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x8] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rb_gep_out_of_range_down: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldurb w8, [x0, #-1] +; CHECK-AVOID-LD1R-NEXT: mov z0.b, w8 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr i8, ptr %valp, i32 -1 %val = load i8, ptr %valp2 %ins = insertelement undef, i8 %val, i32 0 @@ -96,6 +134,12 @@ ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ld1rb { z0.h }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rb_i8_i16_zext: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrb w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, w8 +; CHECK-AVOID-LD1R-NEXT: ret %val = load i8, ptr %valp %ext = zext i8 %val to i16 %ins = insertelement undef, i16 %ext, i32 0 @@ -109,6 +153,12 @@ ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ld1rsb { z0.h }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rb_i8_i16_sext: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrsb w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, w8 +; CHECK-AVOID-LD1R-NEXT: ret %val = load i8, ptr %valp %ext = sext i8 %val to i16 %ins = insertelement undef, i16 %ext, i32 0 @@ -122,6 +172,12 @@ ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1rb { z0.s }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rb_i8_i32_zext: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrb w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, w8 +; CHECK-AVOID-LD1R-NEXT: ret %val = load i8, ptr %valp %ext = zext i8 %val to i32 %ins = insertelement undef, i32 %ext, i32 0 @@ -135,6 +191,12 @@ ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1rsb { z0.s }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rb_i8_i32_sext: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrsb w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, w8 +; CHECK-AVOID-LD1R-NEXT: ret %val = load i8, ptr %valp %ext = sext i8 %val to i32 
%ins = insertelement undef, i32 %ext, i32 0 @@ -148,6 +210,12 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rb { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rb_i8_i64_zext: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrb w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.d, x8 +; CHECK-AVOID-LD1R-NEXT: ret %val = load i8, ptr %valp %ext = zext i8 %val to i64 %ins = insertelement undef, i64 %ext, i32 0 @@ -161,6 +229,12 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rb_i8_i64_sext: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrsb x8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.d, x8 +; CHECK-AVOID-LD1R-NEXT: ret %val = load i8, ptr %valp %ext = sext i8 %val to i64 %ins = insertelement undef, i64 %ext, i32 0 @@ -174,6 +248,12 @@ ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rh: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrh w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, w8 +; CHECK-AVOID-LD1R-NEXT: ret %val = load i16, ptr %valp %ins = insertelement undef, i16 %val, i32 0 %shf = shufflevector %ins, undef, zeroinitializer @@ -186,6 +266,12 @@ ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0, #126] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rh_gep: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrh w8, [x0, #126] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, w8 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr i16, ptr %valp, i32 63 %val = load i16, ptr %valp2 %ins = insertelement undef, i16 %val, i32 0 @@ -200,6 +286,12 @@ ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x8] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rh_gep_out_of_range_up: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrh w8, [x0, #128] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, w8 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr i16, ptr %valp, i32 
64 %val = load i16, ptr %valp2 %ins = insertelement undef, i16 %val, i32 0 @@ -214,6 +306,12 @@ ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x8] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rh_gep_out_of_range_down: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldurh w8, [x0, #-2] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, w8 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr i16, ptr %valp, i32 -1 %val = load i16, ptr %valp2 %ins = insertelement undef, i16 %val, i32 0 @@ -227,6 +325,12 @@ ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1rh { z0.s }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rh_i16_i32_zext: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrh w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, w8 +; CHECK-AVOID-LD1R-NEXT: ret %val = load i16, ptr %valp %ext = zext i16 %val to i32 %ins = insertelement undef, i32 %ext, i32 0 @@ -240,6 +344,12 @@ ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1rsh { z0.s }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rh_i16_i32_sext: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrsh w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, w8 +; CHECK-AVOID-LD1R-NEXT: ret %val = load i16, ptr %valp %ext = sext i16 %val to i32 %ins = insertelement undef, i32 %ext, i32 0 @@ -253,6 +363,12 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rh { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rh_i16_i64_zext: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrh w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.d, x8 +; CHECK-AVOID-LD1R-NEXT: ret %val = load i16, ptr %valp %ext = zext i16 %val to i64 %ins = insertelement undef, i64 %ext, i32 0 @@ -266,6 +382,12 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rsh { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rh_i16_i64_sext: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrsh x8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.d, x8 +; CHECK-AVOID-LD1R-NEXT: ret %val = load 
i16, ptr %valp %ext = sext i16 %val to i64 %ins = insertelement undef, i64 %ext, i32 0 @@ -279,6 +401,12 @@ ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rw: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, w8 +; CHECK-AVOID-LD1R-NEXT: ret %val = load i32, ptr %valp %ins = insertelement undef, i32 %val, i32 0 %shf = shufflevector %ins, undef, zeroinitializer @@ -291,6 +419,12 @@ ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0, #252] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rw_gep: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr w8, [x0, #252] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, w8 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr i32, ptr %valp, i32 63 %val = load i32, ptr %valp2 %ins = insertelement undef, i32 %val, i32 0 @@ -305,6 +439,12 @@ ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x8] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rw_gep_out_of_range_up: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr w8, [x0, #256] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, w8 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr i32, ptr %valp, i32 64 %val = load i32, ptr %valp2 %ins = insertelement undef, i32 %val, i32 0 @@ -319,6 +459,12 @@ ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x8] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rw_gep_out_of_range_down: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldur w8, [x0, #-4] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, w8 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr i32, ptr %valp, i32 -1 %val = load i32, ptr %valp2 %ins = insertelement undef, i32 %val, i32 0 @@ -332,6 +478,12 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rw { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rw_i32_i64_zext: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr w8, [x0] +; 
CHECK-AVOID-LD1R-NEXT: mov z0.d, x8 +; CHECK-AVOID-LD1R-NEXT: ret %val = load i32, ptr %valp %ext = zext i32 %val to i64 %ins = insertelement undef, i64 %ext, i32 0 @@ -345,6 +497,12 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rsw { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rw_i32_i64_sext: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrsw x8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.d, x8 +; CHECK-AVOID-LD1R-NEXT: ret %val = load i32, ptr %valp %ext = sext i32 %val to i64 %ins = insertelement undef, i64 %ext, i32 0 @@ -358,6 +516,12 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rd: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr x8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.d, x8 +; CHECK-AVOID-LD1R-NEXT: ret %val = load i64, ptr %valp %ins = insertelement undef, i64 %val, i32 0 %shf = shufflevector %ins, undef, zeroinitializer @@ -370,6 +534,12 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0, #504] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rd_gep: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr x8, [x0, #504] +; CHECK-AVOID-LD1R-NEXT: mov z0.d, x8 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr i64, ptr %valp, i32 63 %val = load i64, ptr %valp2 %ins = insertelement undef, i64 %val, i32 0 @@ -384,6 +554,12 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x8] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rd_gep_out_of_range_up: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr x8, [x0, #512] +; CHECK-AVOID-LD1R-NEXT: mov z0.d, x8 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr i64, ptr %valp, i32 64 %val = load i64, ptr %valp2 %ins = insertelement undef, i64 %val, i32 0 @@ -398,6 +574,12 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x8] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rd_gep_out_of_range_down: +; CHECK-AVOID-LD1R: // %bb.0: 
+; CHECK-AVOID-LD1R-NEXT: ldur x8, [x0, #-8] +; CHECK-AVOID-LD1R-NEXT: mov z0.d, x8 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr i64, ptr %valp, i32 -1 %val = load i64, ptr %valp2 %ins = insertelement undef, i64 %val, i32 0 @@ -411,6 +593,12 @@ ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rh_half: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr h0, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, h0 +; CHECK-AVOID-LD1R-NEXT: ret %val = load half, ptr %valp %ins = insertelement undef, half %val, i32 0 %shf = shufflevector %ins, undef, zeroinitializer @@ -423,6 +611,12 @@ ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0, #126] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rh_half_gep: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr h0, [x0, #126] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, h0 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr half, ptr %valp, i32 63 %val = load half, ptr %valp2 %ins = insertelement undef, half %val, i32 0 @@ -437,6 +631,12 @@ ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x8] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rh_half_gep_out_of_range_up: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr h0, [x0, #128] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, h0 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr half, ptr %valp, i32 64 %val = load half, ptr %valp2 %ins = insertelement undef, half %val, i32 0 @@ -451,6 +651,12 @@ ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x8] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rh_half_gep_out_of_range_down: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldur h0, [x0, #-2] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, h0 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr half, ptr %valp, i32 -1 %val = load half, ptr %valp2 %ins = insertelement undef, half %val, i32 0 @@ -464,6 +670,12 @@ ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1rh { 
z0.s }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rh_half_unpacked4: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr h0, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, h0 +; CHECK-AVOID-LD1R-NEXT: ret %val = load half, ptr %valp %ins = insertelement undef, half %val, i32 0 %shf = shufflevector %ins, undef, zeroinitializer @@ -476,6 +688,12 @@ ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1rh { z0.s }, p0/z, [x0, #126] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rh_half_unpacked4_gep: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr h0, [x0, #126] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, h0 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr half, ptr %valp, i32 63 %val = load half, ptr %valp2 %ins = insertelement undef, half %val, i32 0 @@ -490,6 +708,12 @@ ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1rh { z0.s }, p0/z, [x8] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rh_half_unpacked4_gep_out_of_range_up: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr h0, [x0, #128] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, h0 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr half, ptr %valp, i32 64 %val = load half, ptr %valp2 %ins = insertelement undef, half %val, i32 0 @@ -504,6 +728,12 @@ ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1rh { z0.s }, p0/z, [x8] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rh_half_unpacked4_gep_out_of_range_down: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldur h0, [x0, #-2] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, h0 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr half, ptr %valp, i32 -1 %val = load half, ptr %valp2 %ins = insertelement undef, half %val, i32 0 @@ -517,6 +747,12 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rh { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rh_half_unpacked2: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr h0, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, h0 +; CHECK-AVOID-LD1R-NEXT: ret %val = load half, ptr %valp 
%ins = insertelement undef, half %val, i32 0 %shf = shufflevector %ins, undef, zeroinitializer @@ -529,6 +765,12 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rh { z0.d }, p0/z, [x0, #126] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rh_half_unpacked2_gep: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr h0, [x0, #126] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, h0 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr half, ptr %valp, i32 63 %val = load half, ptr %valp2 %ins = insertelement undef, half %val, i32 0 @@ -543,6 +785,12 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rh { z0.d }, p0/z, [x8] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rh_half_unpacked2_gep_out_of_range_up: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr h0, [x0, #128] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, h0 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr half, ptr %valp, i32 64 %val = load half, ptr %valp2 %ins = insertelement undef, half %val, i32 0 @@ -557,6 +805,12 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rh { z0.d }, p0/z, [x8] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rh_half_unpacked2_gep_out_of_range_down: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldur h0, [x0, #-2] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, h0 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr half, ptr %valp, i32 -1 %val = load half, ptr %valp2 %ins = insertelement undef, half %val, i32 0 @@ -570,6 +824,12 @@ ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rw_float: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr s0, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, s0 +; CHECK-AVOID-LD1R-NEXT: ret %val = load float, ptr %valp %ins = insertelement undef, float %val, i32 0 %shf = shufflevector %ins, undef, zeroinitializer @@ -582,6 +842,12 @@ ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0, #252] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rw_float_gep: +; 
CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr s0, [x0, #252] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, s0 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr float, ptr %valp, i32 63 %val = load float, ptr %valp2 %ins = insertelement undef, float %val, i32 0 @@ -596,6 +862,12 @@ ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x8] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rw_float_gep_out_of_range_up: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr s0, [x0, #256] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, s0 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr float, ptr %valp, i32 64 %val = load float, ptr %valp2 %ins = insertelement undef, float %val, i32 0 @@ -610,6 +882,12 @@ ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x8] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rw_float_gep_out_of_range_down: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldur s0, [x0, #-4] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, s0 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr float, ptr %valp, i32 -1 %val = load float, ptr %valp2 %ins = insertelement undef, float %val, i32 0 @@ -623,6 +901,12 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rw { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rw_float_unpacked2: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr s0, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, s0 +; CHECK-AVOID-LD1R-NEXT: ret %val = load float, ptr %valp %ins = insertelement undef, float %val, i32 0 %shf = shufflevector %ins, undef, zeroinitializer @@ -635,6 +919,12 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rw { z0.d }, p0/z, [x0, #252] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rw_float_unpacked2_gep: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr s0, [x0, #252] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, s0 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr float, ptr %valp, i32 63 %val = load float, ptr %valp2 %ins = insertelement undef, float %val, i32 
0 @@ -649,6 +939,12 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rw { z0.d }, p0/z, [x8] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rw_float_unpacked2_gep_out_of_range_up: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr s0, [x0, #256] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, s0 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr float, ptr %valp, i32 64 %val = load float, ptr %valp2 %ins = insertelement undef, float %val, i32 0 @@ -663,6 +959,12 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rw { z0.d }, p0/z, [x8] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rw_float_unpacked2_gep_out_of_range_down: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldur s0, [x0, #-4] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, s0 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr float, ptr %valp, i32 -1 %val = load float, ptr %valp2 %ins = insertelement undef, float %val, i32 0 @@ -676,6 +978,12 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rd_double: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr d0, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.d, d0 +; CHECK-AVOID-LD1R-NEXT: ret %val = load double, ptr %valp %ins = insertelement undef, double %val, i32 0 %shf = shufflevector %ins, undef, zeroinitializer @@ -688,6 +996,12 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0, #504] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rd_double_gep: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr d0, [x0, #504] +; CHECK-AVOID-LD1R-NEXT: mov z0.d, d0 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr double, ptr %valp, i32 63 %val = load double, ptr %valp2 %ins = insertelement undef, double %val, i32 0 @@ -702,6 +1016,12 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x8] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rd_double_gep_out_of_range_up: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr d0, [x0, #512] +; 
CHECK-AVOID-LD1R-NEXT: mov z0.d, d0 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr double, ptr %valp, i32 64 %val = load double, ptr %valp2 %ins = insertelement undef, double %val, i32 0 @@ -716,6 +1036,12 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x8] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: ld1rd_double_gep_out_of_range_down: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldur d0, [x0, #-8] +; CHECK-AVOID-LD1R-NEXT: mov z0.d, d0 +; CHECK-AVOID-LD1R-NEXT: ret %valp2 = getelementptr double, ptr %valp, i32 -1 %val = load double, ptr %valp2 %ins = insertelement undef, double %val, i32 0 @@ -729,6 +1055,12 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dupq_ld1rqd_f64: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ptrue p0.d +; CHECK-AVOID-LD1R-NEXT: ld1rqd { z0.d }, p0/z, [x0] +; CHECK-AVOID-LD1R-NEXT: ret %1 = load <2 x double>, ptr %a %2 = tail call fast @llvm.vector.insert.nxv2f64.v2f64( undef, <2 x double> %1, i64 0) %3 = tail call fast @llvm.aarch64.sve.dupq.lane.nxv2f64( %2, i64 0) @@ -741,6 +1073,12 @@ ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dupq_ld1rqw_f32: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ptrue p0.s +; CHECK-AVOID-LD1R-NEXT: ld1rqw { z0.s }, p0/z, [x0] +; CHECK-AVOID-LD1R-NEXT: ret %1 = load <4 x float>, ptr %a %2 = tail call fast @llvm.vector.insert.nxv4f32.v4f32( undef, <4 x float> %1, i64 0) %3 = tail call fast @llvm.aarch64.sve.dupq.lane.nxv4f32( %2, i64 0) @@ -753,6 +1091,12 @@ ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dupq_ld1rqh_f16: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ptrue p0.h +; CHECK-AVOID-LD1R-NEXT: ld1rqh { z0.h }, p0/z, [x0] +; CHECK-AVOID-LD1R-NEXT: ret %1 = load <8 x half>, ptr %a %2 = tail call fast 
@llvm.vector.insert.nxv8f16.v8f16( undef, <8 x half> %1, i64 0) %3 = tail call fast @llvm.aarch64.sve.dupq.lane.nxv8f16( %2, i64 0) @@ -765,6 +1109,12 @@ ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dupq_ld1rqh_bf16: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ptrue p0.h +; CHECK-AVOID-LD1R-NEXT: ld1rqh { z0.h }, p0/z, [x0] +; CHECK-AVOID-LD1R-NEXT: ret %1 = load <8 x bfloat>, ptr %a %2 = tail call fast @llvm.vector.insert.nxv8bf16.v8bf16( undef, <8 x bfloat> %1, i64 0) %3 = tail call fast @llvm.aarch64.sve.dupq.lane.nxv8bf16( %2, i64 0) @@ -777,6 +1127,12 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dupq_ld1rqd_i64: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ptrue p0.d +; CHECK-AVOID-LD1R-NEXT: ld1rqd { z0.d }, p0/z, [x0] +; CHECK-AVOID-LD1R-NEXT: ret %1 = load <2 x i64>, ptr %a %2 = tail call @llvm.vector.insert.nxv2i64.v2i64( undef, <2 x i64> %1, i64 0) %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv2i64( %2, i64 0) @@ -789,6 +1145,12 @@ ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dupq_ld1rqw_i32: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ptrue p0.s +; CHECK-AVOID-LD1R-NEXT: ld1rqw { z0.s }, p0/z, [x0] +; CHECK-AVOID-LD1R-NEXT: ret %1 = load <4 x i32>, ptr %a %2 = tail call @llvm.vector.insert.nxv4i32.v4i32( undef, <4 x i32> %1, i64 0) %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv4i32( %2, i64 0) @@ -801,6 +1163,12 @@ ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dupq_ld1rqw_i16: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ptrue p0.h +; CHECK-AVOID-LD1R-NEXT: ld1rqh { z0.h }, p0/z, [x0] +; CHECK-AVOID-LD1R-NEXT: ret %1 = load <8 x i16>, ptr %a %2 = tail call @llvm.vector.insert.nxv8i16.v8i16( undef, <8 x i16> %1, i64 0) %3 = tail 
call @llvm.aarch64.sve.dupq.lane.nxv8i16( %2, i64 0) @@ -813,6 +1181,12 @@ ; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dupq_ld1rqw_i8: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ptrue p0.b +; CHECK-AVOID-LD1R-NEXT: ld1rqb { z0.b }, p0/z, [x0] +; CHECK-AVOID-LD1R-NEXT: ret %1 = load <16 x i8>, ptr %a %2 = tail call @llvm.vector.insert.nxv16i8.v16i8( undef, <16 x i8> %1, i64 0) %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv16i8( %2, i64 0) @@ -838,6 +1212,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rb_i8_passthruundef_nxv16i8: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrb w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.b, p0/m, w8 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load i8, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv16i8( undef, %pg, i8 %ld) ret %res @@ -847,6 +1227,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rh_i16_passthruundef_nxv8i16: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrh w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, p0/m, w8 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load i16, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv8i16( undef, %pg, i16 %ld) ret %res @@ -856,6 +1242,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rsb { z0.h }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rh_i8_passthruundef_nxv8i16_sext: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrsb w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, p0/m, w8 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load i8, ptr %addr %ext = sext i8 %ld to i16 %res = call @llvm.aarch64.sve.dup.nxv8i16( undef, %pg, i16 %ext) @@ -866,6 +1258,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rb { z0.h }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rh_i8_passthruundef_nxv8i16_zext: +; CHECK-AVOID-LD1R: // %bb.0: +; 
CHECK-AVOID-LD1R-NEXT: ldrb w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, p0/m, w8 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load i8, ptr %addr %ext = zext i8 %ld to i16 %res = call @llvm.aarch64.sve.dup.nxv8i16( undef, %pg, i16 %ext) @@ -876,6 +1274,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rs_i32_passthruundef_nxv4i32: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, p0/m, w8 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load i32, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv4i32( undef, %pg, i32 %ld) ret %res @@ -885,6 +1289,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rsb { z0.s }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rs_i8_passthruundef_nxv4i32_sext: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrsb w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, p0/m, w8 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load i8, ptr %addr %ext = sext i8 %ld to i32 %res = call @llvm.aarch64.sve.dup.nxv4i32( undef, %pg, i32 %ext) @@ -895,6 +1305,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rb { z0.s }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rs_i8_passthruundef_nxv4i32_zext: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrb w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, p0/m, w8 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load i8, ptr %addr %ext = zext i8 %ld to i32 %res = call @llvm.aarch64.sve.dup.nxv4i32( undef, %pg, i32 %ext) @@ -905,6 +1321,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rsh { z0.s }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rs_i16_passthruundef_nxv4i32_sext: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrsh w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, p0/m, w8 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load i16, ptr %addr %ext = sext i16 %ld to i32 %res = call @llvm.aarch64.sve.dup.nxv4i32( undef, %pg, i32 %ext) @@ -915,6 +1337,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: 
ld1rh { z0.s }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rs_i16_passthruundef_nxv4i32_zext: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrh w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, p0/m, w8 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load i16, ptr %addr %ext = zext i16 %ld to i32 %res = call @llvm.aarch64.sve.dup.nxv4i32( undef, %pg, i32 %ext) @@ -925,6 +1353,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rd_i64_passthruundef_nxv2i64: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr x8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.d, p0/m, x8 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load i64, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv2i64( undef, %pg, i64 %ld) ret %res @@ -934,6 +1368,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rs_i8_passthruundef_nxv2i64_sext: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrsb x8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.d, p0/m, x8 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load i8, ptr %addr %ext = sext i8 %ld to i64 %res = call @llvm.aarch64.sve.dup.nxv2i64( undef, %pg, i64 %ext) @@ -944,6 +1384,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rb { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rs_i8_passthruundef_nxv2i64_zext: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrb w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.d, p0/m, x8 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load i8, ptr %addr %ext = zext i8 %ld to i64 %res = call @llvm.aarch64.sve.dup.nxv2i64( undef, %pg, i64 %ext) @@ -954,6 +1400,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rsh { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rs_i16_passthruundef_nxv2i64_sext: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrsh x8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.d, p0/m, x8 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load i16, ptr 
%addr %ext = sext i16 %ld to i64 %res = call @llvm.aarch64.sve.dup.nxv2i64( undef, %pg, i64 %ext) @@ -964,6 +1416,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rh { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rs_i16_passthruundef_nxv2i64_zext: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrh w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.d, p0/m, x8 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load i16, ptr %addr %ext = zext i16 %ld to i64 %res = call @llvm.aarch64.sve.dup.nxv2i64( undef, %pg, i64 %ext) @@ -974,6 +1432,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rsw { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rs_i32_passthruundef_nxv2i64_sext: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrsw x8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.d, p0/m, x8 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load i32, ptr %addr %ext = sext i32 %ld to i64 %res = call @llvm.aarch64.sve.dup.nxv2i64( undef, %pg, i64 %ext) @@ -984,6 +1448,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rw { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rs_i32_passthruundef_nxv2i64_zext: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.d, p0/m, x8 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load i32, ptr %addr %ext = zext i32 %ld to i64 %res = call @llvm.aarch64.sve.dup.nxv2i64( undef, %pg, i64 %ext) @@ -994,6 +1464,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rh_half_passthruundef_nxv8f16: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr h0, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, p0/m, h0 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load half, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv8f16( undef, %pg, half %ld) ret %res @@ -1003,6 +1479,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rs_float_passthruundef_nxv4f32: +; 
CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr s0, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, p0/m, s0 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load float, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv4f32( undef, %pg, float %ld) ret %res @@ -1012,6 +1494,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rd_double_passthruundef_nxv2f64: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr d0, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.d, p0/m, d0 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load double, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv2f64( undef, %pg, double %ld) ret %res @@ -1021,6 +1509,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rh { z0.s }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rh_half_passthruundef_nxv4f16: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr h0, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, p0/m, h0 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load half, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv4f16( undef, %pg, half %ld) ret %res @@ -1030,6 +1524,13 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rb_i8_passthruzero_nxv16i8: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrb w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.b, #0 // =0x0 +; CHECK-AVOID-LD1R-NEXT: mov z0.b, p0/m, w8 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load i8, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv16i8( zeroinitializer, %pg, i8 %ld) ret %res @@ -1039,6 +1540,13 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rh_i16_passthruzero_nxv8i16: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrh w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, #0 // =0x0 +; CHECK-AVOID-LD1R-NEXT: mov z0.h, p0/m, w8 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load i16, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv8i16( zeroinitializer, %pg, 
i16 %ld) ret %res @@ -1048,6 +1556,13 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rs_i32_passthruzero_nxv4i32: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, #0 // =0x0 +; CHECK-AVOID-LD1R-NEXT: mov z0.s, p0/m, w8 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load i32, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv4i32( zeroinitializer, %pg, i32 %ld) ret %res @@ -1057,6 +1572,13 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rd_i64_passthruzero_nxv2i64: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr x8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.d, #0 // =0x0 +; CHECK-AVOID-LD1R-NEXT: mov z0.d, p0/m, x8 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load i64, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv2i64( zeroinitializer, %pg, i64 %ld) ret %res @@ -1066,6 +1588,13 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv8f16: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr h1, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, #0 // =0x0 +; CHECK-AVOID-LD1R-NEXT: mov z0.h, p0/m, h1 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load half, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv8f16( zeroinitializer, %pg, half %ld) ret %res @@ -1075,6 +1604,13 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rs_float_passthruzero_nxv4f32: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr s1, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, #0 // =0x0 +; CHECK-AVOID-LD1R-NEXT: mov z0.s, p0/m, s1 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load float, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv4f32( zeroinitializer, %pg, float %ld) ret %res @@ -1084,6 +1620,13 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret 
+; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rd_double_passthruzero_nxv2f64: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr d1, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.d, #0 // =0x0 +; CHECK-AVOID-LD1R-NEXT: mov z0.d, p0/m, d1 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load double, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv2f64( zeroinitializer, %pg, double %ld) ret %res @@ -1093,6 +1636,13 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rh { z0.s }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv4f16: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr h1, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, #0 // =0x0 +; CHECK-AVOID-LD1R-NEXT: mov z0.h, p0/m, h1 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load half, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv4f16( zeroinitializer, %pg, half %ld) ret %res @@ -1102,6 +1652,13 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rh { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv2f16: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr h1, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, #0 // =0x0 +; CHECK-AVOID-LD1R-NEXT: mov z0.h, p0/m, h1 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load half, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv2f16( zeroinitializer, %pg, half %ld) ret %res @@ -1111,6 +1668,13 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1rw { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: dup_ld1rs_float_passthruzero_nxv2f32: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr s1, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, #0 // =0x0 +; CHECK-AVOID-LD1R-NEXT: mov z0.s, p0/m, s1 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load float, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv2f32( zeroinitializer, %pg, float %ld) ret %res @@ -1121,6 +1685,12 @@ ; CHECK-NEXT: ldrb w8, [x0] ; CHECK-NEXT: mov z0.b, p0/m, w8 ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: negtest_dup_ld1rb_i8_passthru_nxv16i8: +; CHECK-AVOID-LD1R: // %bb.0: +; 
CHECK-AVOID-LD1R-NEXT: ldrb w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.b, p0/m, w8 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load i8, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv16i8( %pt, %pg, i8 %ld) ret %res @@ -1131,6 +1701,12 @@ ; CHECK-NEXT: ldrh w8, [x0] ; CHECK-NEXT: mov z0.h, p0/m, w8 ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: negtest_dup_ld1rh_i16_passthru_nxv8i16: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrh w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, p0/m, w8 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load i16, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv8i16( %pt, %pg, i16 %ld) ret %res @@ -1141,6 +1717,12 @@ ; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: mov z0.s, p0/m, w8 ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: negtest_dup_ld1rs_i32_passthru_nxv4i32: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr w8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, p0/m, w8 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load i32, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv4i32( %pt, %pg, i32 %ld) ret %res @@ -1151,6 +1733,12 @@ ; CHECK-NEXT: ldr x8, [x0] ; CHECK-NEXT: mov z0.d, p0/m, x8 ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: negtest_dup_ld1rd_i64_passthru_nxv2i64: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr x8, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.d, p0/m, x8 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load i64, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv2i64( %pt, %pg, i64 %ld) ret %res @@ -1161,6 +1749,12 @@ ; CHECK-NEXT: ldr h1, [x0] ; CHECK-NEXT: mov z0.h, p0/m, h1 ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: negtest_dup_ld1rh_half_passthru_nxv8f16: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr h1, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.h, p0/m, h1 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load half, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv8f16( %pt, %pg, half %ld) ret %res @@ -1171,6 +1765,12 @@ ; CHECK-NEXT: ldr s1, [x0] ; CHECK-NEXT: mov z0.s, p0/m, s1 ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: 
negtest_dup_ld1rs_float_passthru_nxv4f32: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr s1, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.s, p0/m, s1 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load float, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv4f32( %pt, %pg, float %ld) ret %res @@ -1181,6 +1781,12 @@ ; CHECK-NEXT: ldr d1, [x0] ; CHECK-NEXT: mov z0.d, p0/m, d1 ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: negtest_dup_ld1rd_double_passthru_nxv2f64: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldr d1, [x0] +; CHECK-AVOID-LD1R-NEXT: mov z0.d, p0/m, d1 +; CHECK-AVOID-LD1R-NEXT: ret %ld = load double, ptr %addr %res = call @llvm.aarch64.sve.dup.nxv2f64( %pt, %pg, double %ld) ret %res @@ -1197,6 +1803,16 @@ ; CHECK-NEXT: mov x0, x8 ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: avoid_preindex_load: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: add x8, x0, #1 +; CHECK-AVOID-LD1R-NEXT: ldrsb x9, [x0, #1] +; CHECK-AVOID-LD1R-NEXT: mov x0, x8 +; CHECK-AVOID-LD1R-NEXT: ptrue p0.d +; CHECK-AVOID-LD1R-NEXT: mov z0.d, x9 +; CHECK-AVOID-LD1R-NEXT: st1d { z0.d }, p0, [x1] +; CHECK-AVOID-LD1R-NEXT: ret %ptr = getelementptr inbounds i8, i8* %src, i64 1 %tmp = load i8, i8* %ptr, align 4 %ext = sext i8 %tmp to i64 @@ -1217,6 +1833,16 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: avoid_preindex_load_dup: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: add x8, x0, #1 +; CHECK-AVOID-LD1R-NEXT: ldrsb x9, [x0, #1] +; CHECK-AVOID-LD1R-NEXT: mov x0, x8 +; CHECK-AVOID-LD1R-NEXT: ptrue p1.d +; CHECK-AVOID-LD1R-NEXT: mov z0.d, p0/m, x9 +; CHECK-AVOID-LD1R-NEXT: st1d { z0.d }, p1, [x1] +; CHECK-AVOID-LD1R-NEXT: ret %ptr = getelementptr inbounds i8, i8* %src, i64 1 %tmp = load i8, i8* %ptr, align 4 %ext = sext i8 %tmp to i64 @@ -1235,6 +1861,17 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret +; +; 
CHECK-AVOID-LD1R-LABEL: avoid_preindex_load_dup_passthru_zero: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: add x8, x0, #1 +; CHECK-AVOID-LD1R-NEXT: ldrsb x9, [x0, #1] +; CHECK-AVOID-LD1R-NEXT: mov x0, x8 +; CHECK-AVOID-LD1R-NEXT: mov z0.d, #0 // =0x0 +; CHECK-AVOID-LD1R-NEXT: mov z0.d, p0/m, x9 +; CHECK-AVOID-LD1R-NEXT: ptrue p0.d +; CHECK-AVOID-LD1R-NEXT: st1d { z0.d }, p0, [x1] +; CHECK-AVOID-LD1R-NEXT: ret %ptr = getelementptr inbounds i8, i8* %src, i64 1 %tmp = load i8, i8* %ptr, align 4 %ext = sext i8 %tmp to i64 @@ -1252,6 +1889,14 @@ ; CHECK-NEXT: mov z0.d, p0/m, x8 ; CHECK-NEXT: st1d { z0.d }, p1, [x1] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: preindex_load_dup_passthru: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrsb x8, [x0, #1]! +; CHECK-AVOID-LD1R-NEXT: ptrue p1.d +; CHECK-AVOID-LD1R-NEXT: mov z0.d, p0/m, x8 +; CHECK-AVOID-LD1R-NEXT: st1d { z0.d }, p1, [x1] +; CHECK-AVOID-LD1R-NEXT: ret %ptr = getelementptr inbounds i8, i8* %src, i64 1 %tmp = load i8, i8* %ptr, align 4 %ext = sext i8 %tmp to i64 @@ -1271,6 +1916,15 @@ ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: str x8, [x2] ; CHECK-NEXT: ret +; +; CHECK-AVOID-LD1R-LABEL: preidx8sext64_instead_of_ld1r: +; CHECK-AVOID-LD1R: // %bb.0: +; CHECK-AVOID-LD1R-NEXT: ldrsb x8, [x0, #1]! +; CHECK-AVOID-LD1R-NEXT: ptrue p0.d +; CHECK-AVOID-LD1R-NEXT: mov z0.d, x8 +; CHECK-AVOID-LD1R-NEXT: st1d { z0.d }, p0, [x1] +; CHECK-AVOID-LD1R-NEXT: str x8, [x2] +; CHECK-AVOID-LD1R-NEXT: ret %ptr = getelementptr inbounds i8, i8* %src, i64 1 %tmp = load i8, i8* %ptr, align 4 %ext = sext i8 %tmp to i64