diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -969,6 +969,10 @@
   let IsLoad = true;
   let IsZeroExtLoad = true;
 }
+def azextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
+  auto Type = cast<LoadSDNode>(N)->getExtensionType();
+  return Type == ISD::ZEXTLOAD || Type == ISD::EXTLOAD;
+}]>;
 
 def extloadi1 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
   let IsLoad = true;
@@ -1033,6 +1037,19 @@
   let MemoryVT = i32;
 }
 
+def azextloadi1 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i1;
+}]>;
+def azextloadi8 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def azextloadi16 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def azextloadi32 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
 def extloadvi1 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
   let IsLoad = true;
   let ScalarMemoryVT = i1;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -128,6 +128,21 @@
   bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) {
     return SelectAddrModeUnscaled(N, 16, Base, OffImm);
   }
+  template <unsigned Size, unsigned Max>
+  bool SelectAddrModeIndexedUImm(SDValue N, SDValue &Base, SDValue &OffImm) {
+    bool Found = SelectAddrModeIndexed(N, Size, Base, OffImm);
+    if (Found) {
+      if (dyn_cast<FrameIndexSDNode>(Base))
+        return false;
+      if (auto CI = dyn_cast<ConstantSDNode>(OffImm)) {
+        int64_t C = CI->getSExtValue();
+        if (C <= Max)
+          return true;
+      }
+    }
+
+    return false;
+  }
 
   template <int Width>
   bool SelectAddrModeWRO(SDValue N, SDValue &Base, SDValue &Offset,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -3127,6 +3127,13 @@
 def am_indexed64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed64", []>;
 def am_indexed128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed128", []>;
 
+// (unsigned immediate)
+// Indexed for 8-bit registers. offset is in range [0,63].
+def am_indexed8_6b : ComplexPattern<i64, 2, "SelectAddrModeIndexedUImm<1, 63>", []>;
+def am_indexed16_6b : ComplexPattern<i64, 2, "SelectAddrModeIndexedUImm<2, 63>", []>;
+def am_indexed32_6b : ComplexPattern<i64, 2, "SelectAddrModeIndexedUImm<4, 63>", []>;
+def am_indexed64_6b : ComplexPattern<i64, 2, "SelectAddrModeIndexedUImm<8, 63>", []>;
+
 def gi_am_indexed8 :
     GIComplexOperandMatcher<s64, "selectAddrModeIndexed<8>">,
     GIComplexPatternEquiv<am_indexed8>;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1643,6 +1643,45 @@
   def : Pat<(AArch64ptest (nxv2i1 PPR:$pg), (nxv2i1 PPR:$src)),
             (PTEST_PP PPR:$pg, PPR:$src)>;
 
+  let AddedComplexity = 1 in {
+  class LD1RPat<ValueType vt, SDPatternOperator operator,
+                Instruction load, Instruction ptrue, ValueType index_vt, ComplexPattern CP, Operand immtype> :
+        Pat<(vt (AArch64dup (index_vt (operator (CP GPR64:$base, immtype:$offset))))),
+            (load (ptrue 31), GPR64:$base, $offset)>;
+  }
+
+  // LD1R of 8-bit data
+  def : LD1RPat<nxv16i8, extloadi8,   LD1RB_IMM,    PTRUE_B, i32, am_indexed8_6b, uimm6s1>;
+  def : LD1RPat<nxv8i16, azextloadi8, LD1RB_H_IMM,  PTRUE_H, i32, am_indexed8_6b, uimm6s1>;
+  def : LD1RPat<nxv4i32, azextloadi8, LD1RB_S_IMM,  PTRUE_S, i32, am_indexed8_6b, uimm6s1>;
+  def : LD1RPat<nxv2i64, azextloadi8, LD1RB_D_IMM,  PTRUE_D, i64, am_indexed8_6b, uimm6s1>;
+  def : LD1RPat<nxv8i16, sextloadi8,  LD1RSB_H_IMM, PTRUE_H, i32, am_indexed8_6b, uimm6s1>;
+  def : LD1RPat<nxv4i32, sextloadi8,  LD1RSB_S_IMM, PTRUE_S, i32, am_indexed8_6b, uimm6s1>;
+  def : LD1RPat<nxv2i64, sextloadi8,  LD1RSB_D_IMM, PTRUE_D, i64, am_indexed8_6b, uimm6s1>;
+
+  // LD1R of 16-bit data
+  def : LD1RPat<nxv8i16, extloadi16,   LD1RH_IMM,    PTRUE_H, i32, am_indexed16_6b, uimm6s2>;
+  def : LD1RPat<nxv4i32, azextloadi16, LD1RH_S_IMM,  PTRUE_S, i32, am_indexed16_6b, uimm6s2>;
+  def : LD1RPat<nxv2i64, azextloadi16, LD1RH_D_IMM,  PTRUE_D, i64, am_indexed16_6b, uimm6s2>;
+  def : LD1RPat<nxv4i32, sextloadi16,  LD1RSH_S_IMM, PTRUE_S, i32, am_indexed16_6b, uimm6s2>;
+  def : LD1RPat<nxv2i64, sextloadi16,  LD1RSH_D_IMM, PTRUE_D, i64, am_indexed16_6b, uimm6s2>;
+
+  // LD1R of 32-bit data
+  def : LD1RPat<nxv4i32, load,         LD1RW_IMM,   PTRUE_S, i32, am_indexed32_6b, uimm6s4>;
+  def : LD1RPat<nxv2i64, azextloadi32, LD1RW_D_IMM, PTRUE_D, i64, am_indexed32_6b, uimm6s4>;
+  def : LD1RPat<nxv2i64, sextloadi32,  LD1RSW_IMM,  PTRUE_D, i64, am_indexed32_6b, uimm6s4>;
+
+  // LD1R of 64-bit data
+  def : LD1RPat<nxv2i64, load, LD1RD_IMM, PTRUE_D, i64, am_indexed64_6b, uimm6s8>;
+
+  // LD1R of FP data
+  def : LD1RPat<nxv8f16, load, LD1RH_IMM,   PTRUE_H, f16, am_indexed16_6b, uimm6s2>;
+  def : LD1RPat<nxv4f16, load, LD1RH_S_IMM, PTRUE_S, f16, am_indexed16_6b, uimm6s2>;
+  def : LD1RPat<nxv2f16, load, LD1RH_D_IMM, PTRUE_D, f16, am_indexed16_6b, uimm6s2>;
+  def : LD1RPat<nxv4f32, load, LD1RW_IMM,   PTRUE_S, f32, am_indexed32_6b, uimm6s4>;
+  def : LD1RPat<nxv2f32, load, LD1RW_D_IMM, PTRUE_D, f32, am_indexed32_6b, uimm6s4>;
+  def : LD1RPat<nxv2f64, load, LD1RD_IMM,   PTRUE_D, f64, am_indexed64_6b, uimm6s8>;
+
   // LD1R of 128-bit masked data
   def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
             (LD1RQ_B_IMM $gp, $base, (i64 0))>;
diff --git a/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll b/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
--- a/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
@@ -23,10 +23,10 @@
 define <vscale x 2 x double> @test_post_ld1_dup(double* %a, double** %ptr, i64 %inc) {
 ; CHECK-LABEL: test_post_ld1_dup:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    add x8, x0, x2, lsl #3
 ; CHECK-NEXT:    str x8, [x1]
-; CHECK-NEXT:    mov z0.d, d0
 ; CHECK-NEXT:    ret
   %load = load double, double* %a
   %dup = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double %load)
diff --git a/llvm/test/CodeGen/AArch64/sve-ld1r.ll b/llvm/test/CodeGen/AArch64/sve-ld1r.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-ld1r.ll
@@ -0,0 +1,704 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+;
+; Check that ld1r* instructions are generated to splat the scalar as part of the load,
+; rather than a mov from a scalar to a vector register (which would require the vector unit).
+;
+; One-off: ld1r_stack checks that the compiler doesn't crash on a stack pointer.
+;
+; Test axes:
+;   types = [i8, i16, i32, i64, half, float, double]
+;   methods = [direct load, gep upper bound - 1, gep out of range x {neg,pos}, sext..., zext..., unpacked_floats...]
+;
+
+@g8 = external global i8
+
+; Not yet implemented: value being splatted to vector is loaded from the stack.
+define @ld1r_stack() { +; CHECK-LABEL: ld1r_stack: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 // =16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, :got:g8 +; CHECK-NEXT: ldr x8, [x8, :got_lo12:g8] +; CHECK-NEXT: ldrb w8, [x8] +; CHECK-NEXT: strb w8, [sp, #12] +; CHECK-NEXT: ldrb w8, [sp, #14] +; CHECK-NEXT: mov z0.b, w8 +; CHECK-NEXT: add sp, sp, #16 // =16 +; CHECK-NEXT: ret + %valp = alloca i8 + %valp2 = load volatile i8, i8* @g8 + store volatile i8 %valp2, i8* %valp + %valp3 = getelementptr i8, i8* %valp, i32 2 + %val = load i8, i8* %valp3 + %1 = insertelement undef, i8 %val, i32 0 + %2 = shufflevector %1, undef, zeroinitializer + ret %2 +} + +define @ld1rb(i8* %valp) { +; CHECK-LABEL: ld1rb: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load i8, i8* %valp + %ins = insertelement undef, i8 %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rb_gep(i8* %valp) { +; CHECK-LABEL: ld1rb_gep: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0, #63] +; CHECK-NEXT: ret + %valp2 = getelementptr i8, i8* %valp, i32 63 + %val = load i8, i8* %valp2 + %ins = insertelement undef, i8 %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rb_gep_out_of_range_up(i8* %valp) { +; CHECK-LABEL: ld1rb_gep_out_of_range_up: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrb w8, [x0, #64] +; CHECK-NEXT: mov z0.b, w8 +; CHECK-NEXT: ret + %valp2 = getelementptr i8, i8* %valp, i32 64 + %val = load i8, i8* %valp2 + %ins = insertelement undef, i8 %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rb_gep_out_of_range_down(i8* %valp) { +; CHECK-LABEL: ld1rb_gep_out_of_range_down: +; CHECK: // %bb.0: +; CHECK-NEXT: ldurb w8, [x0, #-1] +; CHECK-NEXT: mov z0.b, w8 +; CHECK-NEXT: ret + %valp2 = getelementptr i8, i8* %valp, i32 -1 + %val = load i8, i8* %valp2 + %ins = insertelement undef, i8 %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rb_i8_i16_zext(i8* %valp) { +; CHECK-LABEL: ld1rb_i8_i16_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1rb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load i8, i8* %valp + %ext = zext i8 %val to i16 + %ins = insertelement undef, i16 %ext, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rb_i8_i16_sext(i8* %valp) { +; CHECK-LABEL: ld1rb_i8_i16_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1rsb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load i8, i8* %valp + %ext = sext i8 %val to i16 + %ins = insertelement undef, i16 %ext, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rb_i8_i32_zext(i8* %valp) { +; CHECK-LABEL: ld1rb_i8_i32_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1rb { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load i8, i8* %valp + %ext = zext i8 %val to i32 + %ins = insertelement undef, i32 %ext, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rb_i8_i32_sext(i8* %valp) { +; CHECK-LABEL: ld1rb_i8_i32_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1rsb { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load i8, i8* %valp + %ext = sext i8 %val to i32 + %ins = insertelement undef, i32 %ext, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rb_i8_i64_zext(i8* 
%valp) { +; CHECK-LABEL: ld1rb_i8_i64_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rb { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load i8, i8* %valp + %ext = zext i8 %val to i64 + %ins = insertelement undef, i64 %ext, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rb_i8_i64_sext(i8* %valp) { +; CHECK-LABEL: ld1rb_i8_i64_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load i8, i8* %valp + %ext = sext i8 %val to i64 + %ins = insertelement undef, i64 %ext, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rh(i16* %valp) { +; CHECK-LABEL: ld1rh: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load i16, i16* %valp + %ins = insertelement undef, i16 %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rh_gep(i16* %valp) { +; CHECK-LABEL: ld1rh_gep: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0, #126] +; CHECK-NEXT: ret + %valp2 = getelementptr i16, i16* %valp, i32 63 + %val = load i16, i16* %valp2 + %ins = insertelement undef, i16 %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rh_gep_out_of_range_up(i16* %valp) { +; CHECK-LABEL: ld1rh_gep_out_of_range_up: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrh w8, [x0, #128] +; CHECK-NEXT: mov z0.h, w8 +; CHECK-NEXT: ret + %valp2 = getelementptr i16, i16* %valp, i32 64 + %val = load i16, i16* %valp2 + %ins = insertelement undef, i16 %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rh_gep_out_of_range_down(i16* %valp) { +; CHECK-LABEL: ld1rh_gep_out_of_range_down: +; CHECK: // %bb.0: +; CHECK-NEXT: ldurh w8, [x0, #-2] +; CHECK-NEXT: mov z0.h, w8 +; CHECK-NEXT: ret + %valp2 = getelementptr i16, i16* %valp, i32 -1 + %val = load i16, i16* %valp2 + %ins = insertelement undef, i16 %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rh_i16_i32_zext(i16* %valp) { +; CHECK-LABEL: ld1rh_i16_i32_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1rh { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load i16, i16* %valp + %ext = zext i16 %val to i32 + %ins = insertelement undef, i32 %ext, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rh_i16_i32_sext(i16* %valp) { +; CHECK-LABEL: ld1rh_i16_i32_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1rsh { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load i16, i16* %valp + %ext = sext i16 %val to i32 + %ins = insertelement undef, i32 %ext, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rh_i16_i64_zext(i16* %valp) { +; CHECK-LABEL: ld1rh_i16_i64_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rh { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load i16, i16* %valp + %ext = zext i16 %val to i64 + %ins = insertelement undef, i64 %ext, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rh_i16_i64_sext(i16* %valp) { +; CHECK-LABEL: ld1rh_i16_i64_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rsh { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load i16, i16* %valp + %ext = sext i16 %val to i64 + %ins = insertelement undef, i64 %ext, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret 
%shf +} + +define @ld1rw(i32* %valp) { +; CHECK-LABEL: ld1rw: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load i32, i32* %valp + %ins = insertelement undef, i32 %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rw_gep(i32* %valp) { +; CHECK-LABEL: ld1rw_gep: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0, #252] +; CHECK-NEXT: ret + %valp2 = getelementptr i32, i32* %valp, i32 63 + %val = load i32, i32* %valp2 + %ins = insertelement undef, i32 %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rw_gep_out_of_range_up(i32* %valp) { +; CHECK-LABEL: ld1rw_gep_out_of_range_up: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0, #256] +; CHECK-NEXT: mov z0.s, w8 +; CHECK-NEXT: ret + %valp2 = getelementptr i32, i32* %valp, i32 64 + %val = load i32, i32* %valp2 + %ins = insertelement undef, i32 %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rw_gep_out_of_range_down(i32* %valp) { +; CHECK-LABEL: ld1rw_gep_out_of_range_down: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w8, [x0, #-4] +; CHECK-NEXT: mov z0.s, w8 +; CHECK-NEXT: ret + %valp2 = getelementptr i32, i32* %valp, i32 -1 + %val = load i32, i32* %valp2 + %ins = insertelement undef, i32 %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rw_i32_i64_zext(i32* %valp) { +; CHECK-LABEL: ld1rw_i32_i64_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rw { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load i32, i32* %valp + %ext = zext i32 %val to i64 + %ins = insertelement undef, i64 %ext, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rw_i32_i64_sext(i32* %valp) { +; CHECK-LABEL: ld1rw_i32_i64_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rsw { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load i32, i32* %valp + %ext = sext i32 %val to i64 + %ins = insertelement undef, i64 %ext, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rd(i64* %valp) { +; CHECK-LABEL: ld1rd: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load i64, i64* %valp + %ins = insertelement undef, i64 %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rd_gep(i64* %valp) { +; CHECK-LABEL: ld1rd_gep: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0, #504] +; CHECK-NEXT: ret + %valp2 = getelementptr i64, i64* %valp, i32 63 + %val = load i64, i64* %valp2 + %ins = insertelement undef, i64 %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rd_gep_out_of_range_up(i64* %valp) { +; CHECK-LABEL: ld1rd_gep_out_of_range_up: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x8, [x0, #512] +; CHECK-NEXT: mov z0.d, x8 +; CHECK-NEXT: ret + %valp2 = getelementptr i64, i64* %valp, i32 64 + %val = load i64, i64* %valp2 + %ins = insertelement undef, i64 %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rd_gep_out_of_range_down(i64* %valp) { +; CHECK-LABEL: ld1rd_gep_out_of_range_down: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur x8, [x0, #-8] +; CHECK-NEXT: mov z0.d, x8 +; CHECK-NEXT: ret + %valp2 = getelementptr i64, i64* %valp, i32 -1 + %val = load i64, i64* %valp2 + %ins = insertelement undef, i64 %val, i32 
0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rh_half(half* %valp) { +; CHECK-LABEL: ld1rh_half: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load half, half* %valp + %ins = insertelement undef, half %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rh_half_gep(half* %valp) { +; CHECK-LABEL: ld1rh_half_gep: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0, #126] +; CHECK-NEXT: ret + %valp2 = getelementptr half, half* %valp, i32 63 + %val = load half, half* %valp2 + %ins = insertelement undef, half %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rh_half_gep_out_of_range_up(half* %valp) { +; CHECK-LABEL: ld1rh_half_gep_out_of_range_up: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, #128] +; CHECK-NEXT: mov z0.h, h0 +; CHECK-NEXT: ret + %valp2 = getelementptr half, half* %valp, i32 64 + %val = load half, half* %valp2 + %ins = insertelement undef, half %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rh_half_gep_out_of_range_down(half* %valp) { +; CHECK-LABEL: ld1rh_half_gep_out_of_range_down: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #-2] +; CHECK-NEXT: mov z0.h, h0 +; CHECK-NEXT: ret + %valp2 = getelementptr half, half* %valp, i32 -1 + %val = load half, half* %valp2 + %ins = insertelement undef, half %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rh_half_unpacked4(half* %valp) { +; CHECK-LABEL: ld1rh_half_unpacked4: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1rh { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load half, half* %valp + %ins = insertelement undef, half %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rh_half_unpacked4_gep(half* %valp) { +; CHECK-LABEL: ld1rh_half_unpacked4_gep: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1rh { z0.s }, p0/z, [x0, #126] +; CHECK-NEXT: ret + %valp2 = getelementptr half, half* %valp, i32 63 + %val = load half, half* %valp2 + %ins = insertelement undef, half %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rh_half_unpacked4_gep_out_of_range_up(half* %valp) { +; CHECK-LABEL: ld1rh_half_unpacked4_gep_out_of_range_up: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, #128] +; CHECK-NEXT: mov z0.h, h0 +; CHECK-NEXT: ret + %valp2 = getelementptr half, half* %valp, i32 64 + %val = load half, half* %valp2 + %ins = insertelement undef, half %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rh_half_unpacked4_gep_out_of_range_down(half* %valp) { +; CHECK-LABEL: ld1rh_half_unpacked4_gep_out_of_range_down: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #-2] +; CHECK-NEXT: mov z0.h, h0 +; CHECK-NEXT: ret + %valp2 = getelementptr half, half* %valp, i32 -1 + %val = load half, half* %valp2 + %ins = insertelement undef, half %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rh_half_unpacked2(half* %valp) { +; CHECK-LABEL: ld1rh_half_unpacked2: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rh { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load half, half* %valp + %ins = insertelement undef, half %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rh_half_unpacked2_gep(half* %valp) 
{ +; CHECK-LABEL: ld1rh_half_unpacked2_gep: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rh { z0.d }, p0/z, [x0, #126] +; CHECK-NEXT: ret + %valp2 = getelementptr half, half* %valp, i32 63 + %val = load half, half* %valp2 + %ins = insertelement undef, half %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rh_half_unpacked2_gep_out_of_range_up(half* %valp) { +; CHECK-LABEL: ld1rh_half_unpacked2_gep_out_of_range_up: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, #128] +; CHECK-NEXT: mov z0.h, h0 +; CHECK-NEXT: ret + %valp2 = getelementptr half, half* %valp, i32 64 + %val = load half, half* %valp2 + %ins = insertelement undef, half %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rh_half_unpacked2_gep_out_of_range_down(half* %valp) { +; CHECK-LABEL: ld1rh_half_unpacked2_gep_out_of_range_down: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #-2] +; CHECK-NEXT: mov z0.h, h0 +; CHECK-NEXT: ret + %valp2 = getelementptr half, half* %valp, i32 -1 + %val = load half, half* %valp2 + %ins = insertelement undef, half %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rw_float(float* %valp) { +; CHECK-LABEL: ld1rw_float: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load float, float* %valp + %ins = insertelement undef, float %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rw_float_gep(float* %valp) { +; CHECK-LABEL: ld1rw_float_gep: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0, #252] +; CHECK-NEXT: ret + %valp2 = getelementptr float, float* %valp, i32 63 + %val = load float, float* %valp2 + %ins = insertelement undef, float %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rw_float_gep_out_of_range_up(float* %valp) { +; CHECK-LABEL: ld1rw_float_gep_out_of_range_up: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, #256] +; CHECK-NEXT: mov z0.s, s0 +; CHECK-NEXT: ret + %valp2 = getelementptr float, float* %valp, i32 64 + %val = load float, float* %valp2 + %ins = insertelement undef, float %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rw_float_gep_out_of_range_down(float* %valp) { +; CHECK-LABEL: ld1rw_float_gep_out_of_range_down: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur s0, [x0, #-4] +; CHECK-NEXT: mov z0.s, s0 +; CHECK-NEXT: ret + %valp2 = getelementptr float, float* %valp, i32 -1 + %val = load float, float* %valp2 + %ins = insertelement undef, float %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rw_float_unpacked2(float* %valp) { +; CHECK-LABEL: ld1rw_float_unpacked2: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rw { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load float, float* %valp + %ins = insertelement undef, float %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rw_float_unpacked2_gep(float* %valp) { +; CHECK-LABEL: ld1rw_float_unpacked2_gep: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rw { z0.d }, p0/z, [x0, #252] +; CHECK-NEXT: ret + %valp2 = getelementptr float, float* %valp, i32 63 + %val = load float, float* %valp2 + %ins = insertelement undef, float %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define 
@ld1rw_float_unpacked2_gep_out_of_range_up(float* %valp) { +; CHECK-LABEL: ld1rw_float_unpacked2_gep_out_of_range_up: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, #256] +; CHECK-NEXT: mov z0.s, s0 +; CHECK-NEXT: ret + %valp2 = getelementptr float, float* %valp, i32 64 + %val = load float, float* %valp2 + %ins = insertelement undef, float %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rw_float_unpacked2_gep_out_of_range_down(float* %valp) { +; CHECK-LABEL: ld1rw_float_unpacked2_gep_out_of_range_down: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur s0, [x0, #-4] +; CHECK-NEXT: mov z0.s, s0 +; CHECK-NEXT: ret + %valp2 = getelementptr float, float* %valp, i32 -1 + %val = load float, float* %valp2 + %ins = insertelement undef, float %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rd_double(double* %valp) { +; CHECK-LABEL: ld1rd_double: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %val = load double, double* %valp + %ins = insertelement undef, double %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rd_double_gep(double* %valp) { +; CHECK-LABEL: ld1rd_double_gep: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0, #504] +; CHECK-NEXT: ret + %valp2 = getelementptr double, double* %valp, i32 63 + %val = load double, double* %valp2 + %ins = insertelement undef, double %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rd_double_gep_out_of_range_up(double* %valp) { +; CHECK-LABEL: ld1rd_double_gep_out_of_range_up: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, #512] +; CHECK-NEXT: mov z0.d, d0 +; CHECK-NEXT: ret + %valp2 = getelementptr double, double* %valp, i32 64 + %val = load double, double* %valp2 + %ins = insertelement undef, double %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +} + +define @ld1rd_double_gep_out_of_range_down(double* %valp) { +; CHECK-LABEL: ld1rd_double_gep_out_of_range_down: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur d0, [x0, #-8] +; CHECK-NEXT: mov z0.d, d0 +; CHECK-NEXT: ret + %valp2 = getelementptr double, double* %valp, i32 -1 + %val = load double, double* %valp2 + %ins = insertelement undef, double %val, i32 0 + %shf = shufflevector %ins, undef, zeroinitializer + ret %shf +}