Index: llvm/include/llvm/Target/TargetSelectionDAG.td
===================================================================
--- llvm/include/llvm/Target/TargetSelectionDAG.td
+++ llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -705,6 +705,12 @@
 def vector_extract_subvec : SDNode<"ISD::EXTRACT_SUBVECTOR",
     SDTypeProfile<1, 2, [SDTCisInt<2>, SDTCisVec<1>, SDTCisVec<0>]>,
     []>;
+// INSERT_SUBVECTOR with loose type constraints only: the result (0) and the
+// vector being inserted into (1) have the same vector type, the inserted
+// value (2) is a vector, and the index (3) is an integer.
+def vector_insert_subvec : SDNode<"ISD::INSERT_SUBVECTOR",
+    SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVec<2>, SDTCisInt<3>]>,
+    []>;
 
 // This operator does subvector type checking.
 def extract_subvector : SDNode<"ISD::EXTRACT_SUBVECTOR", SDTSubVecExtract, []>;
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -857,6 +857,19 @@
   defm LD1RQ_W : sve_mem_ldqr_ss<0b10, "ld1rqw", Z_s, ZPR32, GPR64NoXZRshifted32>;
   defm LD1RQ_D : sve_mem_ldqr_ss<0b11, "ld1rqd", Z_d, ZPR64, GPR64NoXZRshifted64>;
 
+  let AddedComplexity = 1 in {
+  // Match `op` with lane index 0 applied to a vt2 value that is loaded from
+  // $Xn and inserted at index 0 into an undef vt1, and select `load_instr`
+  // with base $Xn, immediate 0 and `(ptrue 31)` as its predicate operand.
+  class LD1RQPat :
+        Pat<(vt1 (op (vt1 (vector_insert_subvec (vt1 undef), (vt2 (load GPR64sp:$Xn)), (i64 0))), (i64 0))),
+            (load_instr (ptrue 31), GPR64sp:$Xn, 0)>;
+  }
+  def : LD1RQPat;
+  def : LD1RQPat;
+  def : LD1RQPat;
+  def : LD1RQPat;
+
   // continuous load with reg+reg addressing.
   defm LD1B : sve_mem_cld_ss<0b0000, "ld1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
   defm LD1B_H : sve_mem_cld_ss<0b0001, "ld1b", Z_h, ZPR16, GPR64NoXZRshifted8>;
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
@@ -580,103 +580,6 @@
   %out = call @llvm.aarch64.sve.dupq.lane.nxv2i64( %a, i64 4)
   ret %out
 }
-
-define dso_local @dupq_ld1rqd_f64() {
-; CHECK-LABEL: dupq_ld1rqd_f64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI49_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI49_0]
-; CHECK-NEXT:    mov z0.q, q0
-; CHECK-NEXT:    ret
-  %1 = tail call fast @llvm.vector.insert.nxv2f64.v2f64( undef, <2 x double> , i64 0)
-  %2 = tail call fast @llvm.aarch64.sve.dupq.lane.nxv2f64( %1, i64 0)
-  ret %2
-}
-
-define dso_local @dupq_ld1rqw_f32() {
-; CHECK-LABEL: dupq_ld1rqw_f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI50_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI50_0]
-; CHECK-NEXT:    mov z0.q, q0
-; CHECK-NEXT:    ret
-  %1 = tail call fast @llvm.vector.insert.nxv4f32.v4f32( undef, <4 x float> , i64 0)
-  %2 = tail call fast @llvm.aarch64.sve.dupq.lane.nxv4f32( %1, i64 0)
-  ret %2
-}
-
-define dso_local @dupq_ld1rqh_f16() {
-; CHECK-LABEL: dupq_ld1rqh_f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI51_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI51_0]
-; CHECK-NEXT:    mov z0.q, q0
-; CHECK-NEXT:    ret
-  %1 = tail call fast @llvm.vector.insert.nxv8f16.v8f16( undef, <8 x half> , i64 0)
-  %2 = tail call fast @llvm.aarch64.sve.dupq.lane.nxv8f16( %1, i64 0)
-  ret %2
-}
-
-define dso_local @dupq_ld1rqh_bf16() #0 {
-; CHECK-LABEL: dupq_ld1rqh_bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI52_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI52_0]
-; CHECK-NEXT:    mov z0.q, q0
-; CHECK-NEXT:    ret
-  %1 = call @llvm.vector.insert.nxv8bf16.v8bf16( undef, <8 x bfloat> , i64 0)
-  %2 = call @llvm.aarch64.sve.dupq.lane.nxv8bf16( %1, i64 0)
-  ret %2
-}
-
-define dso_local @dupq_ld1rqd_i64() {
-; CHECK-LABEL: dupq_ld1rqd_i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI53_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI53_0]
-; CHECK-NEXT:    mov z0.q, q0
-; CHECK-NEXT:    ret
-  %1 = tail call @llvm.vector.insert.nxv2i64.v2i64( undef, <2 x i64> , i64 0)
-  %2 = tail call @llvm.aarch64.sve.dupq.lane.nxv2i64( %1, i64 0)
-  ret %2
-}
-
-define dso_local @dupq_ld1rqd_i32() {
-; CHECK-LABEL: dupq_ld1rqd_i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI54_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI54_0]
-; CHECK-NEXT:    mov z0.q, q0
-; CHECK-NEXT:    ret
-  %1 = tail call @llvm.vector.insert.nxv4i32.v4i32( undef, <4 x i32> , i64 0)
-  %2 = tail call @llvm.aarch64.sve.dupq.lane.nxv4i32( %1, i64 0)
-  ret %2
-}
-
-define dso_local @dupq_ld1rqd_i16() {
-; CHECK-LABEL: dupq_ld1rqd_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI55_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI55_0]
-; CHECK-NEXT:    mov z0.q, q0
-; CHECK-NEXT:    ret
-  %1 = tail call @llvm.vector.insert.nxv8i16.v8i16( undef, <8 x i16> , i64 0)
-  %2 = tail call @llvm.aarch64.sve.dupq.lane.nxv8i16( %1, i64 0)
-  ret %2
-}
-
-define dso_local @dupq_ld1rqd_i8() {
-; CHECK-LABEL: dupq_ld1rqd_i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI56_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI56_0]
-; CHECK-NEXT:    mov z0.q, q0
-; CHECK-NEXT:    ret
-  %1 = tail call @llvm.vector.insert.nxv16i8.v16i8( undef, <16 x i8> , i64 0)
-  %2 = tail call @llvm.aarch64.sve.dupq.lane.nxv16i8( %1, i64 0)
-  ret %2
-}
-
 ;
 ; EXT
 ;
Index: llvm/test/CodeGen/AArch64/sve-ld1r.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-ld1r.ll
+++ llvm/test/CodeGen/AArch64/sve-ld1r.ll
@@ -722,3 +722,125 @@
   %shf = shufflevector %ins, undef, zeroinitializer
   ret %shf
 }
+
+;
+; DUPQ (lane 0) of a 128-bit vector loaded from memory. The integer cases are
+; expected to select LD1RQ{B,H,W,D}; the floating-point cases are expected to
+; use LDR + MOV.
+;
+
+define <vscale x 2 x double> @dupq_ld1rqd_f64(<2 x double>* %a) {
+; CHECK-LABEL: dupq_ld1rqd_f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    ret
+  %1 = load <2 x double>, <2 x double>* %a
+  %2 = tail call fast <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> %1, i64 0)
+  %3 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %2, i64 0)
+  ret <vscale x 2 x double> %3
+}
+
+define <vscale x 4 x float> @dupq_ld1rqw_f32(<4 x float>* %a) {
+; CHECK-LABEL: dupq_ld1rqw_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    ret
+  %1 = load <4 x float>, <4 x float>* %a
+  %2 = tail call fast <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %1, i64 0)
+  %3 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %2, i64 0)
+  ret <vscale x 4 x float> %3
+}
+
+define <vscale x 8 x half> @dupq_ld1rqh_f16(<8 x half>* %a) {
+; CHECK-LABEL: dupq_ld1rqh_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    ret
+  %1 = load <8 x half>, <8 x half>* %a
+  %2 = tail call fast <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %1, i64 0)
+  %3 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %2, i64 0)
+  ret <vscale x 8 x half> %3
+}
+
+define <vscale x 8 x bfloat> @dupq_ld1rqh_bf16(<8 x bfloat>* %a) #0 {
+; CHECK-LABEL: dupq_ld1rqh_bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    ret
+  %1 = load <8 x bfloat>, <8 x bfloat>* %a
+  %2 = tail call fast <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> undef, <8 x bfloat> %1, i64 0)
+  %3 = tail call fast <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %2, i64 0)
+  ret <vscale x 8 x bfloat> %3
+}
+
+define <vscale x 2 x i64> @dupq_ld1rqd_i64(<2 x i64>* %a) #0 {
+; CHECK-LABEL: dupq_ld1rqd_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %1 = load <2 x i64>, <2 x i64>* %a
+  %2 = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> %1, i64 0)
+  %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %2, i64 0)
+  ret <vscale x 2 x i64> %3
+}
+
+define <vscale x 4 x i32> @dupq_ld1rqw_i32(<4 x i32>* %a) #0 {
+; CHECK-LABEL: dupq_ld1rqw_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %1 = load <4 x i32>, <4 x i32>* %a
+  %2 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> %1, i64 0)
+  %3 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %2, i64 0)
+  ret <vscale x 4 x i32> %3
+}
+
+define <vscale x 8 x i16> @dupq_ld1rqh_i16(<8 x i16>* %a) #0 {
+; CHECK-LABEL: dupq_ld1rqh_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %1 = load <8 x i16>, <8 x i16>* %a
+  %2 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> %1, i64 0)
+  %3 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %2, i64 0)
+  ret <vscale x 8 x i16> %3
+}
+
+define <vscale x 16 x i8> @dupq_ld1rqb_i8(<16 x i8>* %a) #0 {
+; CHECK-LABEL: dupq_ld1rqb_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %1 = load <16 x i8>, <16 x i8>* %a
+  %2 = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> %1, i64 0)
+  %3 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %2, i64 0)
+  ret <vscale x 16 x i8> %3
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8>, i64)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16>, i64)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64>, i64)
+declare <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half>, i64)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float>, i64)
+declare <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double>, i64)
+
+declare <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double>, <2 x double>, i64)
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float>, <4 x float>, i64)
+declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half>, <8 x half>, i64)
+declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64)
+declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
+declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
+declare <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, i64)
+declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat>, <8 x bfloat>, i64)
+
+attributes #0 = { "target-features"="+sve,+bf16" }