Index: llvm/include/llvm/Target/TargetSelectionDAG.td
===================================================================
--- llvm/include/llvm/Target/TargetSelectionDAG.td
+++ llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -705,6 +705,9 @@
 def vector_extract_subvec : SDNode<"ISD::EXTRACT_SUBVECTOR",
     SDTypeProfile<1, 2, [SDTCisInt<2>, SDTCisVec<1>, SDTCisVec<0>]>, []>;
+def vector_insert_subvec : SDNode<"ISD::INSERT_SUBVECTOR",
+    SDTypeProfile<1, 3, [SDTCisInt<3>, SDTCisVec<2>, SDTCisVec<1>, SDTCisVec<0>]>,
+    []>;
 
 // This operator does subvector type checking.
 def extract_subvector : SDNode<"ISD::EXTRACT_SUBVECTOR", SDTSubVecExtract, []>;
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19174,6 +19174,46 @@
   return SDValue();
 }
 
+// Loading floating-point literals from the constant pool results in bitcasts
+// to floats from integer loads. Instead, the whole duplane128 intrinsic can
+// be treated as a v2i64 load, as 128 bits are always loaded as integers,
+// and the bitcast can be pushed after the duplane128.
+// Treating all 128-bit combinations of types (e.g. v4i32) as v2i64 results in
+// simpler pattern matching for instruction selection to LD1RQD.
+static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  EVT NewVT;
+  if (VT == MVT::nxv2f64)
+    NewVT = MVT::nxv2i64;
+  else if (VT == MVT::nxv4f32)
+    NewVT = MVT::nxv4i32;
+  else if (VT == MVT::nxv8f16 || VT == MVT::nxv8bf16)
+    NewVT = MVT::nxv8i16;
+  else
+    return SDValue();
+
+  SDLoc DL(N);
+
+  SDValue Insert = N->getOperand(0);
+  if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
+    return SDValue();
+
+  SDValue Bitcast = Insert.getOperand(1);
+  if (Bitcast.getOpcode() != ISD::BITCAST)
+    return SDValue();
+
+  SDValue Load = Bitcast.getOperand(0);
+  if (Load.getOpcode() != ISD::LOAD)
+    return SDValue();
+
+  SDValue NewInsert =
+      DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT, DAG.getUNDEF(NewVT), Load,
+                  Insert->getOperand(2));
+  SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewVT,
+                                      NewInsert, N->getOperand(1));
+  return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
+}
+
 static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
                                       const AArch64Subtarget *Subtarget,
                                       bool fixedSVEVectorVT) {
@@ -19276,6 +19316,8 @@
     return performSVESpliceCombine(N, DAG);
   case ISD::FP_EXTEND:
     return performFPExtendCombine(N, DAG, DCI, Subtarget);
+  case AArch64ISD::DUPLANE128:
+    return performDupLane128Combine(N, DAG);
   case AArch64ISD::BRCOND:
     return performBRCONDCombine(N, DCI, DAG);
   case AArch64ISD::TBNZ:
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -848,10 +848,10 @@
   defm LD1RD_IMM    : sve_mem_ld_dup<0b11, 0b11, "ld1rd",  Z_d, ZPR64, uimm6s8>;
 
   // LD1RQ loads (load quadword-vector and splat to scalable vector)
-  defm LD1RQ_B_IMM  : sve_mem_ldqr_si<0b00, "ld1rqb", Z_b, ZPR8>;
-  defm LD1RQ_H_IMM  : sve_mem_ldqr_si<0b01, "ld1rqh", Z_h, ZPR16>;
-  defm LD1RQ_W_IMM  : sve_mem_ldqr_si<0b10, "ld1rqw", Z_s, ZPR32>;
-  defm LD1RQ_D_IMM  : sve_mem_ldqr_si<0b11, "ld1rqd", Z_d, ZPR64>;
+  defm LD1RQ_B_IMM  : sve_mem_ldqr_si<0b00, "ld1rqb", Z_b, ZPR8,  nxv16i8, v16i8, PTRUE_B>;
+  defm LD1RQ_H_IMM  : sve_mem_ldqr_si<0b01, "ld1rqh", Z_h, ZPR16, nxv8i16, v8i16, PTRUE_H>;
+  defm LD1RQ_W_IMM  : sve_mem_ldqr_si<0b10, "ld1rqw", Z_s, ZPR32, nxv4i32, v4i32, PTRUE_S>;
+  defm LD1RQ_D_IMM  : sve_mem_ldqr_si<0b11, "ld1rqd", Z_d, ZPR64, nxv2i64, v2i64, PTRUE_D>;
   defm LD1RQ_B      : sve_mem_ldqr_ss<0b00, "ld1rqb", Z_b, ZPR8,  GPR64NoXZRshifted8>;
   defm LD1RQ_H      : sve_mem_ldqr_ss<0b01, "ld1rqh", Z_h, ZPR16, GPR64NoXZRshifted16>;
   defm LD1RQ_W      : sve_mem_ldqr_ss<0b10, "ld1rqw", Z_s, ZPR32, GPR64NoXZRshifted32>;
Index: llvm/lib/Target/AArch64/SVEInstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -6897,7 +6897,7 @@
 }
 
 multiclass sve_mem_ldqr_si<bits<2> sz, string asm, RegisterOperand listty,
-                           ZPRRegOp zprty> {
+                           ZPRRegOp zprty, ValueType vt1, ValueType vt2, sve_int_ptrue pred> {
   def NAME : sve_mem_ldqr_si<sz, asm, listty>;
   def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
                   (!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
@@ -6905,6 +6905,9 @@
                   (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
   def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
                   (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s16:$imm4), 0>;
+
+  def : Pat<(vt1 (AArch64duplane128 (vt1 (vector_insert_subvec (vt1 undef), (vt2 (load GPR64sp:$Xn)), (i64 0))), (i64 0))),
+            (!cast<Instruction>(NAME) (pred 31), GPR64sp:$Xn, 0)>;
 }
 
 class sve_mem_ldqr_ss<bits<2> sz, string asm, RegisterOperand VecList,
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
@@ -585,8 +585,9 @@
 ; CHECK-LABEL: dupq_ld1rqd_f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI49_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI49_0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI49_0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = tail call fast <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> , i64 0)
   %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %1, i64 0)
@@ -597,8 +598,9 @@
 ; CHECK-LABEL: dupq_ld1rqw_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI50_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI50_0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI50_0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = tail call fast <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> , i64 0)
   %2 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %1, i64 0)
@@ -609,8 +611,9 @@
 ; CHECK-LABEL: dupq_ld1rqh_f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI51_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI51_0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI51_0
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = tail call fast <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> , i64 0)
   %2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %1, i64 0)
@@ -621,8 +624,9 @@
 ; CHECK-LABEL: dupq_ld1rqh_bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI52_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI52_0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI52_0
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> undef, <8 x bfloat> , i64 0)
   %2 = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %1, i64 0)
@@ -633,44 +637,48 @@
 ; CHECK-LABEL: dupq_ld1rqd_i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI53_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI53_0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI53_0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> , i64 0)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %1, i64 0)
   ret <vscale x 2 x i64> %2
 }
 
-define dso_local <vscale x 4 x i32> @dupq_ld1rqd_i32() {
-; CHECK-LABEL: dupq_ld1rqd_i32:
+define dso_local <vscale x 4 x i32> @dupq_ld1rqw_i32() {
+; CHECK-LABEL: dupq_ld1rqw_i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI54_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI54_0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI54_0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> , i64 0)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %1, i64 0)
   ret <vscale x 4 x i32> %2
 }
 
-define dso_local <vscale x 8 x i16> @dupq_ld1rqd_i16() {
-; CHECK-LABEL: dupq_ld1rqd_i16:
+define dso_local <vscale x 8 x i16> @dupq_ld1rqh_i16() {
+; CHECK-LABEL: dupq_ld1rqh_i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI55_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI55_0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI55_0
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> , i64 0)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %1, i64 0)
   ret <vscale x 8 x i16> %2
 }
 
-define dso_local <vscale x 16 x i8> @dupq_ld1rqd_i8() {
-; CHECK-LABEL: dupq_ld1rqd_i8:
+define dso_local <vscale x 16 x i8> @dupq_ld1rqb_i8() {
+; CHECK-LABEL: dupq_ld1rqb_i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI56_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI56_0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI56_0
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> , i64 0)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %1, i64 0)