Index: llvm/include/llvm/Target/TargetSelectionDAG.td
===================================================================
--- llvm/include/llvm/Target/TargetSelectionDAG.td
+++ llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -705,6 +705,9 @@
 def vector_extract_subvec : SDNode<"ISD::EXTRACT_SUBVECTOR",
     SDTypeProfile<1, 2, [SDTCisInt<2>, SDTCisVec<1>, SDTCisVec<0>]>, []>;
+def vector_insert_subvec : SDNode<"ISD::INSERT_SUBVECTOR",
+    SDTypeProfile<1, 3, [SDTCisInt<3>, SDTCisVec<2>, SDTCisVec<1>, SDTCisVec<0>]>,
+    []>;
 
 // This operator does subvector type checking.
 def extract_subvector : SDNode<"ISD::EXTRACT_SUBVECTOR", SDTSubVecExtract, []>;
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19174,6 +19174,40 @@
   return SDValue();
 }
 
+// Loading floating-point literals from the constant pool results in an
+// integer load followed by a bitcast to the floating-point type. Instead, the
+// whole DUPLANE128 can be treated as a v2i64 load, since a 128-bit segment is
+// always loaded as integers, and the bitcast can be pushed past the
+// DUPLANE128. Treating all 128-bit combinations of types (e.g. v4i32) as
+// v2i64 also gives simpler pattern matching when selecting LD1RQD.
+static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  if (VT == MVT::nxv2i64)
+    return SDValue();
+
+  SDLoc DL(N);
+
+  SDValue Insert = N->getOperand(0);
+  if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
+    return SDValue();
+
+  SDValue Load = Insert.getOperand(1);
+  if (Load.getOpcode() == ISD::BITCAST)
+    Load = Load.getOperand(0);
+  if (Load.getOpcode() != ISD::LOAD)
+    return SDValue();
+  LoadSDNode *LD = cast<LoadSDNode>(Load);
+
+  SDValue NewLD = DAG.getLoad(MVT::v2i64, DL, LD->getChain(), LD->getBasePtr(),
+                              LD->getMemOperand());
+  SDValue NewInsert =
+      DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::nxv2i64,
+                  DAG.getUNDEF(MVT::nxv2i64), NewLD, Insert->getOperand(2));
+  SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, MVT::nxv2i64,
+                                      NewInsert, N->getOperand(1));
+  return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
+}
+
 static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
                                       const AArch64Subtarget *Subtarget,
                                       bool fixedSVEVectorVT) {
@@ -19276,6 +19310,8 @@
     return performSVESpliceCombine(N, DAG);
   case ISD::FP_EXTEND:
     return performFPExtendCombine(N, DAG, DCI, Subtarget);
+  case AArch64ISD::DUPLANE128:
+    return performDupLane128Combine(N, DAG);
   case AArch64ISD::BRCOND:
     return performBRCONDCombine(N, DCI, DAG);
   case AArch64ISD::TBNZ:
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -857,6 +857,9 @@
   defm LD1RQ_W  : sve_mem_ldqr_ss<0b10, "ld1rqw", Z_s, ZPR32, GPR64NoXZRshifted32>;
   defm LD1RQ_D  : sve_mem_ldqr_ss<0b11, "ld1rqd", Z_d, ZPR64, GPR64NoXZRshifted64>;
 
+  def : Pat<(nxv2i64 (AArch64duplane128 (nxv2i64 (vector_insert_subvec (nxv2i64 undef), (v2i64 (load GPR64sp:$Xn)), (i64 0))), (i64 0))),
+            (LD1RQ_D_IMM (PTRUE_D 31), GPR64sp:$Xn, 0)>;
+
   // continuous load with reg+reg addressing.
   defm LD1B    : sve_mem_cld_ss<0b0000, "ld1b",  Z_b, ZPR8,  GPR64NoXZRshifted8>;
   defm LD1B_H  : sve_mem_cld_ss<0b0001, "ld1b",  Z_h, ZPR16, GPR64NoXZRshifted8>;
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
@@ -585,44 +585,48 @@
 ; CHECK-LABEL: dupq_ld1rqd_f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI49_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI49_0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI49_0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = tail call fast <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> , i64 0)
   %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %1, i64 0)
   ret <vscale x 2 x double> %2
 }
 
-define dso_local <vscale x 4 x float> @dupq_ld1rqw_f32() {
-; CHECK-LABEL: dupq_ld1rqw_f32:
+define dso_local <vscale x 4 x float> @dupq_ld1rqd_f32() {
+; CHECK-LABEL: dupq_ld1rqd_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI50_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI50_0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI50_0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = tail call fast <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> , i64 0)
   %2 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %1, i64 0)
   ret <vscale x 4 x float> %2
 }
 
-define dso_local <vscale x 8 x half> @dupq_ld1rqh_f16() {
-; CHECK-LABEL: dupq_ld1rqh_f16:
+define dso_local <vscale x 8 x half> @dupq_ld1rqd_f16() {
+; CHECK-LABEL: dupq_ld1rqd_f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI51_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI51_0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI51_0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = tail call fast <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> , i64 0)
   %2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %1, i64 0)
   ret <vscale x 8 x half> %2
 }
 
-define dso_local <vscale x 8 x bfloat> @dupq_ld1rqh_bf16() #0 {
-; CHECK-LABEL: dupq_ld1rqh_bf16:
+define dso_local <vscale x 8 x bfloat> @dupq_ld1rqd_bf16() #0 {
+; CHECK-LABEL: dupq_ld1rqd_bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI52_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI52_0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI52_0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> undef, <8 x bfloat> , i64 0)
   %2 = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %1, i64 0)
@@ -633,8 +637,9 @@
 ; CHECK-LABEL: dupq_ld1rqd_i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI53_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI53_0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI53_0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> , i64 0)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %1, i64 0)
@@ -645,8 +650,9 @@
 ; CHECK-LABEL: dupq_ld1rqd_i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI54_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI54_0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI54_0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> , i64 0)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %1, i64 0)
@@ -657,8 +663,9 @@
 ; CHECK-LABEL: dupq_ld1rqd_i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI55_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI55_0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI55_0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> , i64 0)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %1, i64 0)
@@ -669,8 +676,9 @@
 ; CHECK-LABEL: dupq_ld1rqd_i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI56_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI56_0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI56_0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> , i64 0)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %1, i64 0)
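
Note: a minimal sketch (not part of the patch) of the kind of C source that gives rise to the IR in the tests above, assuming clang lowers the ACLE svdupq_n_f64 builtin to llvm.vector.insert followed by llvm.aarch64.sve.dupq.lane with index 0; the function name and constants here are hypothetical.

  #include <arm_sve.h>

  // Build an svfloat64_t in which every 128-bit segment holds {1.0, 2.0}.
  // In the tests above this shape of IR used to select "ldr q0" followed by
  // "mov z0.q, q0"; with the new combine and pattern it selects a single
  // predicated "ld1rqd" from the constant pool.
  svfloat64_t splat_pair(void) {
    return svdupq_n_f64(1.0, 2.0);
  }

Compiling with SVE enabled (e.g. clang -O2 -march=armv8-a+sve) should show the difference in the generated assembly.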