Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10475,14 +10475,58 @@
                          DAG.getConstant(0, DL, MVT::i64), SplatVal);
 }
 
+static SDValue tryLowerLD1RQ(SDValue Op, SelectionDAG &DAG, SDLoc &DL,
+                             EVT &VT) {
+  SDValue Op1 = Op.getOperand(1);
+  if (Op1.getOpcode() != ISD::INSERT_SUBVECTOR)
+    return SDValue();
+
+  // Bail out on BUILD_VECTOR; come back later once the vector constant is
+  // available in the constant pool and its load result can be used by LD1RQ.
+  if (Op1.getOperand(1).getOpcode() == ISD::BUILD_VECTOR)
+    return Op1.getOperand(1);
+
+  SDValue Load = Op1.getOperand(1);
+  if (Load.getOpcode() == ISD::BITCAST)
+    Load = Load.getOperand(0);
+  if (Load.getOpcode() != ISD::LOAD)
+    return SDValue();
+
+  int Opcode;
+  EVT VecElTy = VT.getVectorElementType();
+  if (VecElTy == MVT::f64 || VecElTy == MVT::i64)
+    Opcode = AArch64::LD1RQ_D_IMM;
+  else if (VecElTy == MVT::f32 || VecElTy == MVT::i32)
+    Opcode = AArch64::LD1RQ_W_IMM;
+  else if (VecElTy == MVT::f16 || VecElTy == MVT::i16 || VecElTy == MVT::bf16)
+    Opcode = AArch64::LD1RQ_H_IMM;
+  else if (VecElTy == MVT::i8)
+    Opcode = AArch64::LD1RQ_B_IMM;
+  else
+    return SDValue();
+
+  SDValue Label = Load.getOperand(1);
+  SDNode *LD1RQ = DAG.getMachineNode(
+      Opcode, DL, Op1.getValueType(),
+      getPTrue(DAG, DL, MVT::nxv16i1, AArch64SVEPredPattern::all), Label,
+      DAG.getTargetConstant(0, DL, MVT::i64));
+  return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(LD1RQ, 0));
+}
+
 SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
                                              SelectionDAG &DAG) const {
   SDLoc DL(Op);
-
   EVT VT = Op.getValueType();
   if (!isTypeLegal(VT) || !VT.isScalableVector())
     return SDValue();
 
+  SDValue LD1RQResult = tryLowerLD1RQ(Op, DAG, DL, VT);
+  if (LD1RQResult) {
+    if (LD1RQResult.getOpcode() == ISD::BUILD_VECTOR)
+      return SDValue();
+    return LD1RQResult;
+  }
+
   // Current lowering only supports the SVE-ACLE types.
   if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
     return SDValue();
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
@@ -535,8 +535,9 @@
 ; CHECK-LABEL: dupq_ld1rqd_f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI49_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI49_0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI49_0
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = tail call fast <vscale x 2 x double> @llvm.experimental.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> , i64 0)
   %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %1, i64 0)
@@ -547,8 +548,9 @@
 ; CHECK-LABEL: dupq_ld1rqw_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI50_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI50_0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI50_0
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = tail call fast <vscale x 4 x float> @llvm.experimental.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> , i64 0)
   %2 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %1, i64 0)
@@ -559,8 +561,9 @@
 ; CHECK-LABEL: dupq_ld1rqh_f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI51_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI51_0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI51_0
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = tail call fast <vscale x 8 x half> @llvm.experimental.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> , i64 0)
   %2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %1, i64 0)
@@ -571,8 +574,9 @@
 ; CHECK-LABEL: dupq_ld1rqh_bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI52_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI52_0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI52_0
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 8 x bfloat> @llvm.experimental.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> undef, <8 x bfloat> , i64 0)
   %2 = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %1, i64 0)
@@ -583,44 +587,48 @@
 ; CHECK-LABEL: dupq_ld1rqd_i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI53_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI53_0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI53_0
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> , i64 0)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %1, i64 0)
   ret <vscale x 2 x i64> %2
 }
 
-define dso_local <vscale x 4 x i32> @dupq_ld1rqd_i32() {
-; CHECK-LABEL: dupq_ld1rqd_i32:
+define dso_local <vscale x 4 x i32> @dupq_ld1rqw_i32() {
+; CHECK-LABEL: dupq_ld1rqw_i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI54_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI54_0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI54_0
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> , i64 0)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %1, i64 0)
   ret <vscale x 4 x i32> %2
 }
 
-define dso_local <vscale x 8 x i16> @dupq_ld1rqd_i16() {
-; CHECK-LABEL: dupq_ld1rqd_i16:
+define dso_local <vscale x 8 x i16> @dupq_ld1rqh_i16() {
+; CHECK-LABEL: dupq_ld1rqh_i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI55_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI55_0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI55_0
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> , i64 0)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %1, i64 0)
   ret <vscale x 8 x i16> %2
 }
 
-define dso_local <vscale x 16 x i8> @dupq_ld1rqd_i8() {
-; CHECK-LABEL: dupq_ld1rqd_i8:
+define dso_local <vscale x 16 x i8> @dupq_ld1rqb_i8() {
+; CHECK-LABEL: dupq_ld1rqb_i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI56_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI56_0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI56_0
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> , i64 0)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %1, i64 0)