diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19199,6 +19199,37 @@
   return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
 }
 
+static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+
+  SDValue Insert = N->getOperand(0);
+  if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
+    return SDValue();
+
+  if (!Insert.getOperand(0).isUndef())
+    return SDValue();
+
+  auto IdxInsert = cast<ConstantSDNode>(Insert.getOperand(2))->getZExtValue();
+  auto IdxDupLane = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  if (IdxInsert != IdxDupLane)
+    return SDValue();
+
+  SDValue Bitcast = Insert.getOperand(1);
+  SDValue Load = Bitcast.getOperand(0);
+  EVT LoadVT = Load.getValueType();
+  if (!LoadVT.isFixedLengthVector())
+    return SDValue();
+  EVT NewVT = getPackedSVEVectorVT(Load.getValueType().getVectorElementType());
+
+  SDLoc DL(N);
+  SDValue NewInsert =
+      DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT, DAG.getUNDEF(NewVT), Load,
+                  Insert->getOperand(2));
+  SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewVT,
+                                      NewInsert, N->getOperand(1));
+  return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
+}
+
 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -19285,6 +19316,8 @@
     return performCSELCombine(N, DCI, DAG);
   case AArch64ISD::DUP:
     return performDUPCombine(N, DCI);
+  case AArch64ISD::DUPLANE128:
+    return performDupLane128Combine(N, DAG);
   case AArch64ISD::NVCAST:
     return performNVCASTCombine(N);
   case AArch64ISD::SPLICE:
diff --git a/llvm/test/CodeGen/AArch64/sve-ld1r.ll b/llvm/test/CodeGen/AArch64/sve-ld1r.ll
--- a/llvm/test/CodeGen/AArch64/sve-ld1r.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ld1r.ll
@@ -726,8 +726,8 @@
 define <vscale x 2 x double> @dupq_ld1rqd_f64_ptr(<2 x double>* %a) {
 ; CHECK-LABEL: dupq_ld1rqd_f64_ptr:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ret
   %1 = load <2 x double>, <2 x double>* %a
   %2 = tail call fast <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> %1, i64 0)
@@ -738,8 +738,8 @@
 define <vscale x 4 x float> @dupq_ld1rqw_f32_ptr(<4 x float>* %a) {
 ; CHECK-LABEL: dupq_ld1rqw_f32_ptr:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ret
   %1 = load <4 x float>, <4 x float>* %a
   %2 = tail call fast <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %1, i64 0)
@@ -750,8 +750,8 @@
 define <vscale x 8 x half> @dupq_ld1rqh_f16_ptr(<8 x half>* %a) {
 ; CHECK-LABEL: dupq_ld1rqh_f16_ptr:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ret
   %1 = load <8 x half>, <8 x half>* %a
   %2 = tail call fast <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %1, i64 0)
@@ -762,8 +762,8 @@
 define <vscale x 8 x bfloat> @dupq_ld1rqh_bf16_ptr(<8 x bfloat>* %a) #0 {
 ; CHECK-LABEL: dupq_ld1rqh_bf16_ptr:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ret
   %1 = load <8 x bfloat>, <8 x bfloat>* %a
   %2 = tail call fast <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> undef, <8 x bfloat> %1, i64 0)
@@ -819,6 +819,110 @@
   ret <vscale x 16 x i8> %3
 }
 
+define dso_local <vscale x 2 x double> @dupq_ld1rqd_f64() {
+; CHECK-LABEL: dupq_ld1rqd_f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI61_0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI61_0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %1 = tail call fast <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> <double 1.000000e+00, double 2.000000e+00>, i64 0)
+  %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %1, i64 0)
+  ret <vscale x 2 x double> %2
+}
+
+define dso_local <vscale x 4 x float> @dupq_ld1rqw_f32() {
+; CHECK-LABEL: dupq_ld1rqw_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI62_0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI62_0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %1 = tail call fast <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, i64 0)
+  %2 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %1, i64 0)
+  ret <vscale x 4 x float> %2
+}
+
+define dso_local <vscale x 8 x half> @dupq_ld1rqh_f16() {
+; CHECK-LABEL: dupq_ld1rqh_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI63_0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI63_0
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %1 = tail call fast <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> <half 1.000000e+00, half 2.000000e+00, half 3.000000e+00, half 4.000000e+00, half 5.000000e+00, half 6.000000e+00, half 7.000000e+00, half 8.000000e+00>, i64 0)
+  %2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %1, i64 0)
+  ret <vscale x 8 x half> %2
+}
+
+define dso_local <vscale x 8 x bfloat> @dupq_ld1rqh_bf16() #0 {
+; CHECK-LABEL: dupq_ld1rqh_bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI64_0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI64_0
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %1 = call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> undef, <8 x bfloat> <bfloat 1.000000e+00, bfloat 2.000000e+00, bfloat 3.000000e+00, bfloat 4.000000e+00, bfloat 5.000000e+00, bfloat 6.000000e+00, bfloat 7.000000e+00, bfloat 8.000000e+00>, i64 0)
+  %2 = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %1, i64 0)
+  ret <vscale x 8 x bfloat> %2
+}
+
+define dso_local <vscale x 2 x i64> @dupq_ld1rqd_i64() {
+; CHECK-LABEL: dupq_ld1rqd_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI65_0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI65_0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %1 = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> <i64 1, i64 2>, i64 0)
+  %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %1, i64 0)
+  ret <vscale x 2 x i64> %2
+}
+
+define dso_local <vscale x 4 x i32> @dupq_ld1rqw_i32() {
+; CHECK-LABEL: dupq_ld1rqw_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI66_0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI66_0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %1 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i64 0)
+  %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %1, i64 0)
+  ret <vscale x 4 x i32> %2
+}
+
+define dso_local <vscale x 8 x i16> @dupq_ld1rqh_i16() {
+; CHECK-LABEL: dupq_ld1rqh_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI67_0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI67_0
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %1 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, i64 0)
+  %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %1, i64 0)
+  ret <vscale x 8 x i16> %2
+}
+
+define dso_local <vscale x 16 x i8> @dupq_ld1rqb_i8() {
+; CHECK-LABEL: dupq_ld1rqb_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI68_0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI68_0
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %1 = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>, i64 0)
+  %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %1, i64 0)
+  ret <vscale x 16 x i8> %2
+}
+
 declare <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8>, i64)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16>, i64)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)
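
Note: a minimal standalone reproducer for the combine, not part of the patch itself; the function name @repro, the file name repro.ll and the llc invocation are illustrative (the test file's own RUN line is authoritative):

  ; llc -mtriple=aarch64-linux-gnu -mattr=+sve repro.ll -o -
  define <vscale x 2 x double> @repro(<2 x double>* %a) {
    ; 128-bit fixed-length load ...
    %ld = load <2 x double>, <2 x double>* %a
    ; ... inserted at index 0 of an undef scalable vector ...
    %ins = tail call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> %ld, i64 0)
    ; ... and splatted across every 128-bit quadword of the scalable vector
    %dup = tail call <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %ins, i64 0)
    ret <vscale x 2 x double> %dup
  }

  declare <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double>, <2 x double>, i64)
  declare <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double>, i64)

Before this change the splat selected "ldr q0, [x0]" followed by "mov z0.q, q0"; with performDupLane128Combine pushing the bitcast outside the INSERT_SUBVECTOR/DUPLANE128 pair, it selects "ptrue p0.d" plus "ld1rqd { z0.d }, p0/z, [x0]", as the updated dupq_ld1rqd_f64_ptr checks above show.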