diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19256,6 +19256,45 @@
   return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
 }
 
+// (DUPLANE128 (INSERT_SUBVECTOR undef, (BITCAST X), idx), idx)
+//   => (BITCAST (DUPLANE128 (INSERT_SUBVECTOR undef, X, idx), idx))
+// Keeping the inserted 128-bit subvector in its original element type lets
+// isel match replicating-load patterns such as LD1RQ.
+static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+
+  SDValue Insert = N->getOperand(0);
+  if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
+    return SDValue();
+
+  if (!Insert.getOperand(0).isUndef())
+    return SDValue();
+
+  uint64_t IdxInsert = Insert.getConstantOperandVal(2);
+  uint64_t IdxDupLane = N->getConstantOperandVal(1);
+  if (IdxInsert != IdxDupLane)
+    return SDValue();
+
+  SDValue Bitcast = Insert.getOperand(1);
+  if (Bitcast.getOpcode() != ISD::BITCAST)
+    return SDValue();
+
+  SDValue Subvec = Bitcast.getOperand(0);
+  EVT SubvecVT = Subvec.getValueType();
+  if (!SubvecVT.is128BitVector())
+    return SDValue();
+  EVT NewSubvecVT =
+      getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType());
+
+  SDLoc DL(N);
+  SDValue NewInsert =
+      DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
+                  DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
+  SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
+                                      NewInsert, N->getOperand(1));
+  return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
+}
+
 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -19342,6 +19381,8 @@
     return performCSELCombine(N, DCI, DAG);
   case AArch64ISD::DUP:
     return performDUPCombine(N, DCI);
+  case AArch64ISD::DUPLANE128:
+    return performDupLane128Combine(N, DAG);
   case AArch64ISD::NVCAST:
     return performNVCASTCombine(N);
   case AArch64ISD::SPLICE:
diff --git a/llvm/test/CodeGen/AArch64/sve-ld1r.ll b/llvm/test/CodeGen/AArch64/sve-ld1r.ll
--- a/llvm/test/CodeGen/AArch64/sve-ld1r.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ld1r.ll
@@ -726,8 +726,8 @@
 define <vscale x 2 x double> @dupq_ld1rqd_f64(<2 x double>* %a) {
 ; CHECK-LABEL: dupq_ld1rqd_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ret
   %1 = load <2 x double>, <2 x double>* %a
   %2 = tail call fast <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> %1, i64 0)
@@ -738,8 +738,8 @@
 define <vscale x 4 x float> @dupq_ld1rqw_f32(<4 x float>* %a) {
 ; CHECK-LABEL: dupq_ld1rqw_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ret
   %1 = load <4 x float>, <4 x float>* %a
   %2 = tail call fast <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %1, i64 0)
@@ -750,8 +750,8 @@
 define <vscale x 8 x half> @dupq_ld1rqh_f16(<8 x half>* %a) {
 ; CHECK-LABEL: dupq_ld1rqh_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ret
   %1 = load <8 x half>, <8 x half>* %a
   %2 = tail call fast <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %1, i64 0)
@@ -762,8 +762,8 @@
 define <vscale x 8 x bfloat> @dupq_ld1rqh_bf16(<8 x bfloat>* %a) #0 {
 ; CHECK-LABEL: dupq_ld1rqh_bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ret
   %1 = load <8 x bfloat>, <8 x bfloat>* %a
   %2 = tail call fast <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> undef, <8 x bfloat> %1, i64 0)