diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -961,6 +961,12 @@
                  LLVMVectorElementType<0>],
                 [IntrNoMem]>;
 
+  class AdvSIMD_SVE_DUPQ_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMMatchType<0>,
+                 llvm_i64_ty],
+                [IntrNoMem]>;
+
   class AdvSIMD_SVE_EXPA_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
                 [LLVMVectorOfBitcastsToInt<0>],
@@ -1474,6 +1480,7 @@
 def int_aarch64_sve_clastb     : AdvSIMD_Pred2VectorArg_Intrinsic;
 def int_aarch64_sve_clastb_n   : AdvSIMD_SVE_ReduceWithInit_Intrinsic;
 def int_aarch64_sve_compact    : AdvSIMD_Pred1VectorArg_Intrinsic;
+def int_aarch64_sve_dupq_lane  : AdvSIMD_SVE_DUPQ_Intrinsic;
 def int_aarch64_sve_ext        : AdvSIMD_2VectorArgIndexed_Intrinsic;
 def int_aarch64_sve_lasta      : AdvSIMD_SVE_Reduce_Intrinsic;
 def int_aarch64_sve_lastb      : AdvSIMD_SVE_Reduce_Intrinsic;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -745,6 +745,7 @@
   SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerDUPQLane(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3036,6 +3036,8 @@
   case Intrinsic::aarch64_sve_ptrue:
     return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(),
                        Op.getOperand(1));
+  case Intrinsic::aarch64_sve_dupq_lane:
+    return LowerDUPQLane(Op, DAG);
 
   case Intrinsic::aarch64_sve_insr: {
     SDValue Scalar = Op.getOperand(2);
@@ -7512,6 +7514,54 @@
   return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
 }
 
+SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+
+  EVT VT = Op.getValueType();
+  if (!isTypeLegal(VT) || !VT.isScalableVector())
+    return SDValue();
+
+  // Current lowering only supports the SVE-ACLE types.
+  if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
+    return SDValue();
+
+  // The DUPQ operation is independent of element type, so normalise to i64s.
+  SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
+  SDValue Idx128 = Op.getOperand(2);
+
+  // DUPQ can be used when idx is in range.
+  auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
+  if (CIdx && (CIdx->getZExtValue() <= 3)) {
+    SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
+    SDNode *DUPQ =
+        DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, V, CI);
+    return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(DUPQ, 0));
+  }
+
+  // The ACLE says this must produce the same result as:
+  //   svtbl(data, svadd_x(svptrue_b64(),
+  //                       svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
+  //                       index * 2))
+  SDValue One = DAG.getConstant(1, DL, MVT::i64);
+  SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
+
+  // Create the vector 0,1,0,1,...
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+  SDValue SV = DAG.getNode(AArch64ISD::INDEX_VECTOR,
+                           DL, MVT::nxv2i64, Zero, One);
+  SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
+
+  // Create the vector idx64,idx64+1,idx64,idx64+1,...
+  SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
+  SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
+  SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
+
+  // Create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
+  SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
+  return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
+}
+
 static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
                                APInt &UndefBits) {
   EVT VT = BVN->getValueType(0);
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
@@ -297,6 +297,179 @@
 }
 
 ;
+; DUPQ
+;
+
+define <vscale x 16 x i8> @dupq_i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: dupq_i8:
+; CHECK: mov z0.q, q0
+; CHECK-NEXT: ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %a, i64 0)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @dupq_i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: dupq_i16:
+; CHECK: mov z0.q, z0.q[1]
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %a, i64 1)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @dupq_i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: dupq_i32:
+; CHECK: mov z0.q, z0.q[2]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %a, i64 2)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @dupq_i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: dupq_i64:
+; CHECK: mov z0.q, z0.q[3]
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %a, i64 3)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 8 x half> @dupq_f16(<vscale x 8 x half> %a) {
+; CHECK-LABEL: dupq_f16:
+; CHECK: mov z0.q, q0
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %a, i64 0)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @dupq_f32(<vscale x 4 x float> %a) {
+; CHECK-LABEL: dupq_f32:
+; CHECK: mov z0.q, z0.q[1]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %a, i64 1)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @dupq_f64(<vscale x 2 x double> %a) {
+; CHECK-LABEL: dupq_f64:
+; CHECK: mov z0.q, z0.q[2]
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %a, i64 2)
+  ret <vscale x 2 x double> %out
+}
+
+;
+; DUPQ_LANE
+;
+
+define <vscale x 16 x i8> @dupq_lane_i8(<vscale x 16 x i8> %a, i64 %idx) {
+; CHECK-LABEL: dupq_lane_i8:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG: mov [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK: add [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK-NEXT: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %a, i64 %idx)
+  ret <vscale x 16 x i8> %out
+}
+
+; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
+define <vscale x 8 x i16> @dupq_lane_i16(<vscale x 8 x i16> %a, i64 %idx) {
+; CHECK-LABEL: dupq_lane_i16:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG: mov [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK: add [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %a, i64 %idx)
+  ret <vscale x 8 x i16> %out
+}
+
+; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
+define <vscale x 4 x i32> @dupq_lane_i32(<vscale x 4 x i32> %a, i64 %idx) {
+; CHECK-LABEL: dupq_lane_i32:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG: mov [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK: add [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %a, i64 %idx)
+  ret <vscale x 4 x i32> %out
+}
+
+; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
+define <vscale x 2 x i64> @dupq_lane_i64(<vscale x 2 x i64> %a, i64 %idx) {
+; CHECK-LABEL: dupq_lane_i64:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG: mov [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK: add [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %a, i64 %idx)
+  ret <vscale x 2 x i64> %out
+}
+
+; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
+define <vscale x 8 x half> @dupq_lane_f16(<vscale x 8 x half> %a, i64 %idx) {
+; CHECK-LABEL: dupq_lane_f16:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG: mov [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK: add [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %a, i64 %idx)
+  ret <vscale x 8 x half> %out
+}
+
+; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
+define <vscale x 4 x float> @dupq_lane_f32(<vscale x 4 x float> %a, i64 %idx) {
+; CHECK-LABEL: dupq_lane_f32:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG: mov [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK: add [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %a, i64 %idx)
+  ret <vscale x 4 x float> %out
+}
+
+; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
+define <vscale x 2 x double> @dupq_lane_f64(<vscale x 2 x double> %a, i64 %idx) {
+; CHECK-LABEL: dupq_lane_f64:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG: mov [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK: add [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %a, i64 %idx)
+  ret <vscale x 2 x double> %out
+}
+
+; NOTE: Index out of range (0-3)
+define <vscale x 2 x i64> @dupq_i64_range(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: dupq_i64_range:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[Z3:z[0-9]+]].d, [[Z2]].d, #8
+; CHECK: tbl z0.d, { z0.d }, [[Z3]].d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %a, i64 4)
+  ret <vscale x 2 x i64> %out
+}
+
+;
 ; EXT
 ;
 
@@ -1616,6 +1789,14 @@
 declare <vscale x 4 x float> @llvm.aarch64.sve.compact.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>)
 declare <vscale x 2 x double> @llvm.aarch64.sve.compact.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>)
 
+declare <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8>, i64)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16>, i64)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64>, i64)
+declare <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float>, i64)
+declare <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double>, i64)
+
 declare <vscale x 16 x i8> @llvm.aarch64.sve.ext.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.ext.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.ext.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
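
For reference, a minimal IR-level usage sketch (the function name @example_dupq_s32 is illustrative and not part of the patch; it assumes a frontend mapping the ACLE svdupq_lane forms onto this intrinsic). A constant in-range index like this is expected to select the DUP_ZZI_Q form, i.e. "mov z0.q, z0.q[2]", while a variable or out-of-range index falls back to the index/and/add/tbl expansion exercised by the dupq_lane_* and dupq_i64_range tests above:

; Illustrative only: broadcast the 128-bit segment at index 2 of %v to every
; segment of the result.
define <vscale x 4 x i32> @example_dupq_s32(<vscale x 4 x i32> %v) {
  %r = call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %v, i64 2)
  ret <vscale x 4 x i32> %r
}

declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)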