diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15889,6 +15889,80 @@
   return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
 }
 
+// Return true if the vector operation can guarantee only the first lane of its
+// result contains data, with all bits in other lanes set to zero.
+static bool isLanes1toNKnownZero(SDValue Op) {
+  switch (Op.getOpcode()) {
+  default:
+    return false;
+  case AArch64ISD::ANDV_PRED:
+  case AArch64ISD::EORV_PRED:
+  case AArch64ISD::FADDA_PRED:
+  case AArch64ISD::FADDV_PRED:
+  case AArch64ISD::FMAXNMV_PRED:
+  case AArch64ISD::FMAXV_PRED:
+  case AArch64ISD::FMINNMV_PRED:
+  case AArch64ISD::FMINV_PRED:
+  case AArch64ISD::ORV_PRED:
+  case AArch64ISD::SADDV_PRED:
+  case AArch64ISD::SMAXV_PRED:
+  case AArch64ISD::SMINV_PRED:
+  case AArch64ISD::UADDV_PRED:
+  case AArch64ISD::UMAXV_PRED:
+  case AArch64ISD::UMINV_PRED:
+    return true;
+  }
+}
+
+// Return true if the value is a splatted vector of either a constant 0 integer
+// or a constant positive 0 floating point value.
+static bool isNullSplat(SDValue Splat) {
+  if (Splat.getOpcode() != ISD::SPLAT_VECTOR)
+    return false;
+
+  SDValue Scalar = Splat.getOperand(0);
+  if (Splat.getValueType().isInteger())
+    return isNullConstant(Scalar);
+
+  return isNullFPConstant(Scalar);
+}
+
+static SDValue performInsertVectorEltCombine(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
+  SDValue InsertVec = N->getOperand(0);
+  SDValue InsertElt = N->getOperand(1);
+  SDValue InsertIdx = N->getOperand(2);
+
+  // We only care about inserts into the first element...
+  if (!isNullConstant(InsertIdx))
+    return SDValue();
+  // ...of a zero'd vector...
+  if (!isNullSplat(InsertVec))
+    return SDValue();
+  // ...where the inserted data was previously extracted...
+  if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+
+  SDValue ExtractVec = InsertElt.getOperand(0);
+  SDValue ExtractIdx = InsertElt.getOperand(1);
+
+  // ...from the first element of a vector.
+  if (!isNullConstant(ExtractIdx))
+    return SDValue();
+
+  // If we get here we are effectively trying to zero lanes 1-N of a vector.
+
+  // Ensure there's no type conversion going on.
+  if (N->getValueType(0) != ExtractVec.getValueType())
+    return SDValue();
+
+  if (!isLanes1toNKnownZero(ExtractVec))
+    return SDValue();
+
+  // The explicit zeroing is redundant.
+  return ExtractVec;
+}
+
 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -15969,8 +16043,12 @@
   case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
   case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
     return performGLD1Combine(N, DAG);
-  case ISD::INSERT_VECTOR_ELT:
-    return performPostLD1Combine(N, DCI, true);
+  case ISD::INSERT_VECTOR_ELT: {
+    SDValue Res = performInsertVectorEltCombine(N, DAG);
+    if (Res == SDValue())
+      return performPostLD1Combine(N, DCI, true);
+    return Res;
+  }
   case ISD::EXTRACT_VECTOR_ELT:
     return performExtractVectorEltCombine(N, DAG);
   case ISD::VECREDUCE_ADD:
diff --git a/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll b/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll
@@ -0,0 +1,239 @@
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Ensure we rely on the reduction's implicit zero filling.
+define <vscale x 16 x i8> @andv_zero_fill(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: andv_zero_fill:
+; CHECK: andv b0, p0, z0.b
+; CHECK-NEXT: ret
+  %t1 = call i8 @llvm.aarch64.sve.andv.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a)
+  %t2 = insertelement <vscale x 16 x i8> zeroinitializer, i8 %t1, i64 0
+  ret <vscale x 16 x i8> %t2
+}
+
+; Ensure we rely on the reduction's implicit zero filling.
+define <vscale x 8 x i16> @eorv_zero_fill(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: eorv_zero_fill:
+; CHECK: eorv h0, p0, z0.h
+; CHECK-NEXT: ret
+  %t1 = call i16 @llvm.aarch64.sve.eorv.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a)
+  %t2 = insertelement <vscale x 8 x i16> zeroinitializer, i16 %t1, i64 0
+  ret <vscale x 8 x i16> %t2
+}
+
+; Ensure we rely on the reduction's implicit zero filling.
+define <vscale x 2 x double> @fadda_zero_fill(<vscale x 2 x i1> %pg, double %init, <vscale x 2 x double> %a) #0 {
+; CHECK-LABEL: fadda_zero_fill:
+; CHECK: fadda d0, p0, d0, z1.d
+; CHECK-NEXT: ret
+  %t1 = call double @llvm.aarch64.sve.fadda.nxv2f64(<vscale x 2 x i1> %pg, double %init, <vscale x 2 x double> %a)
+  %t2 = insertelement <vscale x 2 x double> zeroinitializer, double %t1, i64 0
+  ret <vscale x 2 x double> %t2
+}
+
+; Ensure we rely on the reduction's implicit zero filling.
+define <vscale x 4 x float> @faddv_zero_fill(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: faddv_zero_fill:
+; CHECK: faddv s0, p0, z0.s
+; CHECK-NEXT: ret
+  %t1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a)
+  %t2 = insertelement <vscale x 4 x float> zeroinitializer, float %t1, i64 0
+  ret <vscale x 4 x float> %t2
+}
+
+; Ensure we rely on the reduction's implicit zero filling.
+define <vscale x 8 x half> @fmaxv_zero_fill(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
+; CHECK-LABEL: fmaxv_zero_fill:
+; CHECK: fmaxv h0, p0, z0.h
+; CHECK-NEXT: ret
+  %t1 = call half @llvm.aarch64.sve.fmaxv.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a)
+  %t2 = insertelement <vscale x 8 x half> zeroinitializer, half %t1, i64 0
+  ret <vscale x 8 x half> %t2
+}
+
+; Ensure we rely on the reduction's implicit zero filling.
+define <vscale x 2 x float> @fmaxnmv_zero_fill(<vscale x 2 x i1> %pg, <vscale x 2 x float> %a) #0 {
+; CHECK-LABEL: fmaxnmv_zero_fill:
+; CHECK: fmaxnmv s0, p0, z0.s
+; CHECK-NEXT: ret
+  %t1 = call float @llvm.aarch64.sve.fmaxnmv.nxv2f32(<vscale x 2 x i1> %pg, <vscale x 2 x float> %a)
+  %t2 = insertelement <vscale x 2 x float> zeroinitializer, float %t1, i64 0
+  ret <vscale x 2 x float> %t2
+}
+
+; Ensure we rely on the reduction's implicit zero filling.
+define <vscale x 2 x float> @fminnmv_zero_fill(<vscale x 2 x i1> %pg, <vscale x 2 x float> %a) #0 {
+; CHECK-LABEL: fminnmv_zero_fill:
+; CHECK: fminnmv s0, p0, z0.s
+; CHECK-NEXT: ret
+  %t1 = call float @llvm.aarch64.sve.fminnmv.nxv2f32(<vscale x 2 x i1> %pg, <vscale x 2 x float> %a)
+  %t2 = insertelement <vscale x 2 x float> zeroinitializer, float %t1, i64 0
+  ret <vscale x 2 x float> %t2
+}
+
+; Ensure we rely on the reduction's implicit zero filling.
+define <vscale x 2 x float> @fminv_zero_fill(<vscale x 2 x i1> %pg, <vscale x 2 x float> %a) #0 {
+; CHECK-LABEL: fminv_zero_fill:
+; CHECK: fminv s0, p0, z0.s
+; CHECK-NEXT: ret
+  %t1 = call float @llvm.aarch64.sve.fminv.nxv2f32(<vscale x 2 x i1> %pg, <vscale x 2 x float> %a)
+  %t2 = insertelement <vscale x 2 x float> zeroinitializer, float %t1, i64 0
+  ret <vscale x 2 x float> %t2
+}
+
+; Ensure we rely on the reduction's implicit zero filling.
+define <vscale x 4 x i32> @orv_zero_fill(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: orv_zero_fill:
+; CHECK: orv s0, p0, z0.s
+; CHECK-NEXT: ret
+  %t1 = call i32 @llvm.aarch64.sve.orv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a)
+  %t2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 %t1, i64 0
+  ret <vscale x 4 x i32> %t2
+}
+
+; Ensure we rely on the reduction's implicit zero filling.
+define <vscale x 2 x i64> @saddv_zero_fill(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: saddv_zero_fill:
+; CHECK: saddv d0, p0, z0.b
+; CHECK-NEXT: ret
+  %t1 = call i64 @llvm.aarch64.sve.saddv.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a)
+  %t2 = insertelement <vscale x 2 x i64> zeroinitializer, i64 %t1, i64 0
+  ret <vscale x 2 x i64> %t2
+}
+
+; Ensure we rely on the reduction's implicit zero filling.
+define <vscale x 2 x i64> @smaxv_zero_fill(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: smaxv_zero_fill:
+; CHECK: smaxv d0, p0, z0.d
+; CHECK-NEXT: ret
+  %t1 = call i64 @llvm.aarch64.sve.smaxv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a)
+  %t2 = insertelement <vscale x 2 x i64> zeroinitializer, i64 %t1, i64 0
+  ret <vscale x 2 x i64> %t2
+}
+
+; Ensure we rely on the reduction's implicit zero filling.
+define <vscale x 4 x i32> @sminv_zero_fill(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: sminv_zero_fill:
+; CHECK: sminv s0, p0, z0.s
+; CHECK-NEXT: ret
+  %t1 = call i32 @llvm.aarch64.sve.sminv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a)
+  %t2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 %t1, i64 0
+  ret <vscale x 4 x i32> %t2
+}
+
+; Ensure we rely on the reduction's implicit zero filling.
+define <vscale x 2 x i64> @uaddv_zero_fill(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: uaddv_zero_fill:
+; CHECK: uaddv d0, p0, z0.h
+; CHECK-NEXT: ret
+  %t1 = call i64 @llvm.aarch64.sve.uaddv.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a)
+  %t2 = insertelement <vscale x 2 x i64> zeroinitializer, i64 %t1, i64 0
+  ret <vscale x 2 x i64> %t2
+}
+
+; Ensure we rely on the reduction's implicit zero filling.
+define <vscale x 16 x i8> @umaxv_zero_fill(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: umaxv_zero_fill:
+; CHECK: umaxv b0, p0, z0.b
+; CHECK-NEXT: ret
+  %t1 = call i8 @llvm.aarch64.sve.umaxv.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a)
+  %t2 = insertelement <vscale x 16 x i8> zeroinitializer, i8 %t1, i64 0
+  ret <vscale x 16 x i8> %t2
+}
+
+; Ensure we rely on the reduction's implicit zero filling.
+define <vscale x 2 x i64> @uminv_zero_fill(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: uminv_zero_fill:
+; CHECK: uminv d0, p0, z0.d
+; CHECK-NEXT: ret
+  %t1 = call i64 @llvm.aarch64.sve.uminv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a)
+  %t2 = insertelement <vscale x 2 x i64> zeroinitializer, i64 %t1, i64 0
+  ret <vscale x 2 x i64> %t2
+}
+
+; Ensure explicit zeroing when inserting into a lane other than 0.
+; NOTE: This test doesn't care about the exact way an insert is code generated,
+; so only checks the presence of one instruction from the expected chain.
+define <vscale x 2 x i64> @zero_fill_non_zero_index(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: zero_fill_non_zero_index:
+; CHECK: uminv d{{[0-9]+}}, p0, z0.d
+; CHECK: mov z{{[0-9]+}}.d, p{{[0-9]+}}/m, x{{[0-9]+}}
+; CHECK: ret
+  %t1 = call i64 @llvm.aarch64.sve.uminv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a)
+  %t2 = insertelement <vscale x 2 x i64> zeroinitializer, i64 %t1, i64 1
+  ret <vscale x 2 x i64> %t2
+}
+
+; Ensure explicit zeroing when the result vector is larger than that produced by
+; the reduction instruction.
+define <vscale x 4 x i64> @zero_fill_type_mismatch(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: zero_fill_type_mismatch:
+; CHECK: uminv d0, p0, z0.d
+; CHECK-NEXT: mov z1.d, #0
+; CHECK-NEXT: ret
+  %t1 = call i64 @llvm.aarch64.sve.uminv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a)
+  %t2 = insertelement <vscale x 4 x i64> zeroinitializer, i64 %t1, i64 0
+  ret <vscale x 4 x i64> %t2
+}
+
+; Ensure explicit zeroing when extracting an element from an operation that
+; cannot guarantee lanes 1-N are zero.
+; NOTE: This test doesn't care about the exact way an insert is code generated,
+; so only checks the presence of one instruction from the expected chain.
+define <vscale x 2 x i64> @zero_fill_no_zero_upper_lanes(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: zero_fill_no_zero_upper_lanes:
+; CHECK: umin z{{[0-9]+}}.d, p0/m, z0.d, z0.d
+; CHECK: mov z{{[0-9]+}}.d, p{{[0-9]+}}/m, x{{[0-9]+}}
+; CHECK: ret
+  %t1 = call <vscale x 2 x i64> @llvm.aarch64.sve.umin.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %a)
+  %t2 = extractelement <vscale x 2 x i64> %t1, i64 0
+  %t3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 %t2, i64 0
+  ret <vscale x 2 x i64> %t3
+}
+
+declare i8 @llvm.aarch64.sve.andv.nxv2i8(<vscale x 2 x i1>, <vscale x 2 x i8>)
+declare i8 @llvm.aarch64.sve.andv.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>)
+
+declare i8 @llvm.aarch64.sve.eorv.nxv2i8(<vscale x 2 x i1>, <vscale x 2 x i8>)
+declare i16 @llvm.aarch64.sve.eorv.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>)
+
+declare float @llvm.aarch64.sve.fadda.nxv2f32(<vscale x 2 x i1>, float, <vscale x 2 x float>)
+declare double @llvm.aarch64.sve.fadda.nxv2f64(<vscale x 2 x i1>, double, <vscale x 2 x double>)
+
+declare float @llvm.aarch64.sve.faddv.nxv2f32(<vscale x 2 x i1>, <vscale x 2 x float>)
+declare float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>)
+
+declare float @llvm.aarch64.sve.fmaxnmv.nxv2f32(<vscale x 2 x i1>, <vscale x 2 x float>)
+
+declare half @llvm.aarch64.sve.fmaxv.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>)
+declare float @llvm.aarch64.sve.fmaxv.nxv2f32(<vscale x 2 x i1>, <vscale x 2 x float>)
+
+declare float @llvm.aarch64.sve.fminv.nxv2f32(<vscale x 2 x i1>, <vscale x 2 x float>)
+
+declare float @llvm.aarch64.sve.fminnmv.nxv2f32(<vscale x 2 x i1>, <vscale x 2 x float>)
+
+declare i8 @llvm.aarch64.sve.orv.nxv2i8(<vscale x 2 x i1>, <vscale x 2 x i8>)
+declare i32 @llvm.aarch64.sve.orv.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>)
+
+declare i64 @llvm.aarch64.sve.saddv.nxv2i8(<vscale x 2 x i1>, <vscale x 2 x i8>)
+declare i64 @llvm.aarch64.sve.saddv.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>)
+
+declare i8 @llvm.aarch64.sve.smaxv.nxv2i8(<vscale x 2 x i1>, <vscale x 2 x i8>)
+declare i64 @llvm.aarch64.sve.smaxv.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>)
+
+declare i8 @llvm.aarch64.sve.sminv.nxv2i8(<vscale x 2 x i1>, <vscale x 2 x i8>)
+declare i32 @llvm.aarch64.sve.sminv.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>)
+
+declare i64 @llvm.aarch64.sve.uaddv.nxv2i8(<vscale x 2 x i1>, <vscale x 2 x i8>)
+declare i64 @llvm.aarch64.sve.uaddv.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>)
+
+declare i8 @llvm.aarch64.sve.umaxv.nxv2i8(<vscale x 2 x i1>, <vscale x 2 x i8>)
+declare i8 @llvm.aarch64.sve.umaxv.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>)
+
+declare i8 @llvm.aarch64.sve.uminv.nxv2i8(<vscale x 2 x i1>, <vscale x 2 x i8>)
+declare i64 @llvm.aarch64.sve.uminv.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>)
+
+declare <vscale x 2 x i64> @llvm.aarch64.sve.umin.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+attributes #0 = { "target-features"="+sve" }
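---

For reviewers who want to reproduce the new coverage locally, the commands below are a minimal sketch that mirrors the test's RUN line. The build-directory layout and relative paths are assumptions about a typical LLVM checkout built with the AArch64 target, not something defined by this patch:

  # From the build directory, drive the single new test through lit:
  ./bin/llvm-lit ../llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll

  # Or expand the RUN line by hand:
  ./bin/llc < ../llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll \
    | ./bin/FileCheck ../llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll

With the combine in place, each *_zero_fill function should lower to the lone reduction followed by ret, while the three negative tests (non-zero insert index, wider result type, non-reduction source) should still contain explicit zeroing.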