Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -425,6 +425,12 @@
     return true;
   }
 
+  /// Return true if the @llvm.get.active.lane.mask intrinsic should be expanded
+  /// using generic code in SelectionDAGBuilder.
+  virtual bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const {
+    return true;
+  }
+
   /// Return true if it is profitable to convert a select of FP constants into
   /// a constant pool load whose address depends on the select condition. The
   /// parameter may be used to differentiate a select with FP compare from
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7105,10 +7105,16 @@
     return;
   }
   case Intrinsic::get_active_lane_mask: {
+    EVT CCVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
     SDValue Index = getValue(I.getOperand(0));
-    SDValue TripCount = getValue(I.getOperand(1));
     EVT ElementVT = Index.getValueType();
-    EVT CCVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+
+    if (!TLI.shouldExpandGetActiveLaneMask(CCVT, ElementVT)) {
+      visitTargetIntrinsic(I, Intrinsic);
+      return;
+    }
+
+    SDValue TripCount = getValue(I.getOperand(1));
     auto VecTy = CCVT.changeVectorElementType(ElementVT);
 
     SDValue VectorIndex, VectorTripCount;
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -844,6 +844,8 @@
   EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty,
                              bool AllowUnknown = false) const override;
 
+  bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override;
+
 private:
   /// Keep a pointer to the AArch64Subtarget around so that we can
   /// make the right decision when generating code for different targets.
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1503,6 +1503,24 @@
   }
 }
 
+bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
+                                                          EVT OpVT) const {
+  // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
+  if (!Subtarget->hasSVE())
+    return true;
+
+  // We can only support legal predicate result types.
+  if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
+      ResVT != MVT::nxv16i1)
+    return true;
+
+  // The whilelo instruction only works with i32 or i64 scalar inputs.
+  if (OpVT != MVT::i32 && OpVT != MVT::i64)
+    return true;
+
+  return false;
+}
+
 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
 
@@ -4290,6 +4308,12 @@
     return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
                        Op.getOperand(2), Op.getOperand(3));
   }
+  case Intrinsic::get_active_lane_mask: {
+    SDValue ID =
+        DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID,
+                       Op.getOperand(1), Op.getOperand(2));
+  }
   }
 }
 
Index: llvm/test/CodeGen/AArch64/active_lane_mask.ll
===================================================================
--- llvm/test/CodeGen/AArch64/active_lane_mask.ll
+++ llvm/test/CodeGen/AArch64/active_lane_mask.ll
@@ -4,43 +4,7 @@
 define <vscale x 16 x i1> @lane_mask_nxv16i1_i32(i32 %index, i32 %TC) {
 ; CHECK-LABEL: lane_mask_nxv16i1_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    index z0.s, #0, #1
-; CHECK-NEXT:    mov z2.s, w0
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    incw z1.s
-; CHECK-NEXT:    add z3.s, z2.s, z0.s
-; CHECK-NEXT:    incw z0.s, all, mul #2
-; CHECK-NEXT:    add z4.s, z2.s, z1.s
-; CHECK-NEXT:    incw z1.s, all, mul #2
-; CHECK-NEXT:    cmphi p1.s, p0/z, z2.s, z3.s
-; CHECK-NEXT:    add z0.s, z2.s, z0.s
-; CHECK-NEXT:    cmphi p2.s, p0/z, z2.s, z4.s
-; CHECK-NEXT:    add z1.s, z2.s, z1.s
-; CHECK-NEXT:    uzp1 p1.h, p1.h, p2.h
-; CHECK-NEXT:    cmphi p2.s, p0/z, z2.s, z0.s
-; CHECK-NEXT:    cmphi p3.s, p0/z, z2.s, z1.s
-; CHECK-NEXT:    mov z2.s, w1
-; CHECK-NEXT:    uzp1 p2.h, p2.h, p3.h
-; CHECK-NEXT:    cmphi p3.s, p0/z, z2.s, z4.s
-; CHECK-NEXT:    cmphi p4.s, p0/z, z2.s, z3.s
-; CHECK-NEXT:    uzp1 p1.b, p1.b, p2.b
-; CHECK-NEXT:    uzp1 p2.h, p4.h, p3.h
-; CHECK-NEXT:    cmphi p3.s, p0/z, z2.s, z0.s
-; CHECK-NEXT:    cmphi p0.s, p0/z, z2.s, z1.s
-; CHECK-NEXT:    ptrue p4.b
-; CHECK-NEXT:    uzp1 p0.h, p3.h, p0.h
-; CHECK-NEXT:    not p1.b, p4/z, p1.b
-; CHECK-NEXT:    uzp1 p0.b, p2.b, p0.b
-; CHECK-NEXT:    and p0.b, p4/z, p1.b, p0.b
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    whilelo p0.b, w0, w1
 ; CHECK-NEXT:    ret
   %active.lane.mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 %index, i32 %TC)
   ret <vscale x 16 x i1> %active.lane.mask
@@ -49,23 +13,7 @@
 define <vscale x 8 x i1> @lane_mask_nxv8i1_i32(i32 %index, i32 %TC) {
 ; CHECK-LABEL: lane_mask_nxv8i1_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    index z0.s, #0, #1
-; CHECK-NEXT:    mov z2.s, w0
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    add z0.s, z2.s, z0.s
-; CHECK-NEXT:    incw z1.s
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    add z1.s, z2.s, z1.s
-; CHECK-NEXT:    cmphi p2.s, p0/z, z2.s, z0.s
-; CHECK-NEXT:    cmphi p3.s, p0/z, z2.s, z1.s
-; CHECK-NEXT:    mov z2.s, w1
-; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    uzp1 p2.h, p2.h, p3.h
-; CHECK-NEXT:    cmphi p3.s, p0/z, z2.s, z1.s
-; CHECK-NEXT:    cmphi p0.s, p0/z, z2.s, z0.s
-; CHECK-NEXT:    not p2.b, p1/z, p2.b
-; CHECK-NEXT:    uzp1 p0.h, p0.h, p3.h
-; CHECK-NEXT:    and p0.b, p1/z, p2.b, p0.b
+; CHECK-NEXT:    whilelo p0.h, w0, w1
 ; CHECK-NEXT:    ret
   %active.lane.mask = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 %index, i32 %TC)
   ret <vscale x 8 x i1> %active.lane.mask
@@ -74,14 +22,7 @@
 define <vscale x 4 x i1> @lane_mask_nxv4i1_i32(i32 %index, i32 %TC) {
 ; CHECK-LABEL: lane_mask_nxv4i1_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    index z0.s, w0, #1
-; CHECK-NEXT:    mov z1.s, w0
-; CHECK-NEXT:    mov z2.s, w1
-; CHECK-NEXT:    cmphi p1.s, p0/z, z1.s, z0.s
-; CHECK-NEXT:    cmphi p2.s, p0/z, z2.s, z0.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    and p0.b, p0/z, p1.b, p2.b
+; CHECK-NEXT:    whilelo p0.s, w0, w1
 ; CHECK-NEXT:    ret
   %active.lane.mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 %index, i32 %TC)
   ret <vscale x 4 x i1> %active.lane.mask
@@ -90,21 +31,7 @@
 define <vscale x 2 x i1> @lane_mask_nxv2i1_i32(i32 %index, i32 %TC) {
 ; CHECK-LABEL: lane_mask_nxv2i1_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    mov z1.d, x0
-; CHECK-NEXT:    index z0.d, #0, #1
-; CHECK-NEXT:    and z1.d, z1.d, #0xffffffff
-; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    mov z2.d, x1
-; CHECK-NEXT:    adr z0.d, [z1.d, z0.d, uxtw]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    and z2.d, z2.d, #0xffffffff
-; CHECK-NEXT:    and z1.d, z1.d, #0xffffffff
-; CHECK-NEXT:    cmpne p1.d, p0/z, z1.d, z0.d
-; CHECK-NEXT:    cmphi p2.d, p0/z, z2.d, z1.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    and p0.b, p0/z, p1.b, p2.b
+; CHECK-NEXT:    whilelo p0.d, w0, w1
 ; CHECK-NEXT:    ret
   %active.lane.mask = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 %index, i32 %TC)
   ret <vscale x 2 x i1> %active.lane.mask
@@ -113,73 +40,7 @@
 define <vscale x 16 x i1> @lane_mask_nxv16i1_i64(i64 %index, i64 %TC) {
 ; CHECK-LABEL: lane_mask_nxv16i1_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    index z0.d, #0, #1
-; CHECK-NEXT:    mov z3.d, x0
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    incd z1.d
-; CHECK-NEXT:    incd z2.d, all, mul #2
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    incd z5.d, all, mul #2
-; CHECK-NEXT:    add z4.d, z3.d, z0.d
-; CHECK-NEXT:    add z6.d, z3.d, z1.d
-; CHECK-NEXT:    add z7.d, z3.d, z2.d
-; CHECK-NEXT:    add z24.d, z3.d, z5.d
-; CHECK-NEXT:    incd z0.d, all, mul #4
-; CHECK-NEXT:    cmphi p1.d, p0/z, z3.d, z4.d
-; CHECK-NEXT:    incd z1.d, all, mul #4
-; CHECK-NEXT:    cmphi p2.d, p0/z, z3.d, z6.d
-; CHECK-NEXT:    cmphi p3.d, p0/z, z3.d, z7.d
-; CHECK-NEXT:    cmphi p4.d, p0/z, z3.d, z24.d
-; CHECK-NEXT:    incd z2.d, all, mul #4
-; CHECK-NEXT:    incd z5.d, all, mul #4
-; CHECK-NEXT:    add z0.d, z3.d, z0.d
-; CHECK-NEXT:    uzp1 p1.s, p1.s, p2.s
-; CHECK-NEXT:    uzp1 p2.s, p3.s, p4.s
-; CHECK-NEXT:    add z1.d, z3.d, z1.d
-; CHECK-NEXT:    add z2.d, z3.d, z2.d
-; CHECK-NEXT:    add z5.d, z3.d, z5.d
-; CHECK-NEXT:    uzp1 p1.h, p1.h, p2.h
-; CHECK-NEXT:    cmphi p2.d, p0/z, z3.d, z0.d
-; CHECK-NEXT:    cmphi p3.d, p0/z, z3.d, z1.d
-; CHECK-NEXT:    cmphi p4.d, p0/z, z3.d, z2.d
-; CHECK-NEXT:    cmphi p5.d, p0/z, z3.d, z5.d
-; CHECK-NEXT:    uzp1 p2.s, p2.s, p3.s
-; CHECK-NEXT:    uzp1 p3.s, p4.s, p5.s
-; CHECK-NEXT:    mov z3.d, x1
-; CHECK-NEXT:    uzp1 p2.h, p2.h, p3.h
-; CHECK-NEXT:    cmphi p3.d, p0/z, z3.d, z6.d
-; CHECK-NEXT:    cmphi p4.d, p0/z, z3.d, z4.d
-; CHECK-NEXT:    uzp1 p1.b, p1.b, p2.b
-; CHECK-NEXT:    uzp1 p2.s, p4.s, p3.s
-; CHECK-NEXT:    cmphi p3.d, p0/z, z3.d, z7.d
-; CHECK-NEXT:    cmphi p4.d, p0/z, z3.d, z24.d
-; CHECK-NEXT:    cmphi p5.d, p0/z, z3.d, z0.d
-; CHECK-NEXT:    cmphi p6.d, p0/z, z3.d, z1.d
-; CHECK-NEXT:    uzp1 p3.s, p3.s, p4.s
-; CHECK-NEXT:    uzp1 p4.s, p5.s, p6.s
-; CHECK-NEXT:    cmphi p5.d, p0/z, z3.d, z2.d
-; CHECK-NEXT:    cmphi p0.d, p0/z, z3.d, z5.d
-; CHECK-NEXT:    uzp1 p0.s, p5.s, p0.s
-; CHECK-NEXT:    ptrue p5.b
-; CHECK-NEXT:    uzp1 p2.h, p2.h, p3.h
-; CHECK-NEXT:    uzp1 p0.h, p4.h, p0.h
-; CHECK-NEXT:    not p1.b, p5/z, p1.b
-; CHECK-NEXT:    uzp1 p0.b, p2.b, p0.b
-; CHECK-NEXT:    and p0.b, p5/z, p1.b, p0.b
-; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    whilelo p0.b, x0, x1
 ; CHECK-NEXT:    ret
   %active.lane.mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index, i64 %TC)
   ret <vscale x 16 x i1> %active.lane.mask
@@ -188,43 +49,7 @@
 define <vscale x 8 x i1> @lane_mask_nxv8i1_i64(i64 %index, i64 %TC) {
 ; CHECK-LABEL: lane_mask_nxv8i1_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    index z0.d, #0, #1
-; CHECK-NEXT:    mov z2.d, x0
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    incd z1.d
-; CHECK-NEXT:    add z3.d, z2.d, z0.d
-; CHECK-NEXT:    incd z0.d, all, mul #2
-; CHECK-NEXT:    add z4.d, z2.d, z1.d
-; CHECK-NEXT:    incd z1.d, all, mul #2
-; CHECK-NEXT:    cmphi p1.d, p0/z, z2.d, z3.d
-; CHECK-NEXT:    add z0.d, z2.d, z0.d
-; CHECK-NEXT:    cmphi p2.d, p0/z, z2.d, z4.d
-; CHECK-NEXT:    add z1.d, z2.d, z1.d
-; CHECK-NEXT:    uzp1 p1.s, p1.s, p2.s
-; CHECK-NEXT:    cmphi p2.d, p0/z, z2.d, z0.d
-; CHECK-NEXT:    cmphi p3.d, p0/z, z2.d, z1.d
-; CHECK-NEXT:    mov z2.d, x1
-; CHECK-NEXT:    uzp1 p2.s, p2.s, p3.s
-; CHECK-NEXT:    cmphi p3.d, p0/z, z2.d, z4.d
-; CHECK-NEXT:    cmphi p4.d, p0/z, z2.d, z3.d
-; CHECK-NEXT:    uzp1 p1.h, p1.h, p2.h
-; CHECK-NEXT:    uzp1 p2.s, p4.s, p3.s
-; CHECK-NEXT:    cmphi p3.d, p0/z, z2.d, z0.d
-; CHECK-NEXT:    cmphi p0.d, p0/z, z2.d, z1.d
-; CHECK-NEXT:    ptrue p4.h
-; CHECK-NEXT:    uzp1 p0.s, p3.s, p0.s
-; CHECK-NEXT:    not p1.b, p4/z, p1.b
-; CHECK-NEXT:    uzp1 p0.h, p2.h, p0.h
-; CHECK-NEXT:    and p0.b, p4/z, p1.b, p0.b
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    whilelo p0.h, x0, x1
 ; CHECK-NEXT:    ret
   %active.lane.mask = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index, i64 %TC)
   ret <vscale x 8 x i1> %active.lane.mask
@@ -233,23 +58,7 @@
 define <vscale x 4 x i1> @lane_mask_nxv4i1_i64(i64 %index, i64 %TC) {
 ; CHECK-LABEL: lane_mask_nxv4i1_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    index z0.d, #0, #1
-; CHECK-NEXT:    mov z2.d, x0
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    add z0.d, z2.d, z0.d
-; CHECK-NEXT:    incd z1.d
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    add z1.d, z2.d, z1.d
-; CHECK-NEXT:    cmphi p2.d, p0/z, z2.d, z0.d
-; CHECK-NEXT:    cmphi p3.d, p0/z, z2.d, z1.d
-; CHECK-NEXT:    mov z2.d, x1
-; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    uzp1 p2.s, p2.s, p3.s
-; CHECK-NEXT:    cmphi p3.d, p0/z, z2.d, z1.d
-; CHECK-NEXT:    cmphi p0.d, p0/z, z2.d, z0.d
-; CHECK-NEXT:    not p2.b, p1/z, p2.b
-; CHECK-NEXT:    uzp1 p0.s, p0.s, p3.s
-; CHECK-NEXT:    and p0.b, p1/z, p2.b, p0.b
+; CHECK-NEXT:    whilelo p0.s, x0, x1
 ; CHECK-NEXT:    ret
   %active.lane.mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index, i64 %TC)
   ret <vscale x 4 x i1> %active.lane.mask
@@ -258,14 +67,7 @@
 define <vscale x 2 x i1> @lane_mask_nxv2i1_i64(i64 %index, i64 %TC) {
 ; CHECK-LABEL: lane_mask_nxv2i1_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    index z0.d, x0, #1
-; CHECK-NEXT:    mov z1.d, x0
-; CHECK-NEXT:    mov z2.d, x1
-; CHECK-NEXT:    cmphi p1.d, p0/z, z1.d, z0.d
-; CHECK-NEXT:    cmphi p2.d, p0/z, z2.d, z0.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    and p0.b, p0/z, p1.b, p2.b
+; CHECK-NEXT:    whilelo p0.d, x0, x1
 ; CHECK-NEXT:    ret
   %active.lane.mask = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index, i64 %TC)
   ret <vscale x 2 x i1> %active.lane.mask
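
Reviewer note, not part of the patch: the new hook defaults to returning true, so targets that do not override it keep the generic expansion in SelectionDAGBuilder and only opt out for the result/operand type combinations they can select directly. A minimal sketch of how another backend could reuse the same pattern follows; MyTargetLowering, MyISD::LANE_MASK and the chosen type combination are hypothetical placeholders for illustration, not APIs introduced by this change.

// Sketch only; assumes a hypothetical target with a native lane-mask
// instruction represented here by the made-up node MyISD::LANE_MASK.
bool MyTargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
                                                     EVT OpVT) const {
  // Keep the generic expansion unless this is a combination the target
  // can lower directly.
  if (ResVT != MVT::nxv4i1)
    return true;
  if (OpVT != MVT::i32 && OpVT != MVT::i64)
    return true;
  return false;
}

SDValue MyTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc dl(Op);
  switch (Op.getConstantOperandVal(0)) {
  default:
    return SDValue();
  case Intrinsic::get_active_lane_mask:
    // Operand 0 is the intrinsic ID; operands 1 and 2 are the index and
    // the trip count, as in the AArch64 lowering above.
    return DAG.getNode(MyISD::LANE_MASK, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
}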