diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -763,7 +763,6 @@ .customFor({{s32, s32}, {s64, s64}}); // TODO: Custom legalization for s128 - // TODO: v2s64, v2s32, v4s32, v4s16, v8s16 // TODO: Use generic lowering when custom lowering is not possible. auto always = [=](const LegalityQuery &Q) { return true; }; getActionDefinitionsBuilder(G_CTPOP) @@ -772,7 +771,13 @@ .widenScalarToNextPow2(0) .minScalarEltSameAsIf(always, 1, 0) .maxScalarEltSameAsIf(always, 1, 0) - .customFor({{s32, s32}, {s64, s64}}); + .customFor({{s32, s32}, + {s64, s64}, + {v2s64, v2s64}, + {v2s32, v2s32}, + {v4s32, v4s32}, + {v4s16, v4s16}, + {v8s16, v8s16}}); getLegacyLegalizerInfo().computeTables(); verify(*ST.getInstrInfo()); @@ -1111,6 +1116,18 @@ // CNT V0.8B, V0.8B // 8xbyte pop-counts // ADDV B0, V0.8B // sum 8xbyte pop-counts // UMOV X0, V0.B[0] // copy byte result back to integer reg + // + // For 128 bit vector popcounts, we lower to the following sequence: + // cnt.16b v0, v0 // v8s16, v4s32, v2s64 + // uaddlp.8h v0, v0 // v8s16, v4s32, v2s64 + // uaddlp.4s v0, v0 // v4s32, v2s64 + // uaddlp.2d v0, v0 // v2s64 + // + // For 64 bit vector popcounts, we lower to the following sequence: + // cnt.8b v0, v0 // v4s16, v2s32 + // uaddlp.4h v0, v0 // v4s16, v2s32 + // uaddlp.2s v0, v0 // v2s32 + if (!ST->hasNEON() || MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) return false; @@ -1119,27 +1136,68 @@ Register Val = MI.getOperand(1).getReg(); LLT Ty = MRI.getType(Val); - // TODO: Handle vector types. - assert(!Ty.isVector() && "Vector types not handled yet!"); assert(Ty == MRI.getType(Dst) && "Expected src and dst to have the same type!"); - // TODO: Handle s128. unsigned Size = Ty.getSizeInBits(); - assert((Size == 32 || Size == 64) && "Expected only 32 or 64 bit scalars!"); - if (Size == 32) - Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0); - const LLT V8S8 = LLT::fixed_vector(8, LLT::scalar(8)); - Val = MIRBuilder.buildBitcast(V8S8, Val).getReg(0); - auto CTPOP = MIRBuilder.buildCTPOP(V8S8, Val); - auto UADDLV = - MIRBuilder - .buildIntrinsic(Intrinsic::aarch64_neon_uaddlv, {LLT::scalar(32)}, - /*HasSideEffects = */ false) - .addUse(CTPOP.getReg(0)); - if (Size == 64) - MIRBuilder.buildZExt(Dst, UADDLV); + + // Pre-conditioning: widen Val up to the nearest vector type. + // s32,s64,v4s16,v2s32 -> v8i8 + // v8s16,v4s32,v2s64 -> v16i8 + LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8); + if (Ty.isScalar()) { + // TODO: Handle s128. + assert((Size == 32 || Size == 64) && "Expected only 32 or 64 bit scalars!"); + if (Size == 32) { + Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0); + } + } + Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0); + + // Count bits in each byte-sized lane. + auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val); + + // Sum across lanes. 
+  Register HSum = CTPOP.getReg(0);
+  SmallVector<std::pair<LLT, unsigned>> HAdds;
+  if (Ty.isScalar()) {
+    HAdds.emplace_back(LLT::scalar(32), AArch64::UADDLVv8i8v);
+  } else if (Ty == LLT::fixed_vector(8, 16)) {
+    HAdds.emplace_back(LLT::fixed_vector(8, 16), AArch64::UADDLPv16i8_v8i16);
+  } else if (Ty == LLT::fixed_vector(4, 32)) {
+    HAdds.emplace_back(LLT::fixed_vector(8, 16), AArch64::UADDLPv16i8_v8i16);
+    HAdds.emplace_back(LLT::fixed_vector(4, 32), AArch64::UADDLPv8i16_v4i32);
+  } else if (Ty == LLT::fixed_vector(2, 64)) {
+    HAdds.emplace_back(LLT::fixed_vector(8, 16), AArch64::UADDLPv16i8_v8i16);
+    HAdds.emplace_back(LLT::fixed_vector(4, 32), AArch64::UADDLPv8i16_v4i32);
+    HAdds.emplace_back(LLT::fixed_vector(2, 64), AArch64::UADDLPv4i32_v2i64);
+  } else if (Ty == LLT::fixed_vector(4, 16)) {
+    HAdds.emplace_back(LLT::fixed_vector(4, 16), AArch64::UADDLPv8i8_v4i16);
+  } else if (Ty == LLT::fixed_vector(2, 32)) {
+    HAdds.emplace_back(LLT::fixed_vector(4, 16), AArch64::UADDLPv8i8_v4i16);
+    HAdds.emplace_back(LLT::fixed_vector(2, 32), AArch64::UADDLPv4i16_v2i32);
+  } else
+    llvm_unreachable("unexpected vector shape");
+  MachineInstrBuilder UADD;
+  for (auto &HAdd : HAdds) {
+    LLT HTy;
+    unsigned Opc;
+    std::tie(HTy, Opc) = HAdd;
+    UADD = MIRBuilder.buildInstr(Opc, {HTy}, {HSum});
+    constrainSelectedInstRegOperands(*UADD, *ST->getInstrInfo(),
+                                     *MRI.getTargetRegisterInfo(),
+                                     *ST->getRegBankInfo());
+    HSum = UADD.getReg(0);
+  }
+
+  // Post-conditioning.
+  if (Ty.isScalar() && Size == 64)
+    MIRBuilder.buildZExt(Dst, UADD);
   else
-    UADDLV->getOperand(0).setReg(Dst);
+    UADD->getOperand(0).setReg(Dst);
+
+  constrainSelectedInstRegOperands(*UADD, *ST->getInstrInfo(),
+                                   *MRI.getTargetRegisterInfo(),
+                                   *ST->getRegBankInfo());
   MI.eraseFromParent();
   return true;
 }
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir
@@ -48,8 +48,8 @@
     ; CHECK: %copy:_(s32) = COPY $w0
    ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %copy(s32)
    ; CHECK: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[ZEXT]](s64)
-    ; CHECK: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
-    ; CHECK: %ctpop:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>)
+    ; CHECK: [[CTPOP:%[0-9]+]]:fpr64(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
+    ; CHECK: %ctpop:fpr16(s32) = UADDLVv8i8v [[CTPOP]](<8 x s8>)
    ; CHECK: $w0 = COPY %ctpop(s32)
    ; CHECK: RET_ReallyLR implicit $w0
    %copy:_(s32) = COPY $w0
@@ -68,9 +68,9 @@
    ; CHECK: liveins: $x0
    ; CHECK: %copy:_(s64) = COPY $x0
    ; CHECK: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST %copy(s64)
-    ; CHECK: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
-    ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>)
-    ; CHECK: %ctpop:_(s64) = G_ZEXT [[INT]](s32)
+    ; CHECK: [[CTPOP:%[0-9]+]]:fpr64(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
+    ; CHECK: [[UADDLVv8i8v:%[0-9]+]]:fpr16(s32) = UADDLVv8i8v [[CTPOP]](<8 x s8>)
+    ; CHECK: %ctpop:_(s64) = G_ZEXT [[UADDLVv8i8v]](s32)
    ; CHECK: $x0 = COPY %ctpop(s64)
    ; CHECK: RET_ReallyLR implicit $x0
    %copy:_(s64) = COPY $x0
@@ -93,9 +93,9 @@
    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT %copy(s32)
    ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C]]
    ; CHECK: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[AND]](s64)
-    ; CHECK: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
-    ; CHECK: 
[[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>) - ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY [[INT]](s32) + ; CHECK: [[CTPOP:%[0-9]+]]:fpr64(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>) + ; CHECK: [[UADDLVv8i8v:%[0-9]+]]:fpr16(s32) = UADDLVv8i8v [[CTPOP]](<8 x s8>) + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY [[UADDLVv8i8v]](s32) ; CHECK: %ext:_(s32) = COPY [[COPY]](s32) ; CHECK: $w0 = COPY %ext(s32) ; CHECK: RET_ReallyLR implicit $w0 @@ -121,9 +121,9 @@ ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT %copy(s32) ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C]] ; CHECK: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[AND]](s64) - ; CHECK: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>) - ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>) - ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY [[INT]](s32) + ; CHECK: [[CTPOP:%[0-9]+]]:fpr64(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>) + ; CHECK: [[UADDLVv8i8v:%[0-9]+]]:fpr16(s32) = UADDLVv8i8v [[CTPOP]](<8 x s8>) + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY [[UADDLVv8i8v]](s32) ; CHECK: %ext:_(s32) = COPY [[COPY]](s32) ; CHECK: $w0 = COPY %ext(s32) ; CHECK: RET_ReallyLR implicit $w0 @@ -149,9 +149,9 @@ ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT %copy(s32) ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C]] ; CHECK: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[AND]](s64) - ; CHECK: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>) - ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>) - ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY [[INT]](s32) + ; CHECK: [[CTPOP:%[0-9]+]]:fpr64(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>) + ; CHECK: [[UADDLVv8i8v:%[0-9]+]]:fpr16(s32) = UADDLVv8i8v [[CTPOP]](<8 x s8>) + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY [[UADDLVv8i8v]](s32) ; CHECK: %ext:_(s32) = COPY [[COPY]](s32) ; CHECK: $w0 = COPY %ext(s32) ; CHECK: RET_ReallyLR implicit $w0 @@ -176,9 +176,9 @@ ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT %copy(s32) ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C]] ; CHECK: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[AND]](s64) - ; CHECK: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>) - ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>) - ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY [[INT]](s32) + ; CHECK: [[CTPOP:%[0-9]+]]:fpr64(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>) + ; CHECK: [[UADDLVv8i8v:%[0-9]+]]:fpr16(s32) = UADDLVv8i8v [[CTPOP]](<8 x s8>) + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY [[UADDLVv8i8v]](s32) ; CHECK: %ext:_(s32) = COPY [[COPY]](s32) ; CHECK: $w0 = COPY %ext(s32) ; CHECK: RET_ReallyLR implicit $w0 @@ -188,3 +188,112 @@ %ext:_(s32) = G_ANYEXT %ctpop(s16) $w0 = COPY %ext(s32) RET_ReallyLR implicit $w0 + +... +--- +name: custom_8x16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $q0 + + ; CHECK-LABEL: name: custom_8x16 + ; CHECK: liveins: $q0 + ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[COPY]](<8 x s16>) + ; CHECK: [[CTPOP:%[0-9]+]]:fpr128(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>) + ; CHECK: [[UADDLPv16i8_v8i16_:%[0-9]+]]:fpr128(<8 x s16>) = UADDLPv16i8_v8i16 [[CTPOP]](<16 x s8>) + ; CHECK: $q0 = COPY [[UADDLPv16i8_v8i16_]](<8 x s16>) + ; CHECK: RET_ReallyLR implicit $q0 + %0:_(<8 x s16>) = COPY $q0 + %1:_(<8 x s16>) = G_CTPOP %0(<8 x s16>) + $q0 = COPY %1(<8 x s16>) + RET_ReallyLR implicit $q0 + +... 
+--- +name: custom_4x32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $q0 + + ; CHECK-LABEL: name: custom_4x32 + ; CHECK: liveins: $q0 + ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[COPY]](<4 x s32>) + ; CHECK: [[CTPOP:%[0-9]+]]:fpr128(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>) + ; CHECK: [[UADDLPv16i8_v8i16_:%[0-9]+]]:fpr128(<8 x s16>) = UADDLPv16i8_v8i16 [[CTPOP]](<16 x s8>) + ; CHECK: [[UADDLPv8i16_v4i32_:%[0-9]+]]:fpr128(<4 x s32>) = UADDLPv8i16_v4i32 [[UADDLPv16i8_v8i16_]](<8 x s16>) + ; CHECK: $q0 = COPY [[UADDLPv8i16_v4i32_]](<4 x s32>) + ; CHECK: RET_ReallyLR implicit $q0 + %0:_(<4 x s32>) = COPY $q0 + %1:_(<4 x s32>) = G_CTPOP %0(<4 x s32>) + $q0 = COPY %1(<4 x s32>) + RET_ReallyLR implicit $q0 + +... +--- +name: custom_2x64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $q0 + + ; CHECK-LABEL: name: custom_2x64 + ; CHECK: liveins: $q0 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[COPY]](<2 x s64>) + ; CHECK: [[CTPOP:%[0-9]+]]:fpr128(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>) + ; CHECK: [[UADDLPv16i8_v8i16_:%[0-9]+]]:fpr128(<8 x s16>) = UADDLPv16i8_v8i16 [[CTPOP]](<16 x s8>) + ; CHECK: [[UADDLPv8i16_v4i32_:%[0-9]+]]:fpr128(<4 x s32>) = UADDLPv8i16_v4i32 [[UADDLPv16i8_v8i16_]](<8 x s16>) + ; CHECK: [[UADDLPv4i32_v2i64_:%[0-9]+]]:fpr128(<2 x s64>) = UADDLPv4i32_v2i64 [[UADDLPv8i16_v4i32_]](<4 x s32>) + ; CHECK: $q0 = COPY [[UADDLPv4i32_v2i64_]](<2 x s64>) + ; CHECK: RET_ReallyLR implicit $q0 + %0:_(<2 x s64>) = COPY $q0 + %1:_(<2 x s64>) = G_CTPOP %0(<2 x s64>) + $q0 = COPY %1(<2 x s64>) + RET_ReallyLR implicit $q0 + +... +--- +name: custom_4x16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $d0 + + ; CHECK-LABEL: name: custom_4x16 + ; CHECK: liveins: $d0 + ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $d0 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[COPY]](<4 x s16>) + ; CHECK: [[CTPOP:%[0-9]+]]:fpr64(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>) + ; CHECK: [[UADDLPv8i8_v4i16_:%[0-9]+]]:fpr64(<4 x s16>) = UADDLPv8i8_v4i16 [[CTPOP]](<8 x s8>) + ; CHECK: $d0 = COPY [[UADDLPv8i8_v4i16_]](<4 x s16>) + ; CHECK: RET_ReallyLR implicit $d0 + %0:_(<4 x s16>) = COPY $d0 + %1:_(<4 x s16>) = G_CTPOP %0(<4 x s16>) + $d0 = COPY %1(<4 x s16>) + RET_ReallyLR implicit $d0 + +... +--- +name: custom_2x32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $d0 + + ; CHECK-LABEL: name: custom_2x32 + ; CHECK: liveins: $d0 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[COPY]](<2 x s32>) + ; CHECK: [[CTPOP:%[0-9]+]]:fpr64(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>) + ; CHECK: [[UADDLPv8i8_v4i16_:%[0-9]+]]:fpr64(<4 x s16>) = UADDLPv8i8_v4i16 [[CTPOP]](<8 x s8>) + ; CHECK: [[UADDLPv4i16_v2i32_:%[0-9]+]]:fpr64(<2 x s32>) = UADDLPv4i16_v2i32 [[UADDLPv8i8_v4i16_]](<4 x s16>) + ; CHECK: $d0 = COPY [[UADDLPv4i16_v2i32_]](<2 x s32>) + ; CHECK: RET_ReallyLR implicit $d0 + %0:_(<2 x s32>) = COPY $d0 + %1:_(<2 x s32>) = G_CTPOP %0(<2 x s32>) + $d0 = COPY %1(<2 x s32>) + RET_ReallyLR implicit $d0
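
Note on the lowering (editorial addendum, not part of the patch): the CNT/UADDLP sequence described in the comments above can be modelled with plain scalar code. The sketch below is an illustration only, written under stated assumptions: the helper names popcountPerByte and pairwiseWidenAdd are invented for this example and are not LLVM or AArch64 APIs, and a GCC/Clang-style __builtin_popcount is assumed. It traces the v4s32 case (the custom_4x32 test above): one per-byte popcount, then two pairwise widening adds, leaving one popcount per 32-bit lane.

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Model of CNT .16b: popcount of every byte lane.
static std::array<uint8_t, 16> popcountPerByte(std::array<uint8_t, 16> V) {
  for (uint8_t &B : V)
    B = static_cast<uint8_t>(__builtin_popcount(B)); // GCC/Clang builtin assumed
  return V;
}

// Model of UADDLP: add adjacent lane pairs into lanes of twice the width.
template <typename Wide, typename Narrow, std::size_t N>
static std::array<Wide, N / 2> pairwiseWidenAdd(const std::array<Narrow, N> &V) {
  std::array<Wide, N / 2> R{};
  for (std::size_t I = 0; I != N / 2; ++I)
    R[I] = static_cast<Wide>(V[2 * I]) + static_cast<Wide>(V[2 * I + 1]);
  return R;
}

int main() {
  // A v4s32 value viewed as its 16 bytes (little-endian lane order).
  std::array<uint8_t, 16> Bytes = {0xff, 0x01, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00,
                                   0x80, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
  auto Cnt = popcountPerByte(Bytes);           // cnt.16b  v0, v0
  auto H16 = pairwiseWidenAdd<uint16_t>(Cnt);  // uaddlp.8h v0, v0
  auto S32 = pairwiseWidenAdd<uint32_t>(H16);  // uaddlp.4s v0, v0
  for (uint32_t Lane : S32)
    std::printf("%u ", static_cast<unsigned>(Lane)); // prints: 9 4 2 0
  std::printf("\n");
  return 0;
}

The same chain extends by one more pairwise step (uaddlp.2d) for v2s64, and the 64-bit shapes (v4s16, v2s32) follow the identical pattern over 8-byte arrays, matching the HAdds table built in the legalizer change above.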