diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -5622,7 +5622,21 @@
 
     auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
     auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));
-    MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);
+
+    // A popcount of an N-bit value can be as large as N, so the sum needs
+    // Log2_64_Ceil(N + 1) bits. Log2_64_Ceil(N) is one bit short when N is a
+    // power of two: an all-ones s256 has a count of 256, which wraps to zero
+    // in an s8 add.
+    uint64_t MaxCount = SrcTy.getSizeInBits();
+    LLT CountTy = LLT::scalar(Log2_64_Ceil(MaxCount + 1));
+    if (CountTy.getSizeInBits() < DstTy.getSizeInBits()) {
+      LoCTPOP = MIRBuilder.buildTrunc(CountTy, LoCTPOP);
+      HiCTPOP = MIRBuilder.buildTrunc(CountTy, HiCTPOP);
+      auto Add = MIRBuilder.buildAdd(CountTy, HiCTPOP, LoCTPOP);
+      MIRBuilder.buildZExt(DstReg, Add);
+    } else {
+      MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);
+    }
 
     MI.eraseFromParent();
     return Legalized;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -764,7 +764,6 @@
   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
       .customFor({{s32, s32}, {s64, s64}});
 
-  // TODO: Custom legalization for s128
   // TODO: Use generic lowering when custom lowering is not possible.
   auto always = [=](const LegalityQuery &Q) { return true; };
   getActionDefinitionsBuilder(G_CTPOP)
@@ -775,6 +774,7 @@
       .maxScalarEltSameAsIf(always, 1, 0)
       .customFor({{s32, s32},
                   {s64, s64},
+                  {s128, s128},
                   {v2s64, v2s64},
                   {v2s32, v2s32},
                   {v4s32, v4s32},
@@ -1151,8 +1151,8 @@
   // v8s16,v4s32,v2s64 -> v16i8
   LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
   if (Ty.isScalar()) {
-    // TODO: Handle s128.
-    assert((Size == 32 || Size == 64) && "Expected only 32 or 64 bit scalars!");
+    assert((Size == 32 || Size == 64 || Size == 128) &&
+           "Expected only 32, 64, or 128 bit scalars!");
     if (Size == 32) {
       Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
     }
@@ -1198,7 +1198,7 @@
   }
 
   // Post-conditioning.
-  if (Ty.isScalar() && Size == 64)
+  if (Ty.isScalar() && (Size == 64 || Size == 128))
     MIRBuilder.buildZExt(Dst, UADD);
   else
     UADD->getOperand(0).setReg(Dst);
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir
@@ -78,6 +78,30 @@
     $x0 = COPY %ctpop(s64)
     RET_ReallyLR implicit $x0
 
+...
+---
+name: s128_lower
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $q0
+    ; CHECK-LABEL: name: s128_lower
+    ; CHECK: liveins: $q0
+    ; CHECK: %copy:_(s128) = COPY $q0
+    ; CHECK: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST %copy(s128)
+    ; CHECK: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>)
+    ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<16 x s8>)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[INT]](s32), [[C]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK: %ctpop:_(s128) = G_MERGE_VALUES [[MV]](s64), [[C1]](s64)
+    ; CHECK: $q0 = COPY %ctpop(s128)
+    ; CHECK: RET_ReallyLR implicit $q0
+    %copy:_(s128) = COPY $q0
+    %ctpop:_(s128) = G_CTPOP %copy(s128)
+    $q0 = COPY %ctpop(s128)
+    RET_ReallyLR implicit $q0
+
 ...
 ---
 name: widen_s16
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -5,15 +5,12 @@
 define i8 @popcount128(i128* nocapture nonnull readonly %0) {
 ; CHECK-LABEL: popcount128:
 ; CHECK:       // %bb.0: // %Entry
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    // implicit-def: $q0
-; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.d[1], x8
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    cnt v0.16b, v0.16b
 ; CHECK-NEXT:    uaddlv h1, v0.16b
 ; CHECK-NEXT:    // implicit-def: $q0
 ; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
 Entry:
@@ -30,27 +27,35 @@
 define i16 @popcount256(i256* nocapture nonnull readonly %0) {
 ; CHECK-LABEL: popcount256:
 ; CHECK:       // %bb.0: // %Entry
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x0, #24]
-; CHECK-NEXT:    ldr d1, [x0, #16]
+; CHECK-NEXT:    ldr x11, [x0]
+; CHECK-NEXT:    ldr x10, [x0, #8]
+; CHECK-NEXT:    ldr x9, [x0, #16]
+; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    // implicit-def: $q0
-; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.d[1], x9
+; CHECK-NEXT:    mov v0.d[0], x11
+; CHECK-NEXT:    mov v0.d[1], x10
+; CHECK-NEXT:    // implicit-def: $q1
+; CHECK-NEXT:    mov v1.d[0], x9
+; CHECK-NEXT:    mov v1.d[1], x8
 ; CHECK-NEXT:    cnt v0.16b, v0.16b
-; CHECK-NEXT:    uaddlv h1, v0.16b
+; CHECK-NEXT:    uaddlv h2, v0.16b
 ; CHECK-NEXT:    // implicit-def: $q0
-; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT:    cnt v1.16b, v1.16b
+; CHECK-NEXT:    uaddlv h2, v1.16b
+; CHECK-NEXT:    // implicit-def: $q1
+; CHECK-NEXT:    mov v1.16b, v2.16b
+; CHECK-NEXT:    // kill: def $s1 killed $s1 killed $q1
+; CHECK-NEXT:    fmov w8, s1
 ; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    // implicit-def: $q0
-; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.d[1], x8
-; CHECK-NEXT:    cnt v0.16b, v0.16b
-; CHECK-NEXT:    uaddlv h1, v0.16b
-; CHECK-NEXT:    // implicit-def: $q0
-; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    add w0, w8, w9
+; CHECK-NEXT:    add w8, w8, w9
+; CHECK-NEXT:    // implicit-def: $w9
+; CHECK-NEXT:    // kill: def $x8 killed $w8
+; CHECK-NEXT:    // kill: def $x9 killed $w9
+; CHECK-NEXT:    bfi x8, x9, #32, #32
+; CHECK-NEXT:    and x8, x8, #0x1ff
+; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    ret
 Entry:
   %1 = load i256, i256* %0, align 16
@@ -66,16 +71,19 @@
 ; CHECK-LABEL: popcount1x128:
 ; CHECK:       // %bb.0: // %Entry
 ; CHECK-NEXT:    // implicit-def: $q0
-; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    mov v0.d[0], x0
 ; CHECK-NEXT:    mov v0.d[1], x1
 ; CHECK-NEXT:    cnt v0.16b, v0.16b
 ; CHECK-NEXT:    uaddlv h1, v0.16b
 ; CHECK-NEXT:    // implicit-def: $q0
 ; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    mov w8, wzr
 ; CHECK-NEXT:    // kill: def $x0 killed $w0
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    mov x1, v0.d[1]
+; CHECK-NEXT:    // kill: def $x8 killed $w8
+; CHECK-NEXT:    bfi x0, x8, #32, #32
+; CHECK-NEXT:    mov x1, xzr
 ; CHECK-NEXT:    ret
 Entry:
   %1 = tail call <1 x i128> @llvm.ctpop.v1.i128(<1 x i128> %0)