diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6033,7 +6033,7 @@ cast(N1)->getAPIntValue().countLeadingZeros()); } - bool UseNPQ = false; + bool UseNPQ = false, UsePreShift = false, UsePostShift = false; SmallVector PreShifts, PostShifts, MagicFactors, NPQFactors; auto BuildUDIVPattern = [&](ConstantSDNode *C) { @@ -6041,18 +6041,18 @@ return false; const APInt& Divisor = C->getAPIntValue(); - bool SelNPQ = false; - APInt Magic(Divisor.getBitWidth(), 0); - unsigned PreShift = 0, PostShift = 0; + SDValue PreShift, MagicFactor, NPQFactor, PostShift; // Magic algorithm doesn't work for division by 1. We need to emit a select // at the end. - // TODO: Use undef values for divisor of 1. - if (!Divisor.isOne()) { + if (Divisor.isOne()) { + PreShift = PostShift = DAG.getUNDEF(ShSVT); + MagicFactor = NPQFactor = DAG.getUNDEF(SVT); + } else { UnsignedDivisionByConstantInfo magics = UnsignedDivisionByConstantInfo::get(Divisor, LeadingZeros); - Magic = std::move(magics.Magic); + MagicFactor = DAG.getConstant(magics.Magic, dl, SVT); assert(magics.PreShift < Divisor.getBitWidth() && "We shouldn't generate an undefined shift!"); @@ -6060,19 +6060,21 @@ "We shouldn't generate an undefined shift!"); assert((!magics.IsAdd || magics.PreShift == 0) && "Unexpected pre-shift"); - PreShift = magics.PreShift; - PostShift = magics.PostShift; - SelNPQ = magics.IsAdd; - } - - PreShifts.push_back(DAG.getConstant(PreShift, dl, ShSVT)); - MagicFactors.push_back(DAG.getConstant(Magic, dl, SVT)); - NPQFactors.push_back( - DAG.getConstant(SelNPQ ? APInt::getOneBitSet(EltBits, EltBits - 1) - : APInt::getZero(EltBits), - dl, SVT)); - PostShifts.push_back(DAG.getConstant(PostShift, dl, ShSVT)); - UseNPQ |= SelNPQ; + PreShift = DAG.getConstant(magics.PreShift, dl, ShSVT); + PostShift = DAG.getConstant(magics.PostShift, dl, ShSVT); + NPQFactor = DAG.getConstant( + magics.IsAdd ? APInt::getOneBitSet(EltBits, EltBits - 1) + : APInt::getZero(EltBits), + dl, SVT); + UseNPQ |= magics.IsAdd; + UsePreShift |= magics.PreShift != 0; + UsePostShift |= magics.PostShift != 0; + } + + PreShifts.push_back(PreShift); + MagicFactors.push_back(MagicFactor); + NPQFactors.push_back(NPQFactor); + PostShifts.push_back(PostShift); return true; }; @@ -6102,8 +6104,10 @@ } SDValue Q = N0; - Q = DAG.getNode(ISD::SRL, dl, VT, Q, PreShift); - Created.push_back(Q.getNode()); + if (UsePreShift) { + Q = DAG.getNode(ISD::SRL, dl, VT, Q, PreShift); + Created.push_back(Q.getNode()); + } // FIXME: We should support doing a MUL in a wider type. auto GetMULHU = [&](SDValue X, SDValue Y) { @@ -6152,8 +6156,10 @@ Created.push_back(Q.getNode()); } - Q = DAG.getNode(ISD::SRL, dl, VT, Q, PostShift); - Created.push_back(Q.getNode()); + if (UsePostShift) { + Q = DAG.getNode(ISD::SRL, dl, VT, Q, PostShift); + Created.push_back(Q.getNode()); + } EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll @@ -171,20 +171,17 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) { ; SDAG-LABEL: combine_vec_udiv_nonuniform4: ; SDAG: // %bb.0: +; SDAG-NEXT: movi v1.16b, #171 ; SDAG-NEXT: adrp x8, .LCPI4_0 -; SDAG-NEXT: adrp x9, .LCPI4_2 -; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] -; SDAG-NEXT: adrp x8, .LCPI4_1 -; SDAG-NEXT: ldr q3, [x9, :lo12:.LCPI4_2] +; SDAG-NEXT: adrp x9, .LCPI4_1 ; SDAG-NEXT: umull2 v2.8h, v0.16b, v1.16b +; SDAG-NEXT: ldr q3, [x9, :lo12:.LCPI4_1] ; SDAG-NEXT: umull v1.8h, v0.8b, v1.8b +; SDAG-NEXT: and v0.16b, v0.16b, v3.16b ; SDAG-NEXT: uzp2 v1.16b, v1.16b, v2.16b -; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI4_1] -; SDAG-NEXT: adrp x8, .LCPI4_3 -; SDAG-NEXT: ushl v1.16b, v1.16b, v2.16b -; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI4_3] -; SDAG-NEXT: and v1.16b, v1.16b, v3.16b -; SDAG-NEXT: and v0.16b, v0.16b, v2.16b +; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI4_0] +; SDAG-NEXT: ushr v1.16b, v1.16b, #7 +; SDAG-NEXT: and v1.16b, v1.16b, v2.16b ; SDAG-NEXT: orr v0.16b, v0.16b, v1.16b ; SDAG-NEXT: ret ; diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll --- a/llvm/test/CodeGen/X86/combine-udiv.ll +++ b/llvm/test/CodeGen/X86/combine-udiv.ll @@ -688,7 +688,7 @@ define <8 x i16> @pr38477(<8 x i16> %a0) { ; SSE2-LABEL: pr38477: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4957,57457,4103,16385,35545,2048,2115] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = ; SSE2-NEXT: pmulhuw %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psubw %xmm1, %xmm2 @@ -707,7 +707,7 @@ ; ; SSE41-LABEL: pr38477: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,4957,57457,4103,16385,35545,2048,2115] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = ; SSE41-NEXT: pmulhuw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psubw %xmm1, %xmm2