NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been collapsed. Whitespace below is reconstructed to LLVM
conventions - confirm against the tree, or apply with `patch -l`.
Relative to the recovered text, the SelectionDAG.cpp hunk (a) restores the
explicit dyn_cast<> target types and (b) additionally requires the constant's
element width to equal VTBits before reading per-element sign bits.

Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3878,13 +3878,48 @@
     if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op)) {
       unsigned ExtType = LD->getExtensionType();
       switch (ExtType) {
-        default: break;
-        case ISD::SEXTLOAD:    // '17' bits known
-          Tmp = LD->getMemoryVT().getScalarSizeInBits();
-          return VTBits-Tmp+1;
-        case ISD::ZEXTLOAD:    // '16' bits known
-          Tmp = LD->getMemoryVT().getScalarSizeInBits();
-          return VTBits-Tmp;
+      default: break;
+      case ISD::SEXTLOAD: // '17' bits known
+        Tmp = LD->getMemoryVT().getScalarSizeInBits();
+        return VTBits - Tmp + 1;
+      case ISD::ZEXTLOAD: // '16' bits known
+        Tmp = LD->getMemoryVT().getScalarSizeInBits();
+        return VTBits - Tmp;
+      case ISD::NON_EXTLOAD:
+        if (const Constant *Cst = TLI->getTargetConstantFromLoad(LD)) {
+          // We only need to handle vectors - computeKnownBits should handle
+          // scalar cases.
+          Type *CstTy = Cst->getType();
+          // The constant must match the loaded type in total width AND in
+          // element width; otherwise element i of the constant is not
+          // element i of the load and its sign bits say nothing about it.
+          if (CstTy->isVectorTy() &&
+              (NumElts * VTBits) == CstTy->getPrimitiveSizeInBits() &&
+              VTBits == CstTy->getScalarSizeInBits()) {
+            Tmp = VTBits;
+            for (unsigned i = 0; i != NumElts; ++i) {
+              if (!DemandedElts[i])
+                continue;
+              if (Constant *Elt = Cst->getAggregateElement(i)) {
+                if (auto *CInt = dyn_cast<ConstantInt>(Elt)) {
+                  const APInt &Value = CInt->getValue();
+                  Tmp = std::min(Tmp, Value.getNumSignBits());
+                  continue;
+                }
+                if (auto *CFP = dyn_cast<ConstantFP>(Elt)) {
+                  APInt Value = CFP->getValueAPF().bitcastToAPInt();
+                  Tmp = std::min(Tmp, Value.getNumSignBits());
+                  continue;
+                }
+              }
+              // Element is missing or is neither an integer nor an FP
+              // constant: conservatively claim only the sign bit itself.
+              return 1;
+            }
+            return Tmp;
+          }
+        }
+        break;
       }
     }
   }
Index: test/CodeGen/X86/combine-sdiv.ll
===================================================================
--- test/CodeGen/X86/combine-sdiv.ll
+++ test/CodeGen/X86/combine-sdiv.ll
@@ -2112,11 +2112,9 @@
 ; SSE2-LABEL: non_splat_minus_one_divisor_0:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    psubb %xmm0, %xmm2
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    pandn %xmm2, %xmm1
-; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    psubb %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: non_splat_minus_one_divisor_0:
@@ -2175,38 +2173,36 @@
 define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
 ; SSE2-LABEL: non_splat_minus_one_divisor_1:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
 ; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm3
 ; SSE2-NEXT:    psrlw $8, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
 ; SSE2-NEXT:    psrlw $8, %xmm2
 ; SSE2-NEXT:    packuswb %xmm3, %xmm2
-; SSE2-NEXT:    paddb %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE2-NEXT:    psraw $8, %xmm3
-; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm3
-; SSE2-NEXT:    psrlw $8, %xmm3
+; SSE2-NEXT:    paddb %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm2
 ; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
 ; SSE2-NEXT:    psrlw $8, %xmm2
-; SSE2-NEXT:    packuswb %xmm3, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
-; SSE2-NEXT:    pand %xmm3, %xmm2
-; SSE2-NEXT:    pandn %xmm1, %xmm3
-; SSE2-NEXT:    por %xmm2, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
-; SSE2-NEXT:    psubb %xmm3, %xmm0
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    pandn %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    psubb %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: non_splat_minus_one_divisor_1: