Index: llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1179,29 +1179,64 @@
     Known.Zero |= ~InMask;
     break;
   }
-  case ISD::BITCAST:
+  case ISD::BITCAST: {
+    SDValue Src = Op.getOperand(0);
+    EVT SrcVT = Src.getValueType();
+    unsigned NumSrcEltBits = SrcVT.getScalarSizeInBits();
+
     // If this is an FP->Int bitcast and if the sign bit is the only
     // thing demanded, turn this into a FGETSIGN.
-    if (!TLO.LegalOperations() && !VT.isVector() &&
-        !Op.getOperand(0).getValueType().isVector() &&
+    if (!TLO.LegalOperations() && !VT.isVector() && !SrcVT.isVector() &&
         NewMask == APInt::getSignMask(Op.getValueSizeInBits()) &&
-        Op.getOperand(0).getValueType().isFloatingPoint()) {
+        SrcVT.isFloatingPoint()) {
       bool OpVTLegal = isOperationLegalOrCustom(ISD::FGETSIGN, VT);
-      bool i32Legal  = isOperationLegalOrCustom(ISD::FGETSIGN, MVT::i32);
-      if ((OpVTLegal || i32Legal) && VT.isSimple() &&
-           Op.getOperand(0).getValueType() != MVT::f16 &&
-           Op.getOperand(0).getValueType() != MVT::f128) {
+      bool i32Legal = isOperationLegalOrCustom(ISD::FGETSIGN, MVT::i32);
+      if ((OpVTLegal || i32Legal) && VT.isSimple() && SrcVT != MVT::f16 &&
+          SrcVT != MVT::f128) {
         // Cannot eliminate/lower SHL for f128 yet.
         EVT Ty = OpVTLegal ? VT : MVT::i32;
         // Make a FGETSIGN + SHL to move the sign bit into the appropriate
         // place. We expect the SHL to be eliminated by other optimizations.
-        SDValue Sign = TLO.DAG.getNode(ISD::FGETSIGN, dl, Ty, Op.getOperand(0));
+        SDValue Sign = TLO.DAG.getNode(ISD::FGETSIGN, dl, Ty, Src);
         unsigned OpVTSizeInBits = Op.getValueSizeInBits();
         if (!OpVTLegal && OpVTSizeInBits > 32)
           Sign = TLO.DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Sign);
         unsigned ShVal = Op.getValueSizeInBits() - 1;
         SDValue ShAmt = TLO.DAG.getConstant(ShVal, dl, VT);
-        return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl, VT, Sign, ShAmt));
+        return TLO.CombineTo(Op,
+                             TLO.DAG.getNode(ISD::SHL, dl, VT, Sign, ShAmt));
+      }
+    }
+    // If bitcast from a vector and the mask covers entire elements, see if we
+    // can use SimplifyDemandedVectorElts.
+    // TODO - bigendian once we have test coverage.
+    // TODO - bool vectors once SimplifyDemandedVectorElts has SETCC support.
+    if (SrcVT.isVector() && NumSrcEltBits > 1 &&
+        (BitWidth % NumSrcEltBits) == 0 &&
+        TLO.DAG.getDataLayout().isLittleEndian()) {
+      unsigned Scale = BitWidth / NumSrcEltBits;
+      auto GetDemandedSubMask = [&](APInt &DemandedSubElts) -> bool {
+        DemandedSubElts = APInt::getNullValue(Scale);
+        for (unsigned i = 0; i != Scale; ++i) {
+          unsigned Offset = i * NumSrcEltBits;
+          APInt Sub = DemandedMask.extractBits(NumSrcEltBits, Offset);
+          if (Sub.isAllOnesValue())
+            DemandedSubElts.setBit(i);
+          else if (!Sub.isNullValue())
+            return false;
+        }
+        return true;
+      };
+
+      APInt DemandedSubElts;
+      if (GetDemandedSubMask(DemandedSubElts)) {
+        unsigned NumSrcElts = SrcVT.getVectorNumElements();
+        APInt DemandedElts = APInt::getSplat(NumSrcElts, DemandedSubElts);
+
+        APInt KnownUndef, KnownZero;
+        if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
+                                       TLO, Depth + 1))
+          return true;
+      }
+    }
     // If this is a bitcast, let computeKnownBits handle it.  Only do this on a
@@ -1211,6 +1246,7 @@
       return false;
     }
     break;
+  }
   case ISD::ADD:
   case ISD::MUL:
   case ISD::SUB: {
Index: llvm/trunk/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
+++ llvm/trunk/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
@@ -1826,12 +1826,6 @@
 define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_mul_epi32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsllq $32, %ymm0, %ymm2
-; CHECK-NEXT:    vpsrad $31, %ymm2, %ymm2
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; CHECK-NEXT:    vpsllq $32, %ymm1, %ymm2
-; CHECK-NEXT:    vpsrad $31, %ymm2, %ymm2
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; CHECK-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %A = shl <4 x i64> %a0,
@@ -1846,9 +1840,6 @@
 define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_mul_epu32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; CHECK-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %A = and <4 x i64> %a0,
Index: llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -1718,11 +1718,6 @@
 define <8 x i64> @test_mm512_mul_epu32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_mul_epu32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; CHECK-NEXT:    kmovw %eax, %k0
-; CHECK-NEXT:    knotw %k0, %k1
-; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vmovdqa32 %zmm1, %zmm1 {%k1} {z}
 ; CHECK-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %tmp = and <8 x i64> %__A,
Index: llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -4519,9 +4519,7 @@
 ; X86-LABEL: test_mask_mul_epi32_rmb:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
-; X86-NEXT:    ## xmm1 = mem[0],zero
-; X86-NEXT:    vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
+; X86-NEXT:    vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08]
 ; X86-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x28,0xc1]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -4541,9 +4539,7 @@
 ; X86-LABEL: test_mask_mul_epi32_rmbk:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    vmovq (%eax), %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x10]
-; X86-NEXT:    ## xmm2 = mem[0],zero
-; X86-NEXT:    vpbroadcastq %xmm2, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd2]
+; X86-NEXT:    vpbroadcastd (%eax), %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x10]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
 ;
X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmuldq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x28,0xca] @@ -4568,9 +4564,7 @@ ; X86-LABEL: test_mask_mul_epi32_rmbkz: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08] -; X86-NEXT: ## xmm1 = mem[0],zero -; X86-NEXT: vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9] +; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x28,0xc1] @@ -4696,9 +4690,7 @@ ; X86-LABEL: test_mask_mul_epu32_rmb: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08] -; X86-NEXT: ## xmm1 = mem[0],zero -; X86-NEXT: vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9] +; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08] ; X86-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xf4,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; @@ -4718,9 +4710,7 @@ ; X86-LABEL: test_mask_mul_epu32_rmbk: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vmovq (%eax), %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x10] -; X86-NEXT: ## xmm2 = mem[0],zero -; X86-NEXT: vpbroadcastq %xmm2, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd2] +; X86-NEXT: vpbroadcastd (%eax), %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x10] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmuludq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0xca] @@ -4745,9 +4735,7 @@ ; X86-LABEL: test_mask_mul_epu32_rmbkz: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08] -; X86-NEXT: ## xmm1 = mem[0],zero -; X86-NEXT: vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9] +; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0xc1] @@ -6160,9 +6148,7 @@ ; X86-LABEL: test_mul_epi32_rmb: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08] -; X86-NEXT: ## xmm1 = mem[0],zero -; X86-NEXT: vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9] +; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08] ; X86-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x28,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; @@ -6182,9 +6168,7 @@ ; X86-LABEL: test_mul_epi32_rmbk: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vmovq (%eax), %xmm2 ## EVEX TO VEX 
Compression encoding: [0xc5,0xfa,0x7e,0x10] -; X86-NEXT: ## xmm2 = mem[0],zero -; X86-NEXT: vpbroadcastq %xmm2, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd2] +; X86-NEXT: vpbroadcastd (%eax), %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x10] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmuldq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x28,0xca] @@ -6211,9 +6195,7 @@ ; X86-LABEL: test_mul_epi32_rmbkz: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08] -; X86-NEXT: ## xmm1 = mem[0],zero -; X86-NEXT: vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9] +; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x28,0xc1] @@ -6349,9 +6331,7 @@ ; X86-LABEL: test_mul_epu32_rmb: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08] -; X86-NEXT: ## xmm1 = mem[0],zero -; X86-NEXT: vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9] +; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08] ; X86-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xf4,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; @@ -6371,9 +6351,7 @@ ; X86-LABEL: test_mul_epu32_rmbk: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vmovq (%eax), %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x10] -; X86-NEXT: ## xmm2 = mem[0],zero -; X86-NEXT: vpbroadcastq %xmm2, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd2] +; X86-NEXT: vpbroadcastd (%eax), %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x10] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmuludq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0xca] @@ -6400,9 +6378,7 @@ ; X86-LABEL: test_mul_epu32_rmbkz: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08] -; X86-NEXT: ## xmm1 = mem[0],zero -; X86-NEXT: vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9] +; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0xc1] Index: llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -9655,7 +9655,7 @@ ; X86-LABEL: test_mask_mul_epi32_rmb_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), 
%xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x08] +; X86-NEXT: vpbroadcastd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x08] ; X86-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -9675,7 +9675,7 @@ ; X86-LABEL: test_mask_mul_epi32_rmbk_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x10] +; X86-NEXT: vpbroadcastd (%eax), %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x10] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmuldq %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x28,0xca] @@ -9700,7 +9700,7 @@ ; X86-LABEL: test_mask_mul_epi32_rmbkz_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x08] +; X86-NEXT: vpbroadcastd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x08] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x28,0xc1] @@ -9826,9 +9826,7 @@ ; X86-LABEL: test_mask_mul_epi32_rmb_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vmovq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08] -; X86-NEXT: # xmm1 = mem[0],zero -; X86-NEXT: vpbroadcastq %xmm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xc9] +; X86-NEXT: vpbroadcastd (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x08] ; X86-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -9848,9 +9846,7 @@ ; X86-LABEL: test_mask_mul_epi32_rmbk_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vmovq (%eax), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x10] -; X86-NEXT: # xmm2 = mem[0],zero -; X86-NEXT: vpbroadcastq %xmm2, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xd2] +; X86-NEXT: vpbroadcastd (%eax), %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x10] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmuldq %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x28,0xca] @@ -9875,9 +9871,7 @@ ; X86-LABEL: test_mask_mul_epi32_rmbkz_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vmovq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08] -; X86-NEXT: # xmm1 = mem[0],zero -; X86-NEXT: vpbroadcastq %xmm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xc9] +; X86-NEXT: vpbroadcastd (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x08] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: 
[0x62,0xf2,0xfd,0xa9,0x28,0xc1] @@ -10003,7 +9997,7 @@ ; X86-LABEL: test_mask_mul_epu32_rmb_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x08] +; X86-NEXT: vpbroadcastd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x08] ; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf4,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -10023,7 +10017,7 @@ ; X86-LABEL: test_mask_mul_epu32_rmbk_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x10] +; X86-NEXT: vpbroadcastd (%eax), %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x10] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmuludq %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xf4,0xca] @@ -10048,7 +10042,7 @@ ; X86-LABEL: test_mask_mul_epu32_rmbkz_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x08] +; X86-NEXT: vpbroadcastd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x08] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0xf4,0xc1] @@ -10174,9 +10168,7 @@ ; X86-LABEL: test_mask_mul_epu32_rmb_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vmovq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08] -; X86-NEXT: # xmm1 = mem[0],zero -; X86-NEXT: vpbroadcastq %xmm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xc9] +; X86-NEXT: vpbroadcastd (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x08] ; X86-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf4,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -10196,9 +10188,7 @@ ; X86-LABEL: test_mask_mul_epu32_rmbk_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vmovq (%eax), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x10] -; X86-NEXT: # xmm2 = mem[0],zero -; X86-NEXT: vpbroadcastq %xmm2, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xd2] +; X86-NEXT: vpbroadcastd (%eax), %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x10] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmuludq %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xf4,0xca] @@ -10223,9 +10213,7 @@ ; X86-LABEL: test_mask_mul_epu32_rmbkz_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vmovq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08] -; X86-NEXT: # xmm1 = mem[0],zero -; X86-NEXT: vpbroadcastq %xmm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xc9] +; X86-NEXT: vpbroadcastd (%eax), %ymm1 # EVEX TO VEX Compression encoding: 
[0xc4,0xe2,0x7d,0x58,0x08] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0xf4,0xc1] Index: llvm/trunk/test/CodeGen/X86/combine-pmuldq.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/combine-pmuldq.ll +++ llvm/trunk/test/CodeGen/X86/combine-pmuldq.ll @@ -41,20 +41,15 @@ ret <2 x i64> %5 } -; TODO - blends are superfluous define <2 x i64> @combine_shuffle_zero_pmuludq(<4 x i32> %a0, <4 x i32> %a1) { ; SSE-LABEL: combine_shuffle_zero_pmuludq: ; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; SSE-NEXT: pmuludq %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX2-LABEL: combine_shuffle_zero_pmuludq: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -62,7 +57,6 @@ ; AVX512VL-LABEL: combine_shuffle_zero_pmuludq: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq @@ -70,7 +64,6 @@ ; AVX512DQVL-LABEL: combine_shuffle_zero_pmuludq: ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX512DQVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: retq @@ -82,23 +75,16 @@ ret <2 x i64> %5 } -; TODO - blends are superfluous define <4 x i64> @combine_shuffle_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) { ; SSE-LABEL: combine_shuffle_zero_pmuludq_256: ; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] -; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] -; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] -; SSE-NEXT: pmuludq %xmm3, %xmm1 -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] ; SSE-NEXT: pmuludq %xmm2, %xmm0 +; SSE-NEXT: pmuludq %xmm3, %xmm1 ; SSE-NEXT: retq ; ; AVX2-LABEL: combine_shuffle_zero_pmuludq_256: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -106,7 +92,6 @@ ; AVX512VL-LABEL: combine_shuffle_zero_pmuludq_256: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; AVX512VL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -114,7 +99,6 @@ ; AVX512DQVL-LABEL: combine_shuffle_zero_pmuludq_256: ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; 
AVX512DQVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] ; AVX512DQVL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; AVX512DQVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: retq Index: llvm/trunk/test/CodeGen/X86/combine-shl.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/combine-shl.ll +++ llvm/trunk/test/CodeGen/X86/combine-shl.ll @@ -402,23 +402,21 @@ ; SSE2-LABEL: combine_vec_shl_ge_ashr_extact1: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $8, %xmm1 +; SSE2-NEXT: psrad $5, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $5, %xmm2 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrad $4, %xmm3 -; SSE2-NEXT: psrad $3, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32,64,128,256] -; SSE2-NEXT: pmuludq %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psrad $3, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $8, %xmm1 +; SSE2-NEXT: psrad $4, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32,64,128,256] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_shl_ge_ashr_extact1: @@ -466,23 +464,21 @@ ; SSE2-LABEL: combine_vec_shl_lt_ashr_extact1: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $8, %xmm1 +; SSE2-NEXT: psrad $7, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $7, %xmm2 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrad $6, %xmm3 -; SSE2-NEXT: psrad $5, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,16,32,256] -; SSE2-NEXT: pmuludq %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psrad $5, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $8, %xmm1 +; SSE2-NEXT: psrad $6, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8,16,32,256] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_shl_lt_ashr_extact1: @@ -533,23 +529,21 @@ ; SSE2-LABEL: combine_vec_shl_gt_lshr1: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $8, %xmm1 +; SSE2-NEXT: psrld $5, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrld $5, %xmm2 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrld $4, %xmm3 -; SSE2-NEXT: psrld $3, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32,64,128,256] -; SSE2-NEXT: pmuludq %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psrld $3, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $8, %xmm1 +; SSE2-NEXT: psrld $4, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32,64,128,256] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_shl_gt_lshr1: @@ -600,23 +594,21 @@ ; SSE2-LABEL: combine_vec_shl_le_lshr1: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $8, %xmm1 +; SSE2-NEXT: psrld $7, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrld $7, %xmm2 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrld $6, %xmm3 -; SSE2-NEXT: psrld $5, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,16,32,256] -; SSE2-NEXT: pmuludq %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psrld $5, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $8, %xmm1 +; SSE2-NEXT: psrld $6, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8,16,32,256] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_shl_le_lshr1: Index: llvm/trunk/test/CodeGen/X86/mulvi32.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/mulvi32.ll +++ llvm/trunk/test/CodeGen/X86/mulvi32.ll @@ -39,24 +39,18 @@ define <2 x i32> @_mul2xi32b(<2 x i32>, <2 x i32>) { ; SSE2-LABEL: _mul2xi32b: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE2-NEXT: retq ; ; SSE42-LABEL: _mul2xi32b: ; SSE42: # %bb.0: -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE42-NEXT: pmuludq %xmm0, %xmm1 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SSE42-NEXT: pmuludq %xmm1, %xmm0 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; SSE42-NEXT: retq ; ; AVX-LABEL: _mul2xi32b: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: retq @@ -349,27 +343,13 @@ ; ; SSE42-LABEL: _mul2xi64toi64a: ; SSE42: # %bb.0: -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; SSE42-NEXT: pmuludq %xmm1, %xmm0 ; SSE42-NEXT: retq ; -; AVX1-LABEL: _mul2xi64toi64a: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: _mul2xi64toi64a: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: _mul2xi64toi64a: +; AVX: # %bb.0: +; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %f00 = extractelement <2 x i64> %0, i32 0 %f01 = extractelement <2 x i64> %0, i32 1 %f10 = extractelement <2 x i64> %1, i32 0 Index: llvm/trunk/test/CodeGen/X86/pmul.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/pmul.ll +++ llvm/trunk/test/CodeGen/X86/pmul.ll @@ -1318,76 +1318,55 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; SSE2-LABEL: mul_v8i64_sext: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: movdqa %xmm2, %xmm10 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] -; SSE2-NEXT: pmuludq %xmm5, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; SSE2-NEXT: pmuludq %xmm0, %xmm4 -; SSE2-NEXT: paddq %xmm3, %xmm4 -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pmuludq %xmm5, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: psrad $31, %xmm5 -; SSE2-NEXT: psrad $16, %xmm3 -; 
SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; SSE2-NEXT: psllq $32, %xmm4 -; SSE2-NEXT: paddq %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE2-NEXT: pmuludq %xmm1, %xmm5 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; SSE2-NEXT: pmuludq %xmm3, %xmm4 -; SSE2-NEXT: paddq %xmm5, %xmm4 -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: psrad $31, %xmm5 +; SSE2-NEXT: movdqa %xmm6, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE2-NEXT: pmuludq %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] -; SSE2-NEXT: psllq $32, %xmm4 -; SSE2-NEXT: paddq %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE2-NEXT: pmuludq %xmm2, %xmm5 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; SSE2-NEXT: pmuludq %xmm6, %xmm4 -; SSE2-NEXT: paddq %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,0,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pmuludq %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm7, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: psrad $16, %xmm7 +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSE2-NEXT: psllq $32, %xmm4 -; SSE2-NEXT: paddq %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; SSE2-NEXT: pmuludq %xmm3, %xmm6 -; SSE2-NEXT: pmuludq %xmm5, %xmm4 -; SSE2-NEXT: paddq %xmm6, %xmm4 -; SSE2-NEXT: pmuludq %xmm5, %xmm3 -; SSE2-NEXT: psllq $32, %xmm4 -; SSE2-NEXT: paddq %xmm4, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm10[2,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm9[2,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm9, %xmm4 +; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE2-NEXT: pmuludq %xmm9, %xmm0 +; SSE2-NEXT: psllq $32, %xmm0 +; SSE2-NEXT: paddq %xmm4, %xmm0 +; SSE2-NEXT: pmuludq %xmm11, %xmm5 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; SSE2-NEXT: pmuludq %xmm11, 
%xmm1 +; SSE2-NEXT: psllq $32, %xmm1 +; SSE2-NEXT: paddq %xmm5, %xmm1 +; SSE2-NEXT: pmuludq %xmm10, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] +; SSE2-NEXT: pmuludq %xmm10, %xmm2 +; SSE2-NEXT: psllq $32, %xmm2 +; SSE2-NEXT: paddq %xmm6, %xmm2 +; SSE2-NEXT: pmuludq %xmm8, %xmm7 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] +; SSE2-NEXT: pmuludq %xmm8, %xmm3 +; SSE2-NEXT: psllq $32, %xmm3 +; SSE2-NEXT: paddq %xmm7, %xmm3 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mul_v8i64_sext: Index: llvm/trunk/test/CodeGen/X86/pr35918.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/pr35918.ll +++ llvm/trunk/test/CodeGen/X86/pr35918.ll @@ -7,7 +7,7 @@ define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32], [128 x i64] }*) nounwind { ; X86-SKYLAKE-LABEL: fetch_r16g16_snorm_unorm8: ; X86-SKYLAKE: # %bb.0: # %entry -; X86-SKYLAKE-NEXT: subl $12, %esp +; X86-SKYLAKE-NEXT: subl $8, %esp ; X86-SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SKYLAKE-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -21,12 +21,12 @@ ; X86-SKYLAKE-NEXT: vmovd %xmm0, %ecx ; X86-SKYLAKE-NEXT: orl $-16777216, %ecx # imm = 0xFF000000 ; X86-SKYLAKE-NEXT: movl %ecx, (%eax) -; X86-SKYLAKE-NEXT: addl $12, %esp +; X86-SKYLAKE-NEXT: addl $8, %esp ; X86-SKYLAKE-NEXT: retl ; ; X86-SKX-LABEL: fetch_r16g16_snorm_unorm8: ; X86-SKX: # %bb.0: # %entry -; X86-SKX-NEXT: subl $12, %esp +; X86-SKX-NEXT: subl $8, %esp ; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SKX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,1],zero,zero,xmm0[2,3],zero,zero,xmm0[u,u],zero,zero,xmm0[u,u] @@ -35,19 +35,16 @@ ; X86-SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X86-SKX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] ; X86-SKX-NEXT: vpsrld $7, %xmm0, %xmm0 -; X86-SKX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero -; X86-SKX-NEXT: vpmovqw %xmm1, {{[0-9]+}}(%esp) -; X86-SKX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; X86-SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; X86-SKX-NEXT: vpmovqw %xmm0, {{[0-9]+}}(%esp) -; X86-SKX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-SKX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2] +; X86-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; X86-SKX-NEXT: vpmovdb %xmm0, (%esp) ; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SKX-NEXT: movzwl (%esp), %ecx ; X86-SKX-NEXT: orl $-16777216, %ecx # imm = 0xFF000000 ; X86-SKX-NEXT: movl %ecx, (%eax) -; X86-SKX-NEXT: addl $12, %esp +; X86-SKX-NEXT: addl $8, %esp ; X86-SKX-NEXT: retl ; ; X64-SKYLAKE-LABEL: fetch_r16g16_snorm_unorm8: @@ -74,13 +71,10 @@ ; X64-SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-SKX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] ; X64-SKX-NEXT: vpsrld $7, %xmm0, %xmm0 -; X64-SKX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X64-SKX-NEXT: vpmovqw %xmm1, -{{[0-9]+}}(%rsp) -; X64-SKX-NEXT: vpmovzxbd {{.*#+}} xmm1 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; X64-SKX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X64-SKX-NEXT: vpmovqw %xmm0, -{{[0-9]+}}(%rsp) -; X64-SKX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-SKX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2] +; X64-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; X64-SKX-NEXT: vpmovdb %xmm0, -{{[0-9]+}}(%rsp) ; X64-SKX-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax ; X64-SKX-NEXT: orl $-16777216, %eax # imm = 0xFF000000 Index: llvm/trunk/test/CodeGen/X86/shrink_vmul.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/shrink_vmul.ll +++ llvm/trunk/test/CodeGen/X86/shrink_vmul.ll @@ -1507,7 +1507,7 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx ; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: retl @@ -1643,7 +1643,7 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx ; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: retl @@ -2047,12 +2047,16 @@ ; X86-SSE-NEXT: pxor %xmm1, %xmm1 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,0,65536,0] -; X86-SSE-NEXT: pmuludq %xmm2, %xmm0 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = <0,u,65536,u> ; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 -; X86-SSE-NEXT: psllq $32, %xmm1 -; X86-SSE-NEXT: paddq %xmm0, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; X86-SSE-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE-NEXT: psrlq $32, %xmm3 +; X86-SSE-NEXT: pmuludq %xmm0, %xmm3 +; X86-SSE-NEXT: paddq %xmm1, %xmm3 +; X86-SSE-NEXT: psllq $32, %xmm3 +; X86-SSE-NEXT: pmuludq %xmm2, %xmm0 +; X86-SSE-NEXT: paddq %xmm3, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; @@ -2128,13 +2132,17 @@ ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $16, %xmm0 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,32768,0] -; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,u,32768,u> ; X86-SSE-NEXT: pxor %xmm2, %xmm2 ; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE-NEXT: psllq $32, %xmm2 -; X86-SSE-NEXT: paddq %xmm0, %xmm2 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; X86-SSE-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE-NEXT: psrlq $32, %xmm3 +; X86-SSE-NEXT: pmuludq %xmm0, %xmm3 +; X86-SSE-NEXT: paddq %xmm2, %xmm3 +; X86-SSE-NEXT: psllq $32, %xmm3 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 +; 
X86-SSE-NEXT: paddq %xmm3, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; Index: llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -2755,23 +2755,13 @@ declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone define <2 x i64> @test_mm_mul_epu32(<2 x i64> %a0, <2 x i64> %a1) nounwind { -; X86-SSE-LABEL: test_mm_mul_epu32: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] -; X86-SSE-NEXT: # encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A] -; X86-SSE-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 -; X86-SSE-NEXT: pand %xmm2, %xmm0 # encoding: [0x66,0x0f,0xdb,0xc2] -; X86-SSE-NEXT: pand %xmm2, %xmm1 # encoding: [0x66,0x0f,0xdb,0xca] -; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 # encoding: [0x66,0x0f,0xf4,0xc1] -; X86-SSE-NEXT: retl # encoding: [0xc3] +; SSE-LABEL: test_mm_mul_epu32: +; SSE: # %bb.0: +; SSE-NEXT: pmuludq %xmm1, %xmm0 # encoding: [0x66,0x0f,0xf4,0xc1] +; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX1-LABEL: test_mm_mul_epu32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0xef,0xd2] -; AVX1-NEXT: vpblendw $204, %xmm2, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc2,0xcc] -; AVX1-NEXT: # xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpblendw $204, %xmm2, %xmm1, %xmm1 # encoding: [0xc4,0xe3,0x71,0x0e,0xca,0xcc] -; AVX1-NEXT: # xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xf4,0xc1] ; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; @@ -2784,16 +2774,6 @@ ; AVX512-NEXT: # xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX512-NEXT: vpmullq %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x40,0xc1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] -; -; X64-SSE-LABEL: test_mm_mul_epu32: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] -; X64-SSE-NEXT: # encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A] -; X64-SSE-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte -; X64-SSE-NEXT: pand %xmm2, %xmm0 # encoding: [0x66,0x0f,0xdb,0xc2] -; X64-SSE-NEXT: pand %xmm2, %xmm1 # encoding: [0x66,0x0f,0xdb,0xca] -; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 # encoding: [0x66,0x0f,0xf4,0xc1] -; X64-SSE-NEXT: retq # encoding: [0xc3] %A = and <2 x i64> %a0, %B = and <2 x i64> %a1, %res = mul nuw <2 x i64> %A, %B Index: llvm/trunk/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll +++ llvm/trunk/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll @@ -832,26 +832,11 @@ define <2 x i64> @test_mm_mul_epi32(<2 x i64> %a0, <2 x i64> %a1) { ; SSE-LABEL: test_mm_mul_epi32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psllq $32, %xmm2 -; SSE-NEXT: psrad $31, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psllq $32, %xmm0 -; SSE-NEXT: psrad $31, %xmm0 -; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; SSE-NEXT: pmuldq %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pmuldq %xmm1, %xmm0 ; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: test_mm_mul_epi32: ; AVX1: # %bb.0: -; 
AVX1-NEXT: vpsllq $32, %xmm0, %xmm2 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm2 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: ret{{[l|q]}} ; Index: llvm/trunk/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll +++ llvm/trunk/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll @@ -143,33 +143,31 @@ define <4 x i32> @test_urem_even_div_nonsplat(<4 x i32> %X) nounwind readnone { ; CHECK-SSE2-LABEL: test_urem_even_div_nonsplat: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,2454267027] ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2863311531,3435973837,2863311531,2454267027] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] -; CHECK-SSE2-NEXT: movaps %xmm0, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: psrld $3, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: psrld $1, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: psrld $3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [6,10,12,14] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[1,2] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,3,1] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[1,2] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 @@ -277,20 +275,17 @@ ; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 ; CHECK-SSE2-NEXT: psrld $3, %xmm1 ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 -; CHECK-SSE2-NEXT: 
punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT: psrld $2, %xmm4
-; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm3[0,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [6,10,12,16]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [6,10,12,16]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
+; CHECK-SSE2-NEXT: psrld $2, %xmm2
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE2-NEXT: psrld $31, %xmm0
@@ -382,34 +377,31 @@
 define <4 x i32> @test_urem_one_nonsplat(<4 x i32> %X) nounwind readnone {
 ; CHECK-SSE2-LABEL: test_urem_one_nonsplat:
 ; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT: psrld $1, %xmm1
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,0,2863311531,2454267027]
 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2863311531,0,2863311531,2454267027]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
-; CHECK-SSE2-NEXT: movaps %xmm0, %xmm4
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: psrld $3, %xmm1
-; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: movaps %xmm0, %xmm3
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[0,0]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[0,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [6,1,12,14]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: movaps %xmm0, %xmm4
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
+; CHECK-SSE2-NEXT: psrld $1, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[3,3]
 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE2-NEXT: psrld $2, %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0]
+; CHECK-SSE2-NEXT: psrld $3, %xmm2
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [6,1,12,14]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
 ; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0
Index: llvm/trunk/test/CodeGen/X86/vector-idiv-v2i32.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-idiv-v2i32.ll
+++ llvm/trunk/test/CodeGen/X86/vector-idiv-v2i32.ll
@@ -627,11 +627,10 @@
 ; X86: # %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X86-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X86-NEXT: andps {{\.LCPI.*}}, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-NEXT: movq %xmm0, (%eax)
 ; X86-NEXT: retl
 ;
Index: llvm/trunk/test/CodeGen/X86/vector-mul.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-mul.ll
+++ llvm/trunk/test/CodeGen/X86/vector-mul.ll
@@ -460,7 +460,7 @@
 define <2 x i64> @mul_v2i64_17_65(<2 x i64> %a0) nounwind {
 ; X86-LABEL: mul_v2i64_17_65:
 ; X86: # %bb.0:
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [17,0,65,0]
+; X86-NEXT: movdqa {{.*#+}} xmm1 = <17,u,65,u>
 ; X86-NEXT: movdqa %xmm0, %xmm2
 ; X86-NEXT: pmuludq %xmm1, %xmm2
 ; X86-NEXT: psrlq $32, %xmm0
@@ -809,7 +809,7 @@
 define <2 x i64> @mul_v2i64_15_63(<2 x i64> %a0) nounwind {
 ; X86-LABEL: mul_v2i64_15_63:
 ; X86: # %bb.0:
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [15,0,63,0]
+; X86-NEXT: movdqa {{.*#+}} xmm1 = <15,u,63,u>
 ; X86-NEXT: movdqa %xmm0, %xmm2
 ; X86-NEXT: pmuludq %xmm1, %xmm2
 ; X86-NEXT: psrlq $32, %xmm0
@@ -845,16 +845,17 @@
 define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind {
 ; X86-LABEL: mul_v2i64_neg_15_63:
 ; X86: # %bb.0:
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [4294967281,4294967295,4294967233,4294967295]
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pmuludq %xmm1, %xmm2
-; X86-NEXT: movdqa %xmm0, %xmm3
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: psrlq $32, %xmm1
+; X86-NEXT: movdqa {{.*#+}} xmm2 = <4294967281,u,4294967233,u>
+; X86-NEXT: pmuludq %xmm2, %xmm1
+; X86-NEXT: movdqa %xmm2, %xmm3
 ; X86-NEXT: psrlq $32, %xmm3
-; X86-NEXT: pmuludq %xmm1, %xmm3
-; X86-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
+; X86-NEXT: pmuludq %xmm0, %xmm3
+; X86-NEXT: paddq %xmm1, %xmm3
+; X86-NEXT: psllq $32, %xmm3
+; X86-NEXT: pmuludq %xmm2, %xmm0
 ; X86-NEXT: paddq %xmm3, %xmm0
-; X86-NEXT: psllq $32, %xmm0
-; X86-NEXT: paddq %xmm2, %xmm0
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: mul_v2i64_neg_15_63:
@@ -889,16 +890,17 @@
 define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
 ; X86-LABEL: mul_v2i64_neg_17_65:
 ; X86: # %bb.0:
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [4294967279,4294967295,4294967231,4294967295]
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pmuludq %xmm1, %xmm2
-; X86-NEXT: movdqa %xmm0, %xmm3
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: psrlq $32, %xmm1
+; X86-NEXT: movdqa {{.*#+}} xmm2 = <4294967279,u,4294967231,u>
+; X86-NEXT: pmuludq %xmm2, %xmm1
+; X86-NEXT: movdqa %xmm2, %xmm3
 ; X86-NEXT: psrlq $32, %xmm3
-; X86-NEXT: pmuludq %xmm1, %xmm3
-; X86-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
+; X86-NEXT: pmuludq %xmm0, %xmm3
+; X86-NEXT: paddq %xmm1, %xmm3
+; X86-NEXT: psllq $32, %xmm3
+; X86-NEXT: pmuludq %xmm2, %xmm0
 ; X86-NEXT: paddq %xmm3, %xmm0
-; X86-NEXT: psllq $32, %xmm0
-; X86-NEXT: paddq %xmm2, %xmm0
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: mul_v2i64_neg_17_65:
@@ -933,7 +935,7 @@
 define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind {
 ; X86-LABEL: mul_v2i64_0_1:
 ; X86: # %bb.0:
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,1,0]
+; X86-NEXT: movdqa {{.*#+}} xmm1 = <0,u,1,u>
 ; X86-NEXT: movdqa %xmm0, %xmm2
 ; X86-NEXT: pmuludq %xmm1, %xmm2
 ; X86-NEXT: psrlq $32, %xmm0
@@ -975,7 +977,7 @@
 ; X86: # %bb.0:
 ; X86-NEXT: movdqa %xmm0, %xmm1
 ; X86-NEXT: psrlq $32, %xmm1
-; X86-NEXT: movdqa {{.*#+}} xmm2 = [0,0,4294967295,4294967295]
+; X86-NEXT: movdqa {{.*#+}} xmm2 = <0,u,4294967295,u>
 ; X86-NEXT: pmuludq %xmm2, %xmm1
 ; X86-NEXT: movdqa %xmm2, %xmm3
 ; X86-NEXT: psrlq $32, %xmm3
@@ -1029,7 +1031,7 @@
 ; X86: # %bb.0:
 ; X86-NEXT: movdqa %xmm0, %xmm1
 ; X86-NEXT: psrlq $32, %xmm1
-; X86-NEXT: movdqa {{.*#+}} xmm2 = [15,0,4294967233,4294967295]
+; X86-NEXT: movdqa {{.*#+}} xmm2 = <15,u,4294967233,u>
 ; X86-NEXT: pmuludq %xmm2, %xmm1
 ; X86-NEXT: movdqa %xmm2, %xmm3
 ; X86-NEXT: psrlq $32, %xmm3
@@ -1172,7 +1174,7 @@
 define <2 x i64> @mul_v2i64_68_132(<2 x i64> %x) nounwind {
 ; X86-LABEL: mul_v2i64_68_132:
 ; X86: # %bb.0:
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [68,0,132,0]
+; X86-NEXT: movdqa {{.*#+}} xmm1 = <68,u,132,u>
 ; X86-NEXT: movdqa %xmm0, %xmm2
 ; X86-NEXT: pmuludq %xmm1, %xmm2
 ; X86-NEXT: psrlq $32, %xmm0
@@ -1208,7 +1210,7 @@
 define <2 x i64> @mul_v2i64_60_120(<2 x i64> %x) nounwind {
 ; X86-LABEL: mul_v2i64_60_120:
 ; X86: # %bb.0:
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [60,0,124,0]
+; X86-NEXT: movdqa {{.*#+}} xmm1 = <60,u,124,u>
 ; X86-NEXT: movdqa %xmm0, %xmm2
 ; X86-NEXT: pmuludq %xmm1, %xmm2
 ; X86-NEXT: psrlq $32, %xmm0
Index: llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll
@@ -835,9 +835,8 @@
 ; SSE2-NEXT: pmuludq %xmm0, %xmm2
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE2-NEXT: pmuludq %xmm0, %xmm1
@@ -896,34 +895,25 @@
 define i32 @test_v16i32(<16 x i32> %a0) {
 ; SSE2-LABEL: test_v16i32:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm4, %xmm6
+; SSE2-NEXT: pmuludq %xmm5, %xmm6
 ; SSE2-NEXT: pmuludq %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm4, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
 ; SSE2-NEXT: pmuludq %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
-; SSE2-NEXT: pmuludq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,2,3]
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; SSE2-NEXT: pmuludq %xmm0, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
-; SSE2-NEXT: pmuludq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,0,0]
+; SSE2-NEXT: pmuludq %xmm6, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE2-NEXT: pmuludq %xmm0, %xmm1
@@ -992,64 +982,39 @@
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
 ; SSE2-NEXT: pmuludq %xmm6, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm8, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
 ; SSE2-NEXT: pmuludq %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm8, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; SSE2-NEXT: pmuludq %xmm2, %xmm6
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
 ; SSE2-NEXT: pmuludq %xmm7, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3]
 ; SSE2-NEXT: pmuludq %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE2-NEXT: pmuludq %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,2,2]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2]
-; SSE2-NEXT: pmuludq %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,0,2,2]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
-; SSE2-NEXT: pmuludq %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE2-NEXT: pmuludq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm8, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm9, %xmm3
+; SSE2-NEXT: pmuludq %xmm0, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
 ; SSE2-NEXT: pmuludq %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm10, %xmm2
+; SSE2-NEXT: pmuludq %xmm0, %xmm2
+; SSE2-NEXT: pmuludq %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
-; SSE2-NEXT: pmuludq %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,0,0]
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: test_v32i32:
Index: llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll
+++ llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll
@@ -5595,40 +5595,29 @@
 define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
 ; SSE-LABEL: mul_add_self_v4i64_v4i32:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: psrad $31, %xmm3
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE-NEXT: movdqa %xmm0, %xmm6
-; SSE-NEXT: psrad $31, %xmm6
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE-NEXT: movdqa %xmm4, %xmm5
-; SSE-NEXT: psrad $31, %xmm5
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; SSE-NEXT: movdqa %xmm1, %xmm7
-; SSE-NEXT: psrad $31, %xmm7
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
-; SSE-NEXT: pxor %xmm8, %xmm8
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1]
-; SSE-NEXT: pmuludq %xmm1, %xmm6
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
-; SSE-NEXT: pmuludq %xmm0, %xmm7
-; SSE-NEXT: paddq %xmm6, %xmm7
-; SSE-NEXT: psllq $32, %xmm7
-; SSE-NEXT: pmuludq %xmm0, %xmm1
-; SSE-NEXT: paddq %xmm7, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
-; SSE-NEXT: pmuludq %xmm4, %xmm3
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
-; SSE-NEXT: pmuludq %xmm2, %xmm5
-; SSE-NEXT: paddq %xmm3, %xmm5
-; SSE-NEXT: psllq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm2, %xmm4
-; SSE-NEXT: paddq %xmm5, %xmm4
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
-; SSE-NEXT: paddd %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: psrad $31, %xmm2
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,1,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
+; SSE-NEXT: pmuludq %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm6, %xmm6
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
+; SSE-NEXT: pmuludq %xmm1, %xmm2
+; SSE-NEXT: psllq $32, %xmm2
+; SSE-NEXT: paddq %xmm0, %xmm2
+; SSE-NEXT: pmuludq %xmm5, %xmm3
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; SSE-NEXT: pmuludq %xmm5, %xmm4
+; SSE-NEXT: psllq $32, %xmm4
+; SSE-NEXT: paddq %xmm3, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; SSE-NEXT: paddd %xmm2, %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: mul_add_self_v4i64_v4i32: