Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -23434,12 +23434,6 @@
       return R;
   }
 
-  // If possible, lower this packed shift into a vector multiply instead of
-  // expanding it into a sequence of scalar shifts.
-  if (Op.getOpcode() == ISD::SHL)
-    if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
-      return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
-
   // If possible, lower this shift as a sequence of two shifts by
   // constant plus a BLENDing shuffle instead of scalarizing it.
   // Example:
@@ -23485,6 +23479,12 @@
     }
   }
 
+  // If possible, lower this packed shift into a vector multiply instead of
+  // expanding it into a sequence of scalar shifts.
+  if (Op.getOpcode() == ISD::SHL)
+    if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
+      return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
+
   // v4i32 Non Uniform Shifts.
   // If the shift amount is constant we can shift each lane using the SSE2
   // immediate shifts, else we need to zero-extend each lane to the lower i64
Index: test/CodeGen/X86/combine-shl.ll
===================================================================
--- test/CodeGen/X86/combine-shl.ll
+++ test/CodeGen/X86/combine-shl.ll
@@ -212,8 +212,14 @@
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; SSE-NEXT:    pmovsxwd %xmm1, %xmm1
 ; SSE-NEXT:    pmovsxwd %xmm0, %xmm0
-; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
-; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pslld $30, %xmm2
+; SSE-NEXT:    pslld $31, %xmm0
+; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    pslld $28, %xmm2
+; SSE-NEXT:    pslld $29, %xmm1
+; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_shl_ext_shl1:
Index: test/CodeGen/X86/vec_shift6.ll
===================================================================
--- test/CodeGen/X86/vec_shift6.ll
+++ test/CodeGen/X86/vec_shift6.ll
@@ -71,7 +71,10 @@
 define <4 x i32> @test4(<4 x i32> %a) {
 ; SSE-LABEL: test4:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    pslld $1, %xmm1
+; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: test4:
Index: test/CodeGen/X86/widen_arith-4.ll
===================================================================
--- test/CodeGen/X86/widen_arith-4.ll
+++ test/CodeGen/X86/widen_arith-4.ll
@@ -14,7 +14,6 @@
 ; CHECK-NEXT:    movw $0, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movl $0, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = <271,271,271,271,271,u,u,u>
-; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <2,4,2,2,2,u,u,u>
 ; CHECK-NEXT:    jmp .LBB0_1
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_2: # %forbody
@@ -23,10 +22,13 @@
 ; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; CHECK-NEXT:    shlq $4, %rax
 ; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rdx
-; CHECK-NEXT:    movdqa (%rdx,%rax), %xmm2
-; CHECK-NEXT:    psubw %xmm0, %xmm2
-; CHECK-NEXT:    pmullw %xmm1, %xmm2
-; CHECK-NEXT:    pextrw $4, %xmm2, 8(%rcx,%rax)
+; CHECK-NEXT:    movdqa (%rdx,%rax), %xmm1
+; CHECK-NEXT:    psubw %xmm0, %xmm1
+; CHECK-NEXT:    movdqa %xmm1, %xmm2
+; CHECK-NEXT:    psllw $2, %xmm2
+; CHECK-NEXT:    psllw $1, %xmm1
+; CHECK-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
+; CHECK-NEXT:    pextrw $4, %xmm1, 8(%rcx,%rax)
 ; CHECK-NEXT:    movq %xmm2, (%rcx,%rax)
 ; CHECK-NEXT:    incl -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:  .LBB0_1: # %forcond