Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -17452,6 +17452,19 @@ return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3}); } + // i64 vector arithmetic shift can be emulated with the transform: + // M = SIGN_BIT u>> A + // R s>> A === ((R u>> A) ^ M) - M + if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget->hasInt256())) && + Op.getOpcode() == ISD::SRA) { + SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT); + SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt); + R = DAG.getNode(ISD::SRL, dl, VT, R, Amt); + R = DAG.getNode(ISD::XOR, dl, VT, R, M); + R = DAG.getNode(ISD::SUB, dl, VT, R, M); + return R; + } + // If possible, lower this packed shift into a vector multiply instead of // expanding it into a sequence of scalar shifts. // Do this only if the vector shift count is a constant build_vector. Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -163,7 +163,8 @@ { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence. { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence. - { ISD::SRA, MVT::v4i64, 4*10 }, // Scalarized. + { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence. + { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence. // Vectorizing division is a bad idea. See the SSE2 table for more comments. { ISD::SDIV, MVT::v32i8, 32*20 }, @@ -270,7 +271,7 @@ { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence. { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence. { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRA, MVT::v2i64, 2*10 }, // Scalarized. + { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence. // It is not a good idea to vectorize division.
We have to scalarize it and // in the process we will often end up having to spilling regular Index: test/Analysis/CostModel/X86/arith.ll =================================================================== --- test/Analysis/CostModel/X86/arith.ll +++ test/Analysis/CostModel/X86/arith.ll @@ -94,7 +94,7 @@ ; AVX2: cost of 1 {{.*}} ashr %C0 = ashr <4 x i32> undef, undef ; AVX: cost of 6 {{.*}} ashr - ; AVX2: cost of 20 {{.*}} ashr + ; AVX2: cost of 4 {{.*}} ashr %C1 = ashr <2 x i64> undef, undef ret void @@ -121,7 +121,7 @@ ; AVX2: cost of 1 {{.*}} ashr %C0 = ashr <8 x i32> undef, undef ; AVX: cost of 12 {{.*}} ashr - ; AVX2: cost of 40 {{.*}} ashr + ; AVX2: cost of 4 {{.*}} ashr %C1 = ashr <4 x i64> undef, undef ret void Index: test/Analysis/CostModel/X86/testshiftashr.ll =================================================================== --- test/Analysis/CostModel/X86/testshiftashr.ll +++ test/Analysis/CostModel/X86/testshiftashr.ll @@ -5,9 +5,9 @@ define %shifttype @shift2i16(%shifttype %a, %shifttype %b) { entry: ; SSE2: shift2i16 - ; SSE2: cost of 20 {{.*}} ashr + ; SSE2: cost of 12 {{.*}} ashr ; SSE2-CODEGEN: shift2i16 - ; SSE2-CODEGEN: sarq %cl + ; SSE2-CODEGEN: psrlq %0 = ashr %shifttype %a , %b ret %shifttype %0 @@ -65,9 +65,9 @@ define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) { entry: ; SSE2: shift2i32 - ; SSE2: cost of 20 {{.*}} ashr + ; SSE2: cost of 12 {{.*}} ashr ; SSE2-CODEGEN: shift2i32 - ; SSE2-CODEGEN: sarq %cl + ; SSE2-CODEGEN: psrlq %0 = ashr %shifttype2i32 %a , %b ret %shifttype2i32 %0 @@ -125,9 +125,9 @@ define %shifttype2i64 @shift2i64(%shifttype2i64 %a, %shifttype2i64 %b) { entry: ; SSE2: shift2i64 - ; SSE2: cost of 20 {{.*}} ashr + ; SSE2: cost of 12 {{.*}} ashr ; SSE2-CODEGEN: shift2i64 - ; SSE2-CODEGEN: sarq %cl + ; SSE2-CODEGEN: psrlq %0 = ashr %shifttype2i64 %a , %b ret %shifttype2i64 %0 @@ -137,9 +137,9 @@ define %shifttype4i64 @shift4i64(%shifttype4i64 %a, %shifttype4i64 %b) { entry: ; SSE2: shift4i64 - ; SSE2: 
cost of 40 {{.*}} ashr + ; SSE2: cost of 24 {{.*}} ashr ; SSE2-CODEGEN: shift4i64 - ; SSE2-CODEGEN: sarq %cl + ; SSE2-CODEGEN: psrlq %0 = ashr %shifttype4i64 %a , %b ret %shifttype4i64 %0 @@ -149,9 +149,9 @@ define %shifttype8i64 @shift8i64(%shifttype8i64 %a, %shifttype8i64 %b) { entry: ; SSE2: shift8i64 - ; SSE2: cost of 80 {{.*}} ashr + ; SSE2: cost of 48 {{.*}} ashr ; SSE2-CODEGEN: shift8i64 - ; SSE2-CODEGEN: sarq %cl + ; SSE2-CODEGEN: psrlq %0 = ashr %shifttype8i64 %a , %b ret %shifttype8i64 %0 @@ -161,9 +161,9 @@ define %shifttype16i64 @shift16i64(%shifttype16i64 %a, %shifttype16i64 %b) { entry: ; SSE2: shift16i64 - ; SSE2: cost of 160 {{.*}} ashr + ; SSE2: cost of 96 {{.*}} ashr ; SSE2-CODEGEN: shift16i64 - ; SSE2-CODEGEN: sarq %cl + ; SSE2-CODEGEN: psrlq %0 = ashr %shifttype16i64 %a , %b ret %shifttype16i64 %0 @@ -173,9 +173,9 @@ define %shifttype32i64 @shift32i64(%shifttype32i64 %a, %shifttype32i64 %b) { entry: ; SSE2: shift32i64 - ; SSE2: cost of 320 {{.*}} ashr + ; SSE2: cost of 192 {{.*}} ashr ; SSE2-CODEGEN: shift32i64 - ; SSE2-CODEGEN: sarq %cl + ; SSE2-CODEGEN: psrlq %0 = ashr %shifttype32i64 %a , %b ret %shifttype32i64 %0 @@ -185,9 +185,9 @@ define %shifttype2i8 @shift2i8(%shifttype2i8 %a, %shifttype2i8 %b) { entry: ; SSE2: shift2i8 - ; SSE2: cost of 20 {{.*}} ashr + ; SSE2: cost of 12 {{.*}} ashr ; SSE2-CODEGEN: shift2i8 - ; SSE2-CODEGEN: sarq %cl + ; SSE2-CODEGEN: psrlq %0 = ashr %shifttype2i8 %a , %b ret %shifttype2i8 %0 Index: test/CodeGen/X86/vector-shift-ashr-128.ll =================================================================== --- test/CodeGen/X86/vector-shift-ashr-128.ll +++ test/CodeGen/X86/vector-shift-ashr-128.ll @@ -13,89 +13,76 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE2-LABEL: var_shift_v2i64: ; SSE2: # BB#0: -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: movd %xmm1, %rcx -; SSE2-NEXT: sarq %cl, %rax -; SSE2-NEXT: movd %rax, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: 
movd %xmm0, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rcx -; SSE2-NEXT: sarq %cl, %rax -; SSE2-NEXT: movd %rax, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: psrlq %xmm3, %xmm4 +; SSE2-NEXT: psrlq %xmm1, %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrlq %xmm3, %xmm2 +; SSE2-NEXT: psrlq %xmm1, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE2-NEXT: xorpd %xmm4, %xmm2 +; SSE2-NEXT: psubq %xmm4, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_shift_v2i64: ; SSE41: # BB#0: -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: pextrq $1, %xmm1, %rcx -; SSE41-NEXT: sarq %cl, %rax -; SSE41-NEXT: movd %rax, %xmm2 -; SSE41-NEXT: movd %xmm0, %rax -; SSE41-NEXT: movd %xmm1, %rcx -; SSE41-NEXT: sarq %cl, %rax -; SSE41-NEXT: movd %rax, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlq %xmm1, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; SSE41-NEXT: psrlq %xmm4, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrlq %xmm1, %xmm3 +; SSE41-NEXT: psrlq %xmm4, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: psubq %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: var_shift_v2i64: -; AVX: # BB#0: -; AVX-NEXT: vpextrq $1, %xmm0, %rax -; AVX-NEXT: vpextrq $1, %xmm1, %rcx -; AVX-NEXT: sarq %cl, %rax -; AVX-NEXT: vmovq %rax, %xmm2 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: vmovq %xmm1, %rcx -; AVX-NEXT: sarq %cl, %rax -; AVX-NEXT: vmovq %rax, 
%xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX-NEXT: retq +; AVX1-LABEL: var_shift_v2i64: +; AVX1: # BB#0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlq %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v2i64: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v2i64: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: pushl %ebp -; X32-SSE-NEXT: pushl %ebx -; X32-SSE-NEXT: pushl %edi -; X32-SSE-NEXT: pushl %esi -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] -; X32-SSE-NEXT: movd %xmm2, %edx -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; X32-SSE-NEXT: movd %xmm2, %esi ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; X32-SSE-NEXT: movd %xmm2, %eax -; X32-SSE-NEXT: movb %al, %cl -; X32-SSE-NEXT: shrdl %cl, %edx, %esi -; X32-SSE-NEXT: movd %xmm0, %edi -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X32-SSE-NEXT: movd %xmm0, %ebx -; X32-SSE-NEXT: movd %xmm1, %ecx -; X32-SSE-NEXT: shrdl %cl, %ebx, %edi -; X32-SSE-NEXT: movl %ebx, %ebp -; X32-SSE-NEXT: sarl %cl, %ebp -; X32-SSE-NEXT: sarl $31, %ebx -; X32-SSE-NEXT: testb $32, %cl -; X32-SSE-NEXT: cmovnel %ebp, %edi -; X32-SSE-NEXT: movd %edi, %xmm0 -; X32-SSE-NEXT: cmovel %ebp, %ebx -; X32-SSE-NEXT: movl %edx, %edi -; X32-SSE-NEXT: movb %al, %cl -; X32-SSE-NEXT: sarl %cl, 
%edi -; X32-SSE-NEXT: sarl $31, %edx -; X32-SSE-NEXT: testb $32, %al -; X32-SSE-NEXT: cmovnel %edi, %esi -; X32-SSE-NEXT: movd %esi, %xmm1 -; X32-SSE-NEXT: movd %ebx, %xmm2 -; X32-SSE-NEXT: cmovel %edi, %edx -; X32-SSE-NEXT: movd %edx, %xmm3 -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE-NEXT: popl %esi -; X32-SSE-NEXT: popl %edi -; X32-SSE-NEXT: popl %ebx -; X32-SSE-NEXT: popl %ebp +; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: psrlq %xmm2, %xmm4 +; X32-SSE-NEXT: movq {{.*#+}} xmm5 = xmm1[0],zero +; X32-SSE-NEXT: psrlq %xmm5, %xmm3 +; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] +; X32-SSE-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE-NEXT: psrlq %xmm2, %xmm1 +; X32-SSE-NEXT: psrlq %xmm5, %xmm0 +; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; X32-SSE-NEXT: xorpd %xmm4, %xmm1 +; X32-SSE-NEXT: psubq %xmm4, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <2 x i64> %a, %b ret <2 x i64> %shift @@ -516,109 +503,32 @@ ; define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { -; SSE2-LABEL: splatvar_shift_v2i64: -; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: movd %xmm2, %rcx -; SSE2-NEXT: sarq %cl, %rax -; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rcx -; SSE2-NEXT: sarq %cl, %rax -; SSE2-NEXT: movd %rax, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_shift_v2i64: -; SSE41: # BB#0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE41-NEXT: 
pextrq $1, %xmm0, %rax -; SSE41-NEXT: pextrq $1, %xmm1, %rcx -; SSE41-NEXT: sarq %cl, %rax -; SSE41-NEXT: movd %rax, %xmm2 -; SSE41-NEXT: movd %xmm0, %rax -; SSE41-NEXT: movd %xmm1, %rcx -; SSE41-NEXT: sarq %cl, %rax -; SSE41-NEXT: movd %rax, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE41-NEXT: retq -; -; AVX1-LABEL: splatvar_shift_v2i64: -; AVX1: # BB#0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vpextrq $1, %xmm1, %rcx -; AVX1-NEXT: sarq %cl, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vmovq %xmm1, %rcx -; AVX1-NEXT: sarq %cl, %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-NEXT: retq +; SSE-LABEL: splatvar_shift_v2i64: +; SSE: # BB#0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; SSE-NEXT: psrlq %xmm1, %xmm2 +; SSE-NEXT: psrlq %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: psubq %xmm2, %xmm0 +; SSE-NEXT: retq ; -; AVX2-LABEL: splatvar_shift_v2i64: -; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: sarq %cl, %rax -; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vmovq %xmm1, %rcx -; AVX2-NEXT: sarq %cl, %rax -; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: retq +; AVX-LABEL: splatvar_shift_v2i64: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v2i64: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: pushl %ebp -; X32-SSE-NEXT: pushl %ebx -; X32-SSE-NEXT: pushl %edi -; X32-SSE-NEXT: pushl %esi -; X32-SSE-NEXT: pshufd {{.*#+}} 
xmm1 = xmm1[0,1,0,1] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] -; X32-SSE-NEXT: movd %xmm2, %edx -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; X32-SSE-NEXT: movd %xmm2, %esi -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; X32-SSE-NEXT: movd %xmm2, %eax -; X32-SSE-NEXT: movb %al, %cl -; X32-SSE-NEXT: shrdl %cl, %edx, %esi -; X32-SSE-NEXT: movd %xmm0, %edi -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X32-SSE-NEXT: movd %xmm0, %ebx -; X32-SSE-NEXT: movd %xmm1, %ecx -; X32-SSE-NEXT: shrdl %cl, %ebx, %edi -; X32-SSE-NEXT: movl %ebx, %ebp -; X32-SSE-NEXT: sarl %cl, %ebp -; X32-SSE-NEXT: sarl $31, %ebx -; X32-SSE-NEXT: testb $32, %cl -; X32-SSE-NEXT: cmovnel %ebp, %edi -; X32-SSE-NEXT: movd %edi, %xmm0 -; X32-SSE-NEXT: cmovel %ebp, %ebx -; X32-SSE-NEXT: movl %edx, %edi -; X32-SSE-NEXT: movb %al, %cl -; X32-SSE-NEXT: sarl %cl, %edi -; X32-SSE-NEXT: sarl $31, %edx -; X32-SSE-NEXT: testb $32, %al -; X32-SSE-NEXT: cmovnel %edi, %esi -; X32-SSE-NEXT: movd %esi, %xmm1 -; X32-SSE-NEXT: movd %ebx, %xmm2 -; X32-SSE-NEXT: cmovel %edi, %edx -; X32-SSE-NEXT: movd %edx, %xmm3 -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE-NEXT: popl %esi -; X32-SSE-NEXT: popl %edi -; X32-SSE-NEXT: popl %ebx -; X32-SSE-NEXT: popl %ebp +; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X32-SSE-NEXT: psrlq %xmm1, %xmm2 +; X32-SSE-NEXT: psrlq %xmm1, %xmm0 +; X32-SSE-NEXT: pxor %xmm2, %xmm0 +; X32-SSE-NEXT: psubq %xmm2, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer %shift = ashr <2 x i64> %a, %splat @@ -926,59 +836,63 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { ; SSE2-LABEL: constant_shift_v2i64: ; SSE2: # BB#0: -; SSE2-NEXT: 
movd %xmm0, %rax -; SSE2-NEXT: sarq %rax -; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: sarq $7, %rax -; SSE2-NEXT: movd %rax, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlq $7, %xmm1 +; SSE2-NEXT: psrlq $1, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE2-NEXT: movapd {{.*#+}} xmm0 = [4611686018427387904,72057594037927936] +; SSE2-NEXT: xorpd %xmm0, %xmm1 +; SSE2-NEXT: psubq %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v2i64: ; SSE41: # BB#0: -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: sarq $7, %rax -; SSE41-NEXT: movd %rax, %xmm1 -; SSE41-NEXT: movd %xmm0, %rax -; SSE41-NEXT: sarq %rax -; SSE41-NEXT: movd %rax, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlq $7, %xmm1 +; SSE41-NEXT: psrlq $1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936] +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: psubq %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: constant_shift_v2i64: -; AVX: # BB#0: -; AVX-NEXT: vpextrq $1, %xmm0, %rax -; AVX-NEXT: sarq $7, %rax -; AVX-NEXT: vmovq %rax, %xmm1 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: sarq %rax -; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: retq +; AVX1-LABEL: constant_shift_v2i64: +; AVX1: # BB#0: +; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v2i64: +; AVX2: # BB#0: +; AVX2-NEXT: 
vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v2i64: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X32-SSE-NEXT: movd %xmm1, %eax -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; X32-SSE-NEXT: movd %xmm1, %ecx -; X32-SSE-NEXT: shrdl $7, %ecx, %eax -; X32-SSE-NEXT: movd %eax, %xmm1 -; X32-SSE-NEXT: movd %xmm0, %eax -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X32-SSE-NEXT: movd %xmm0, %edx -; X32-SSE-NEXT: shrdl $1, %edx, %eax -; X32-SSE-NEXT: movd %eax, %xmm0 -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE-NEXT: sarl $7, %ecx -; X32-SSE-NEXT: movd %ecx, %xmm1 -; X32-SSE-NEXT: sarl %edx -; X32-SSE-NEXT: movd %edx, %xmm2 -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE-NEXT: movl $7, %eax +; X32-SSE-NEXT: movd %eax, %xmm2 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; X32-SSE-NEXT: movdqa %xmm1, %xmm3 +; X32-SSE-NEXT: psrlq %xmm2, %xmm3 +; X32-SSE-NEXT: movl $1, %eax +; X32-SSE-NEXT: movd %eax, %xmm4 +; X32-SSE-NEXT: psrlq %xmm4, %xmm1 +; X32-SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; X32-SSE-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE-NEXT: psrlq %xmm2, %xmm1 +; X32-SSE-NEXT: psrlq %xmm4, %xmm0 +; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; X32-SSE-NEXT: xorpd %xmm3, %xmm1 +; X32-SSE-NEXT: psubq %xmm3, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <2 x i64> %a, <i64 1, i64 7> ret <2 x i64> %shift Index: test/CodeGen/X86/vector-shift-ashr-256.ll =================================================================== --- test/CodeGen/X86/vector-shift-ashr-256.ll +++ test/CodeGen/X86/vector-shift-ashr-256.ll @@ -8,52 +8,37 @@
define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-LABEL: var_shift_v4i64: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpextrq $1, %xmm2, %rax -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpextrq $1, %xmm3, %rcx -; AVX1-NEXT: sarq %cl, %rax -; AVX1-NEXT: vmovq %rax, %xmm4 -; AVX1-NEXT: vmovq %xmm2, %rax -; AVX1-NEXT: vmovq %xmm3, %rcx -; AVX1-NEXT: sarq %cl, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vpextrq $1, %xmm1, %rcx -; AVX1-NEXT: sarq %cl, %rax -; AVX1-NEXT: vmovq %rax, %xmm3 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vmovq %xmm1, %rcx -; AVX1-NEXT: sarq %cl, %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] +; AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpsrlq %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7] +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] +; AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlq %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shift_v4i64: ; AVX2: # BB#0: -; 
AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpextrq $1, %xmm2, %rax -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpextrq $1, %xmm3, %rcx -; AVX2-NEXT: sarq %cl, %rax -; AVX2-NEXT: vmovq %rax, %xmm4 -; AVX2-NEXT: vmovq %xmm2, %rax -; AVX2-NEXT: vmovq %xmm3, %rcx -; AVX2-NEXT: sarq %cl, %rax -; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: sarq %cl, %rax -; AVX2-NEXT: vmovq %rax, %xmm3 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vmovq %xmm1, %rcx -; AVX2-NEXT: sarq %cl, %rax -; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: retq %shift = ashr <4 x i64> %a, %b ret <4 x i64> %shift @@ -245,55 +230,25 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-LABEL: splatvar_shift_v4i64: ; AVX1: # BB#0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpextrq $1, %xmm2, %rdx -; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: sarq %cl, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm3 -; AVX1-NEXT: vmovq %xmm2, %rsi -; AVX1-NEXT: vmovq %xmm1, %rdx -; AVX1-NEXT: movb %dl, %cl -; AVX1-NEXT: sarq %cl, %rsi -; AVX1-NEXT: vmovq %rsi, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; AVX1-NEXT: vpextrq $1, %xmm0, %rsi -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: sarq %cl, %rsi -; AVX1-NEXT: vmovq %rsi, %xmm2 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: movb %dl, %cl -; AVX1-NEXT: sarq %cl, %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-NEXT: 
vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_shift_v4i64: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpextrq $1, %xmm2, %rax -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpextrq $1, %xmm3, %rcx -; AVX2-NEXT: sarq %cl, %rax -; AVX2-NEXT: vmovq %rax, %xmm4 -; AVX2-NEXT: vmovq %xmm2, %rax -; AVX2-NEXT: vmovq %xmm3, %rcx -; AVX2-NEXT: sarq %cl, %rax -; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: sarq %cl, %rax -; AVX2-NEXT: vmovq %rax, %xmm3 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vmovq %xmm1, %rcx -; AVX2-NEXT: sarq %cl, %rax -; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 +; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer %shift = ashr <4 x i64> %a, %splat @@ -437,41 +392,27 @@ ; AVX1-LABEL: constant_shift_v4i64: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: sarq $62, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: sarq $31, 
%rax -; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: sarq $7, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: sarq %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4294967296,2] +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4611686018427387904,72057594037927936] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v4i64: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: sarq $62, %rax -; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: sarq $31, %rax -; AVX2-NEXT: vmovq %rax, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: sarq $7, %rax -; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: sarq %rax -; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4611686018427387904,72057594037927936,4294967296,2] +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %shift = ashr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62> ret <4 x i64> %shift