Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1016,6 +1016,7 @@
     setOperationAction(ISD::SHL,              MVT::v2i64, Custom);
     setOperationAction(ISD::SHL,              MVT::v4i32, Custom);
+    setOperationAction(ISD::SRA,              MVT::v2i64, Custom);
     setOperationAction(ISD::SRA,              MVT::v4i32, Custom);
   }
@@ -14707,6 +14708,32 @@
     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
   }
+  // pre-AVX512 i64 SRA needs to be performed as partial shifts.
+  const X86Subtarget &Subtarget =
+      static_cast<const X86Subtarget &>(DAG.getSubtarget());
+  if (VT == MVT::v2i64 && Opc == X86ISD::VSRAI && !Subtarget.hasAVX512()) {
+    MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
+    SDValue Ex = DAG.getNode(ISD::BITCAST, dl, ExVT, SrcOp);
+
+    if (ShiftAmt >= 32) {
+      // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
+      SDValue Upper =
+          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
+      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
+                                                 ShiftAmt - 32, DAG);
+      Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
+    } else {
+      // SRA upper i32, SRL whole i64 and select lower i32.
+      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
+                                                 ShiftAmt, DAG);
+      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, SrcOp,
+                                                 ShiftAmt, DAG);
+      Lower = DAG.getNode(ISD::BITCAST, dl, ExVT, Lower);
+      Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
+    }
+    return DAG.getNode(ISD::BITCAST, dl, VT, Ex);
+  }
+
   return DAG.getNode(Opc, dl, VT, SrcOp,
                      DAG.getConstant(ShiftAmt, dl, MVT::i8));
 }
@@ -16319,7 +16346,7 @@
     if (Op.getOpcode() == ISD::SRL)
       return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
-    if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
+    if (Op.getOpcode() == ISD::SRA && VT != MVT::v4i64)
       return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt, DAG);
   }
@@ -16522,7 +16549,8 @@
   }
 
   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
-  if (!Subtarget->is64Bit() && VT == MVT::v2i64 &&
+  if (!Subtarget->is64Bit() && VT == MVT::v2i64 &&
+      (Op.getOpcode() != ISD::SRA || Subtarget->hasAVX512()) &&
       Amt.getOpcode() == ISD::BITCAST &&
       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
     Amt = Amt.getOperand(0);
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -150,8 +150,8 @@
     { ISD::SRA,  MVT::v8i32,  1 },
     { ISD::SHL,  MVT::v2i64,  1 },
     { ISD::SRL,  MVT::v2i64,  1 },
-    { ISD::SHL,  MVT::v4i64,  1 },
-    { ISD::SRL,  MVT::v4i64,  1 },
+    { ISD::SHL,  MVT::v4i64,  1 },
+    { ISD::SRL,  MVT::v4i64,  1 },
 
     { ISD::SHL,  MVT::v32i8,  42 },  // cmpeqb sequence.
     { ISD::SHL,  MVT::v16i16, 16*10 }, // Scalarized.
@@ -208,12 +208,13 @@
     { ISD::SRL,  MVT::v4i32,  1 }, // psrld.
     { ISD::SRL,  MVT::v2i64,  1 }, // psrlq.
 
-    { ISD::SRA,  MVT::v16i8,  4 }, // psrlw, pand, pxor, psubb.
-    { ISD::SRA,  MVT::v8i16,  1 }, // psraw.
-    { ISD::SRA,  MVT::v4i32,  1 }, // psrad.
-
-    { ISD::SDIV, MVT::v8i16,  6 }, // pmulhw sequence
-    { ISD::UDIV, MVT::v8i16,  6 }, // pmulhuw sequence
+    { ISD::SRA,  MVT::v16i8,  4 }, // psrlw, pand, pxor, psubb.
+    { ISD::SRA,  MVT::v8i16,  1 }, // psraw.
+    { ISD::SRA,  MVT::v4i32,  1 }, // psrad.
+    { ISD::SRA,  MVT::v2i64,  4 }, // 2 x psrad + shuffle.
+ + { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence + { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence }; @@ -250,9 +251,9 @@ // For some cases, where the shift amount is a scalar we would be able // to generate better code. Unfortunately, when this is the case the value // (the splat) will get hoisted out of the loop, thereby making it invisible - // to ISel. The cost model must return worst case assumptions because it is - // used for vectorization and we don't want to make vectorized code worse - // than scalar code. + // to ISel. The cost model must return worst case assumptions because it is + // used for vectorization and we don't want to make vectorized code worse + // than scalar code. { ISD::SHL, MVT::v16i8, 30 }, // cmpeqb sequence. { ISD::SHL, MVT::v8i16, 8*10 }, // Scalarized. { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. @@ -268,9 +269,9 @@ { ISD::SRA, MVT::v8i16, 8*10 }, // Scalarized. { ISD::SRA, MVT::v4i32, 4*10 }, // Scalarized. { ISD::SRA, MVT::v2i64, 2*10 }, // Scalarized. - - // It is not a good idea to vectorize division. We have to scalarize it and - // in the process we will often end up having to spilling regular + + // It is not a good idea to vectorize division. We have to scalarize it and + // in the process we will often end up having to spilling regular // registers. The overhead of division is going to dominate most kernels // anyways so try hard to prevent vectorization of division - it is // generally a bad idea. Assume somewhat arbitrarily that we have to be able Index: test/Analysis/CostModel/X86/testshiftashr.ll =================================================================== --- test/Analysis/CostModel/X86/testshiftashr.ll +++ test/Analysis/CostModel/X86/testshiftashr.ll @@ -247,9 +247,9 @@ define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) { entry: ; SSE2: shift2i16const - ; SSE2: cost of 20 {{.*}} ashr + ; SSE2: cost of 4 {{.*}} ashr ; SSE2-CODEGEN: shift2i16const - ; SSE2-CODEGEN: sarq $ + ; SSE2-CODEGEN: psrad $3 %0 = ashr %shifttypec %a , ret %shifttypec %0 @@ -320,9 +320,9 @@ define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) { entry: ; SSE2: shift2i32c - ; SSE2: cost of 20 {{.*}} ashr + ; SSE2: cost of 4 {{.*}} ashr ; SSE2-CODEGEN: shift2i32c - ; SSE2-CODEGEN: sarq $3 + ; SSE2-CODEGEN: psrad $3 %0 = ashr %shifttypec2i32 %a , ret %shifttypec2i32 %0 @@ -391,9 +391,9 @@ define %shifttypec2i64 @shift2i64c(%shifttypec2i64 %a, %shifttypec2i64 %b) { entry: ; SSE2: shift2i64c - ; SSE2: cost of 20 {{.*}} ashr + ; SSE2: cost of 4 {{.*}} ashr ; SSE2-CODEGEN: shift2i64c - ; SSE2-CODEGEN: sarq $3 + ; SSE2-CODEGEN: psrad $3 %0 = ashr %shifttypec2i64 %a , ret %shifttypec2i64 %0 @@ -403,9 +403,9 @@ define %shifttypec4i64 @shift4i64c(%shifttypec4i64 %a, %shifttypec4i64 %b) { entry: ; SSE2: shift4i64c - ; SSE2: cost of 40 {{.*}} ashr + ; SSE2: cost of 8 {{.*}} ashr ; SSE2-CODEGEN: shift4i64c - ; SSE2-CODEGEN: sarq $3 + ; SSE2-CODEGEN: psrad $3 %0 = ashr %shifttypec4i64 %a , ret %shifttypec4i64 %0 @@ -415,9 +415,9 @@ define %shifttypec8i64 @shift8i64c(%shifttypec8i64 %a, %shifttypec8i64 %b) { entry: ; SSE2: shift8i64c - ; SSE2: cost of 80 {{.*}} ashr + ; SSE2: cost of 16 {{.*}} ashr ; SSE2-CODEGEN: shift8i64c - ; SSE2-CODEGEN: sarq $3 + ; SSE2-CODEGEN: psrad $3 %0 = ashr %shifttypec8i64 %a , @@ -428,9 +428,9 @@ define %shifttypec16i64 @shift16i64c(%shifttypec16i64 %a, %shifttypec16i64 %b) { entry: ; SSE2: 
shift16i64c - ; SSE2: cost of 160 {{.*}} ashr + ; SSE2: cost of 32 {{.*}} ashr ; SSE2-CODEGEN: shift16i64c - ; SSE2-CODEGEN: sarq $3 + ; SSE2-CODEGEN: psrad $3 %0 = ashr %shifttypec16i64 %a , ret %shifttypec2i8 %0 Index: test/CodeGen/X86/vec_int_to_fp.ll =================================================================== --- test/CodeGen/X86/vec_int_to_fp.ll +++ test/CodeGen/X86/vec_int_to_fp.ll @@ -35,13 +35,16 @@ ; SSE2-LABEL: sitofp_2vf64_i32: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: psllq $32, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movd %xmm1, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %xmm0, %rcx -; SSE2-NEXT: movslq %ecx, %rcx ; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2sdq %rcx, %xmm0 +; SSE2-NEXT: cvtsi2sdq %rax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm1, %rax ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2sdq %rax, %xmm1 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -50,14 +53,16 @@ ; AVX-LABEL: sitofp_2vf64_i32: ; AVX: # BB#0: ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX-NEXT: vpsrad $31, %xmm0, %xmm1 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 ; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: cltq -; AVX-NEXT: vpextrq $1, %xmm0, %rcx -; AVX-NEXT: movslq %ecx, %rcx ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vcvtsi2sdq %rcx, %xmm0, %xmm0 -; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %cvt = sitofp <2 x i32> %shuf to <2 x double> @@ -109,23 +114,29 @@ ; SSE2-LABEL: sitofp_4vf64_i32: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] -; SSE2-NEXT: movd %xmm1, %rax -; SSE2-NEXT: cltq +; SSE2-NEXT: psllq $32, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: movd %xmm3, %rax ; SSE2-NEXT: cvtsi2sdq %rax, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] ; SSE2-NEXT: movd %xmm1, %rax -; SSE2-NEXT: cltq ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2sdq %rax, %xmm1 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: cltq +; SSE2-NEXT: psllq $32, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE2-NEXT: movd %xmm3, %rax ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2sdq %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] ; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: cltq ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2sdq %rax, %xmm0 
; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] Index: test/CodeGen/X86/vector-sext.ll =================================================================== --- test/CodeGen/X86/vector-sext.ll +++ test/CodeGen/X86/vector-sext.ll @@ -75,67 +75,49 @@ ; SSE2-LABEL: sext_4i32_to_4i64: ; SSE2: # BB#0: # %entry ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] -; SSE2-NEXT: movd %xmm1, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %rax, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm1, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: psllq $32, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %rax, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: psllq $32, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: sext_4i32_to_4i64: ; SSSE3: # BB#0: # %entry ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] -; SSSE3-NEXT: movd %xmm1, %rax -; SSSE3-NEXT: cltq -; SSSE3-NEXT: movd %rax, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSSE3-NEXT: movd %xmm1, %rax -; SSSE3-NEXT: cltq -; SSSE3-NEXT: movd %rax, %xmm1 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-NEXT: psllq $32, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSSE3-NEXT: movd %xmm0, %rax -; SSSE3-NEXT: cltq -; SSSE3-NEXT: movd %rax, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSSE3-NEXT: movd %xmm0, %rax -; SSSE3-NEXT: cltq -; SSSE3-NEXT: movd %rax, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSSE3-NEXT: psllq $32, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: sext_4i32_to_4i64: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: pmovzxdq %xmm0, %xmm1 -; SSE41-NEXT: pextrq $1, %xmm1, %rax -; SSE41-NEXT: cltq -; SSE41-NEXT: movd %rax, %xmm3 -; SSE41-NEXT: movd %xmm1, %rax -; SSE41-NEXT: cltq -; SSE41-NEXT: movd %rax, %xmm2 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: cltq -; SSE41-NEXT: movd %rax, %xmm3 -; SSE41-NEXT: movd %xmm0, %rax -; SSE41-NEXT: cltq -; SSE41-NEXT: movd %rax, %xmm1 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: psllq $32, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: pblendw 
{{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSE41-NEXT: psllq $32, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; @@ -154,20 +136,16 @@ ; ; X32-SSE41-LABEL: sext_4i32_to_4i64: ; X32-SSE41: # BB#0: # %entry -; X32-SSE41-NEXT: pmovzxdq %xmm0, %xmm2 -; X32-SSE41-NEXT: movd %xmm2, %eax -; X32-SSE41-NEXT: sarl $31, %eax -; X32-SSE41-NEXT: pextrd $2, %xmm2, %ecx -; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2 -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2 +; X32-SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; X32-SSE41-NEXT: psllq $32, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X32-SSE41-NEXT: psrad $31, %xmm2 +; X32-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] -; X32-SSE41-NEXT: movd %xmm1, %eax -; X32-SSE41-NEXT: sarl $31, %eax -; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx -; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1 -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm1 +; X32-SSE41-NEXT: psllq $32, %xmm1 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X32-SSE41-NEXT: psrad $31, %xmm1 +; X32-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 ; X32-SSE41-NEXT: retl entry: @@ -411,23 +389,17 @@ ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] -; SSE2-NEXT: movd %xmm1, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %rax, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm1, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: psllq $32, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %rax, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: psllq $32, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; @@ -436,23 +408,17 @@ ; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] -; SSSE3-NEXT: movd %xmm1, %rax -; SSSE3-NEXT: cltq -; SSSE3-NEXT: movd %rax, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSSE3-NEXT: movd %xmm1, %rax -; SSSE3-NEXT: cltq -; SSSE3-NEXT: movd %rax, %xmm1 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-NEXT: psllq $32, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSSE3-NEXT: movd %xmm0, %rax -; 
SSSE3-NEXT: cltq -; SSSE3-NEXT: movd %rax, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSSE3-NEXT: movd %xmm0, %rax -; SSSE3-NEXT: cltq -; SSSE3-NEXT: movd %rax, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSSE3-NEXT: psllq $32, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: retq ; @@ -460,22 +426,16 @@ ; SSE41: # BB#0: ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: pmovzxdq %xmm0, %xmm1 -; SSE41-NEXT: pextrq $1, %xmm1, %rax -; SSE41-NEXT: cltq -; SSE41-NEXT: movd %rax, %xmm3 -; SSE41-NEXT: movd %xmm1, %rax -; SSE41-NEXT: cltq -; SSE41-NEXT: movd %rax, %xmm2 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: cltq -; SSE41-NEXT: movd %rax, %xmm3 -; SSE41-NEXT: movd %xmm0, %rax -; SSE41-NEXT: cltq -; SSE41-NEXT: movd %rax, %xmm1 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: psllq $32, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSE41-NEXT: psllq $32, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; @@ -500,20 +460,16 @@ ; X32-SSE41: # BB#0: ; X32-SSE41-NEXT: pslld $31, %xmm0 ; X32-SSE41-NEXT: psrad $31, %xmm0 -; X32-SSE41-NEXT: pmovzxdq %xmm0, %xmm2 -; X32-SSE41-NEXT: movd %xmm2, %eax -; X32-SSE41-NEXT: sarl $31, %eax -; X32-SSE41-NEXT: pextrd $2, %xmm2, %ecx -; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2 -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2 +; X32-SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; X32-SSE41-NEXT: psllq $32, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X32-SSE41-NEXT: psrad $31, %xmm2 +; X32-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] -; X32-SSE41-NEXT: movd %xmm1, %eax -; X32-SSE41-NEXT: sarl $31, %eax -; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx -; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1 -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm1 +; X32-SSE41-NEXT: psllq $32, %xmm1 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X32-SSE41-NEXT: psrad $31, %xmm1 +; X32-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 ; X32-SSE41-NEXT: retl %extmask = sext <4 x i1> %mask to <4 x i64> @@ -577,23 +533,17 @@ ; SSE2-NEXT: pslld $24, %xmm0 ; SSE2-NEXT: psrad $24, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] -; SSE2-NEXT: movd %xmm1, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %rax, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm1, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: psllq $32, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: 
pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %rax, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: psllq $32, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; @@ -602,23 +552,17 @@ ; SSSE3-NEXT: pslld $24, %xmm0 ; SSSE3-NEXT: psrad $24, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] -; SSSE3-NEXT: movd %xmm1, %rax -; SSSE3-NEXT: cltq -; SSSE3-NEXT: movd %rax, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSSE3-NEXT: movd %xmm1, %rax -; SSSE3-NEXT: cltq -; SSSE3-NEXT: movd %rax, %xmm1 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-NEXT: psllq $32, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSSE3-NEXT: movd %xmm0, %rax -; SSSE3-NEXT: cltq -; SSSE3-NEXT: movd %rax, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSSE3-NEXT: movd %xmm0, %rax -; SSSE3-NEXT: cltq -; SSSE3-NEXT: movd %rax, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSSE3-NEXT: psllq $32, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: retq ; @@ -626,22 +570,16 @@ ; SSE41: # BB#0: ; SSE41-NEXT: pslld $24, %xmm0 ; SSE41-NEXT: psrad $24, %xmm0 -; SSE41-NEXT: pmovzxdq %xmm0, %xmm1 -; SSE41-NEXT: pextrq $1, %xmm1, %rax -; SSE41-NEXT: cltq -; SSE41-NEXT: movd %rax, %xmm3 -; SSE41-NEXT: movd %xmm1, %rax -; SSE41-NEXT: cltq -; SSE41-NEXT: movd %rax, %xmm2 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: cltq -; SSE41-NEXT: movd %rax, %xmm3 -; SSE41-NEXT: movd %xmm0, %rax -; SSE41-NEXT: cltq -; SSE41-NEXT: movd %rax, %xmm1 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: psllq $32, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSE41-NEXT: psllq $32, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; @@ -666,20 +604,16 @@ ; X32-SSE41: # BB#0: ; X32-SSE41-NEXT: pslld $24, %xmm0 ; X32-SSE41-NEXT: psrad $24, %xmm0 -; X32-SSE41-NEXT: pmovzxdq %xmm0, %xmm2 -; X32-SSE41-NEXT: movd %xmm2, %eax -; X32-SSE41-NEXT: sarl $31, %eax -; X32-SSE41-NEXT: pextrd $2, %xmm2, %ecx -; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2 -; X32-SSE41-NEXT: sarl $31, 
%ecx -; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2 +; X32-SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; X32-SSE41-NEXT: psllq $32, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X32-SSE41-NEXT: psrad $31, %xmm2 +; X32-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] -; X32-SSE41-NEXT: movd %xmm1, %eax -; X32-SSE41-NEXT: sarl $31, %eax -; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx -; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1 -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm1 +; X32-SSE41-NEXT: psllq $32, %xmm1 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X32-SSE41-NEXT: psrad $31, %xmm1 +; X32-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 ; X32-SSE41-NEXT: retl %extmask = sext <4 x i8> %mask to <4 x i64> Index: test/CodeGen/X86/vshift-3.ll =================================================================== --- test/CodeGen/X86/vshift-3.ll +++ test/CodeGen/X86/vshift-3.ll @@ -3,13 +3,12 @@ ; test vector shifts converted to proper SSE2 vector shifts when the shift ; amounts are the same. -; Note that x86 does have ashr +; Note that x86 does have ashr -; shift1a can't use a packed shift define void @shift1a(<2 x i64> %val, <2 x i64>* %dst) nounwind { entry: ; CHECK-LABEL: shift1a: -; CHECK: sarl +; CHECK: psrad $31 %ashr = ashr <2 x i64> %val, < i64 32, i64 32 > store <2 x i64> %ashr, <2 x i64>* %dst ret void Index: test/CodeGen/X86/widen_conv-2.ll =================================================================== --- test/CodeGen/X86/widen_conv-2.ll +++ test/CodeGen/X86/widen_conv-2.ll @@ -1,8 +1,9 @@ ; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s -; CHECK: {{cwtl|movswl}} -; CHECK: {{cwtl|movswl}} +; CHECK: psllq $48, %xmm0 +; CHECK: psrad $16, %xmm0 +; CHECK: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; sign extension v2i32 to v2i16 +; sign extension v2i16 to v2i32 define void @convert(<2 x i32>* %dst.addr, <2 x i16> %src) nounwind { entry:
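
For reference, the new constant-amount v2i64 ashr lowering in X86ISelLowering.cpp works entirely on i32 halves: for shift amounts of 32 or more it sign-splats the upper dword (psrad $31) and arithmetic-shifts the upper dword down into the lower one, while for smaller amounts it arithmetic-shifts the upper dword and logical-shifts the whole qword, after which the {5, 1, 7, 3} / {4, 1, 6, 3} shuffles interleave the computed dwords back into i64 lanes. Below is a minimal stand-alone C++ sketch, not part of the patch, that models one 64-bit element of that sequence and checks it against a plain arithmetic shift; the helper name ashr64ViaPartialShifts and the test values are invented here, and the sketch assumes right shifts of negative signed integers behave arithmetically, which holds for the compilers LLVM supports.

// Minimal scalar model of the v2i64 SRA-by-constant lowering above.
// NOTE: stand-alone illustration only, not part of the patch. The helper name
// and test values are invented; right shifts of negative signed integers are
// assumed to be arithmetic (true for the compilers LLVM supports).
#include <cassert>
#include <cstdint>
#include <cstdio>

// Emulates ashr(x, s) for 0 < s < 64 using only 32-bit shifts, mirroring the
// VSRAI/VSRLI + shuffle sequence emitted for each v2i64 element.
static int64_t ashr64ViaPartialShifts(int64_t x, unsigned s) {
  int32_t hi = static_cast<int32_t>(static_cast<uint64_t>(x) >> 32);
  uint32_t upper, lower;
  if (s >= 32) {
    // Splat sign into the upper dword, SRA the upper dword into the lower one.
    upper = static_cast<uint32_t>(hi >> 31);       // VSRAI(Ex, 31)
    lower = static_cast<uint32_t>(hi >> (s - 32)); // VSRAI(Ex, s - 32)
  } else {
    // SRA the upper dword, SRL the whole qword, then keep its lower dword.
    upper = static_cast<uint32_t>(hi >> s);                       // VSRAI(Ex, s)
    lower = static_cast<uint32_t>(static_cast<uint64_t>(x) >> s); // VSRLI(SrcOp, s)
  }
  // The {5, 1, 7, 3} / {4, 1, 6, 3} shuffles perform this per-lane recombination.
  return static_cast<int64_t>((static_cast<uint64_t>(upper) << 32) | lower);
}

int main() {
  const int64_t Vals[] = {0, 1, -1, 42, -42, INT64_MIN, INT64_MAX,
                          0x123456789abcdef0LL, -0x123456789abcdef0LL};
  for (int64_t V : Vals)
    for (unsigned S = 1; S < 64; ++S)
      assert(ashr64ViaPartialShifts(V, S) == (V >> S));
  std::puts("partial-shift SRA model matches a plain 64-bit ashr");
  return 0;
}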