Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18546,6 +18546,69 @@
                                   DAG.getBitcast(VT, SV1), Mask, DAG);
 }
 
+static SDValue combineConcatVectorOfCasts(SDNode *N, SelectionDAG &DAG) {
+  unsigned CastOpcode = N->getOperand(0).getOpcode();
+  switch (CastOpcode) {
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+    // TODO: Allow more opcodes?
+    // case ISD::BITCAST:
+    // case ISD::TRUNCATE:
+    // case ISD::ZERO_EXTEND:
+    // case ISD::SIGN_EXTEND:
+    // case ISD::FP_EXTEND:
+    break;
+  default:
+    return SDValue();
+  }
+
+  EVT SrcVT = N->getOperand(0).getOperand(0).getValueType();
+  if (!SrcVT.isVector())
+    return SDValue();
+
+  // All operands of the concat must be the same kind of cast from the same
+  // source type.
+  SmallVector<SDValue, 4> SrcOps;
+  for (SDValue Op : N->ops()) {
+    if (Op.getOpcode() != CastOpcode || !Op.hasOneUse() ||
+        Op.getOperand(0).getValueType() != SrcVT)
+      return SDValue();
+    SrcOps.push_back(Op.getOperand(0));
+  }
+
+  // The wider cast must be supported by the target. This is unusual because
+  // the operation support type parameter depends on the opcode. In addition,
+  // check the other type in the cast to make sure this is really legal.
+  EVT VT = N->getValueType(0);
+  EVT SrcEltVT = SrcVT.getVectorElementType();
+  unsigned NumElts = SrcVT.getVectorElementCount().Min * N->getNumOperands();
+  EVT ConcatSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcEltVT, NumElts);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  switch (CastOpcode) {
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    if (!TLI.isOperationLegalOrCustom(CastOpcode, ConcatSrcVT) ||
+        !TLI.isTypeLegal(VT))
+      return SDValue();
+    break;
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+    if (!TLI.isOperationLegalOrCustom(CastOpcode, VT) ||
+        !TLI.isTypeLegal(ConcatSrcVT))
+      return SDValue();
+    break;
+  default:
+    llvm_unreachable("Unexpected cast opcode");
+  }
+
+  // concat (cast X), (cast Y)... -> cast (concat X, Y...)
+  SDLoc DL(N);
+  SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatSrcVT, SrcOps);
+  return DAG.getNode(CastOpcode, DL, VT, NewConcat);
+}
+
 SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
   // If we only have one input vector, we don't need to do any concatenation.
   if (N->getNumOperands() == 1)
@@ -18673,6 +18736,9 @@
   if (SDValue V = combineConcatVectorOfExtracts(N, DAG))
     return V;
 
+  if (SDValue V = combineConcatVectorOfCasts(N, DAG))
+    return V;
+
   // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR
   // nodes often generate nop CONCAT_VECTOR nodes.
   // Scan the CONCAT_VECTOR operands and look for a CONCAT operations that
Index: llvm/test/CodeGen/X86/avx-shift.ll
===================================================================
--- llvm/test/CodeGen/X86/avx-shift.ll
+++ llvm/test/CodeGen/X86/avx-shift.ll
@@ -167,15 +167,14 @@
 define <8 x i32> @vshift08(<8 x i32> %a) {
 ; CHECK-LABEL: vshift08:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpslld $23, %xmm0, %xmm1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vpslld $23, %xmm1, %xmm1
 ; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
 ; CHECK-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT:    vcvttps2dq %xmm1, %xmm1
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; CHECK-NEXT:    vpslld $23, %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %bitop = shl <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, %a
   ret <8 x i32> %bitop
Index: llvm/test/CodeGen/X86/concat-cast.ll
===================================================================
--- llvm/test/CodeGen/X86/concat-cast.ll
+++ llvm/test/CodeGen/X86/concat-cast.ll
@@ -8,16 +8,14 @@
 define <4 x float> @sitofp_v4i32_v4f32(<2 x i32> %x, <2 x i32> %y) {
 ; SSE-LABEL: sitofp_v4i32_v4f32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE-NEXT:    cvtdq2ps %xmm1, %xmm1
 ; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: sitofp_v4i32_v4f32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT:    vcvtdq2ps %xmm1, %xmm1
 ; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %s0 = sitofp <2 x i32> %x to <2 x float>
   %s1 = sitofp <2 x i32> %y to <2 x float>
@@ -28,68 +26,55 @@
 define <4 x float> @uitofp_v4i32_v4f32(<2 x i32> %x, <2 x i32> %y) {
 ; SSE2-LABEL: uitofp_v4i32_v4f32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    xorpd %xmm2, %xmm2
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    movapd {{.*#+}} xmm3 = [4.503599627370496E+15,4.503599627370496E+15]
-; SSE2-NEXT:    orpd %xmm3, %xmm0
-; SSE2-NEXT:    subpd %xmm3, %xmm0
-; SSE2-NEXT:    cvtpd2ps %xmm0, %xmm0
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    orpd %xmm3, %xmm1
-; SSE2-NEXT:    subpd %xmm3, %xmm1
-; SSE2-NEXT:    cvtpd2ps %xmm1, %xmm1
-; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    por {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    subps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    addps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: uitofp_v4i32_v4f32:
 ; SSE4:       # %bb.0:
-; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; SSE4-NEXT:    movdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15]
-; SSE4-NEXT:    por %xmm2, %xmm0
-; SSE4-NEXT:    subpd %xmm2, %xmm0
-; SSE4-NEXT:    cvtpd2ps %xmm0, %xmm0
-; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; SSE4-NEXT:    por %xmm2, %xmm1
-; SSE4-NEXT:    subpd %xmm2, %xmm1
-; SSE4-NEXT:    cvtpd2ps %xmm1, %xmm1
-; SSE4-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE4-NEXT:    movdqa {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE4-NEXT:    psrld $16, %xmm0
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; SSE4-NEXT:    subps {{.*}}(%rip), %xmm0
+; SSE4-NEXT:    addps %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: uitofp_v4i32_v4f32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15]
-; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vsubpd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vcvtpd2ps %xmm0, %xmm0
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vsubpd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vcvtpd2ps %xmm1, %xmm1
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; AVX1-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: uitofp_v4i32_v4f32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15]
-; AVX2-NEXT:    vpor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vsubpd %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vcvtpd2ps %xmm0, %xmm0
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX2-NEXT:    vpor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vsubpd %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vcvtpd2ps %xmm1, %xmm1
-; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
+; AVX2-NEXT:    vsubps %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: uitofp_v4i32_v4f32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512-NEXT:    vcvtudq2ps %zmm0, %zmm0
-; AVX512-NEXT:    vcvtudq2ps %zmm1, %zmm1
 ; AVX512-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %s0 = uitofp <2 x i32> %x to <2 x float>
@@ -101,16 +86,14 @@
 define <4 x i32> @fptosi_v4f32_v4i32(<2 x float> %x, <2 x float> %y) {
 ; SSE-LABEL: fptosi_v4f32_v4i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
 ; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: fptosi_v4f32_v4i32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT:    vcvttps2dq %xmm1, %xmm1
 ; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %s0 = fptosi <2 x float> %x to <2 x i32>
   %s1 = fptosi <2 x float> %y to <2 x i32>
@@ -208,11 +191,9 @@
 ;
 ; AVX512-LABEL: fptoui_v4f32_v4i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512-NEXT:    vcvttps2udq %zmm0, %zmm0
-; AVX512-NEXT:    vcvttps2udq %zmm1, %zmm1
 ; AVX512-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT:    vcvttps2udq %zmm0, %zmm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %s0 = fptoui <2 x float> %x to <2 x i32>
@@ -230,9 +211,8 @@
 ;
 ; AVX-LABEL: sitofp_v4i32_v4f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT:    vcvtdq2pd %xmm1, %xmm1
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
 ; AVX-NEXT:    retq
   %s0 = sitofp <2 x i32> %x to <2 x double>
   %s1 = sitofp <2 x i32> %y to <2 x double>
@@ -267,34 +247,27 @@
 ; AVX1-LABEL: uitofp_v4i32_v4f64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15]
-; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vsubpd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vsubpd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15]
+; AVX1-NEXT:    vorpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: uitofp_v4i32_v4f64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15]
-; AVX2-NEXT:    vpor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vsubpd %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX2-NEXT:    vpor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vsubpd %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15]
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: uitofp_v4i32_v4f64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; AVX512-NEXT:    vcvtudq2pd %ymm1, %zmm1
-; AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512-NEXT:    retq
   %s0 = uitofp <2 x i32> %x to <2 x double>
   %s1 = uitofp <2 x i32> %y to <2 x double>
@@ -312,9 +285,10 @@
 ;
 ; AVX-LABEL: fptosi_v4f64_v4i32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT:    vcvttpd2dq %xmm1, %xmm1
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
   %s0 = fptosi <2 x double> %x to <2 x i32>
   %s1 = fptosi <2 x double> %y to <2 x i32>
@@ -404,11 +378,10 @@
 ;
 ; AVX512-LABEL: fptoui_v4f64_v4i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vcvttpd2udq %zmm0, %ymm0
-; AVX512-NEXT:    vcvttpd2udq %zmm1, %ymm1
-; AVX512-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %s0 = fptoui <2 x double> %x to <2 x i32>
@@ -417,6 +390,8 @@
   ret <4 x i32> %r
 }
 
+; Negative test
+
 define <4 x float> @mismatch_tofp_v4i32_v4f32(<2 x i32> %x, <2 x i32> %y) {
 ; SSE2-LABEL: mismatch_tofp_v4i32_v4f32:
 ; SSE2:       # %bb.0:
@@ -477,6 +452,8 @@
   ret <4 x float> %r
 }
 
+; Negative test
+
 define <4 x float> @sitofp_v4i32_v4f32_extra_use(<2 x i32> %x, <2 x i32> %y, <2 x float>* %p) {
 ; SSE-LABEL: sitofp_v4i32_v4f32_extra_use:
 ; SSE:       # %bb.0:
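
Note (not part of the patch): the fold added above rewrites concat (cast X), (cast Y)... into cast (concat X, Y...). As a reviewer aid, here is a minimal standalone C++ sketch of the elementwise equivalence the combine relies on, modeling vectors as std::vector and a sint_to_fp cast as static_cast; the helper names are invented for illustration and this is not LLVM API code:

#include <cassert>
#include <cstdint>
#include <vector>

// concat (cast X), (cast Y): cast each narrow vector, then concatenate.
static std::vector<float> castThenConcat(const std::vector<int32_t> &X,
                                         const std::vector<int32_t> &Y) {
  std::vector<float> R;
  for (int32_t V : X)
    R.push_back(static_cast<float>(V));
  for (int32_t V : Y)
    R.push_back(static_cast<float>(V));
  return R;
}

// cast (concat X, Y): concatenate first, then do one wide cast.
static std::vector<float> concatThenCast(const std::vector<int32_t> &X,
                                         const std::vector<int32_t> &Y) {
  std::vector<int32_t> C(X);
  C.insert(C.end(), Y.begin(), Y.end());
  std::vector<float> R;
  for (int32_t V : C)
    R.push_back(static_cast<float>(V));
  return R;
}

int main() {
  // The two forms agree elementwise, so the combine only has to prove that
  // the wide cast and the concatenated source type are legal (or custom)
  // for the target before rewriting, as the TLI checks above do.
  std::vector<int32_t> X = {1, -2}, Y = {3, 40};
  assert(castThenConcat(X, Y) == concatThenCast(X, Y));
  return 0;
}

The test diffs show the payoff: two narrow converts plus a shuffle become one shuffle plus a single wide convert (e.g. two cvtdq2ps feeding movlhps collapse to movlhps followed by one cvtdq2ps).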