diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -19029,6 +19029,21 @@ return Res; } + if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 && + Op->getSimpleValueType(0) == MVT::v4f64) { + SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V); + SDValue VBias = + DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v4f64); + SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn, + DAG.getBitcast(MVT::v4i64, VBias)); + Or = DAG.getBitcast(MVT::v4f64, Or); + + if (IsStrict) + return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other}, + {Op.getOperand(0), Or, VBias}); + return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias); + } + // The algorithm is the following: // #ifdef __SSE4_1__ // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll @@ -320,14 +320,10 @@ define <4 x double> @uitofp_v4i32_v4f64(<4 x i32> %x) #0 { ; AVX1-LABEL: uitofp_v4i32_v4f64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1 -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4] -; AVX1-NEXT: vmulpd %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: uitofp_v4i32_v4f64: diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -737,13 +737,9 @@ ; ; VEX-LABEL: uitofp_4i32_to_2f64: ; VEX: # %bb.0: -; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; VEX-NEXT: vpsrld $16, %xmm0, %xmm0 -; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1 -; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0 -; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; VEX-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; VEX-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 ; VEX-NEXT: retq ; ; AVX512F-LABEL: uitofp_4i32_to_2f64: @@ -1046,24 +1042,20 @@ ; AVX1-LABEL: uitofp_4i32_to_4f64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: vorpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: uitofp_4i32_to_4f64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4] -; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15] +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: uitofp_4i32_to_4f64: @@ -3651,17 +3643,20 @@ ; SSE41-NEXT: addpd %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; VEX-LABEL: uitofp_load_4i32_to_2f64_2: -; VEX: # %bb.0: -; VEX-NEXT: vmovdqa (%rdi), %xmm0 -; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; VEX-NEXT: vpsrld $16, %xmm0, %xmm0 -; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1 -; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0 -; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; VEX-NEXT: retq +; AVX1-LABEL: uitofp_load_4i32_to_2f64_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_load_4i32_to_2f64_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512F-LABEL: uitofp_load_4i32_to_2f64_2: ; AVX512F: # %bb.0: @@ -3719,17 +3714,20 @@ ; SSE41-NEXT: addpd %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; VEX-LABEL: uitofp_volatile_load_4i32_to_2f64_2: -; VEX: # %bb.0: -; VEX-NEXT: vmovdqa (%rdi), %xmm0 -; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; VEX-NEXT: vpsrld $16, %xmm0, %xmm0 -; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1 -; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0 -; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; VEX-NEXT: retq +; AVX1-LABEL: uitofp_volatile_load_4i32_to_2f64_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_volatile_load_4i32_to_2f64_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512F-LABEL: uitofp_volatile_load_4i32_to_2f64_2: ; AVX512F: # %bb.0: @@ -3990,25 +3988,20 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: vorpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: uitofp_load_4i32_to_4f64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4] -; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15] +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: uitofp_load_4i32_to_4f64: diff --git a/llvm/test/CodeGen/X86/vec_uint_to_fp.ll b/llvm/test/CodeGen/X86/vec_uint_to_fp.ll --- a/llvm/test/CodeGen/X86/vec_uint_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_uint_to_fp.ll @@ -159,8 +159,8 @@ ; CHECK-LABEL: test3: ; This test used to crash because we were custom lowering it as if it was ; a conversion between <4 x i32> and <4 x float>. -; AVX: vcvtdq2pd -; AVX2: vcvtdq2pd +; AVX: vsubpd +; AVX2: vsubpd ; CHECK: retq %tmp = uitofp <4 x i32> %arg to <4 x double> ret <4 x double> %tmp diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -7287,12 +7287,12 @@ ; AVX1-LABEL: constrained_vector_uitofp_v4f64_v4i32: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1 -; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: vorpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_uitofp_v4f64_v4i32: