diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
--- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp
+++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
@@ -466,23 +466,58 @@
 
 Value *CachingVPExpander::expandPredicationToCastIntrinsic(IRBuilder<> &Builder,
                                                            VPIntrinsic &VPI) {
-  // TODO: Add anthor Cast Intrinsic, VP_TRUNC/VP_ZEXT
+  Value *CastOp = nullptr;
   switch (VPI.getIntrinsicID()) {
   default:
     llvm_unreachable("Not a VP memory intrinsic");
-  case Intrinsic::vp_inttoptr: {
-    Value *NewOp =
+  case Intrinsic::vp_sext:
+    CastOp =
+        Builder.CreateSExt(VPI.getOperand(0), VPI.getType(), VPI.getName());
+    break;
+  case Intrinsic::vp_zext:
+    CastOp =
+        Builder.CreateZExt(VPI.getOperand(0), VPI.getType(), VPI.getName());
+    break;
+  case Intrinsic::vp_trunc:
+    CastOp =
+        Builder.CreateTrunc(VPI.getOperand(0), VPI.getType(), VPI.getName());
+    break;
+  case Intrinsic::vp_inttoptr:
+    CastOp =
         Builder.CreateIntToPtr(VPI.getOperand(0), VPI.getType(), VPI.getName());
-    replaceOperation(*NewOp, VPI);
-    return NewOp;
-  }
-  case Intrinsic::vp_ptrtoint: {
-    Value *NewOp =
+    break;
+  case Intrinsic::vp_ptrtoint:
+    CastOp =
         Builder.CreatePtrToInt(VPI.getOperand(0), VPI.getType(), VPI.getName());
-    replaceOperation(*NewOp, VPI);
-    return NewOp;
-  }
+    break;
+  case Intrinsic::vp_fptosi:
+    CastOp =
+        Builder.CreateFPToSI(VPI.getOperand(0), VPI.getType(), VPI.getName());
+    break;
+
+  case Intrinsic::vp_fptoui:
+    CastOp =
+        Builder.CreateFPToUI(VPI.getOperand(0), VPI.getType(), VPI.getName());
+    break;
+  case Intrinsic::vp_sitofp:
+    CastOp =
+        Builder.CreateSIToFP(VPI.getOperand(0), VPI.getType(), VPI.getName());
+    break;
+  case Intrinsic::vp_uitofp:
+    CastOp =
+        Builder.CreateUIToFP(VPI.getOperand(0), VPI.getType(), VPI.getName());
+    break;
+  case Intrinsic::vp_fptrunc:
+    CastOp =
+        Builder.CreateFPTrunc(VPI.getOperand(0), VPI.getType(), VPI.getName());
+    break;
+  case Intrinsic::vp_fpext:
+    CastOp =
+        Builder.CreateFPExt(VPI.getOperand(0), VPI.getType(), VPI.getName());
+    break;
   }
+  replaceOperation(*CastOp, VPI);
+  return CastOp;
 }
 
 Value *
diff --git a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
--- a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
@@ -117,3 +117,458 @@
   %v = call <4 x i64> @llvm.vp.ptrtoint.v4i64.v4p0(<4 x ptr> %va, <4 x i1> %m, i32 %evl)
   ret <4 x i64> %v
 }
+
+define <4 x i32> @vsext_v4i32_v4i1(<4 x i1> %va, <4 x i1> %m, i32 zeroext %evl) {
+; X86-LABEL: vsext_v4i32_v4i1:
+; X86: # %bb.0:
+; X86-NEXT: vpslld $31, %xmm0, %xmm0
+; X86-NEXT: vpsrad $31, %xmm0, %xmm0
+; X86-NEXT: retl
+;
+; SSE-LABEL: vsext_v4i32_v4i1:
+; SSE: # %bb.0:
+; SSE-NEXT: pslld $31, %xmm0
+; SSE-NEXT: psrad $31, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: vsext_v4i32_v4i1:
+; AVX: # %bb.0:
+; AVX-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX-NEXT: retq
+  %v = call <4 x i32> @llvm.vp.sext.v4i32.v4i1(<4 x i1> %va, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+define <4 x i64> @vsext_v4i64_v4i1(<4 x i1> %va, <4 x i1> %m, i32 zeroext %evl) {
+; X86-LABEL: vsext_v4i64_v4i1:
+; X86: # %bb.0:
+; X86-NEXT: vpslld $31, %xmm0, %xmm0
+; X86-NEXT: vpsrad $31, %xmm0, %xmm0
+; X86-NEXT: vpmovsxdq %xmm0, %xmm1
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-NEXT: retl
+;
+; SSE-LABEL: vsext_v4i64_v4i1:
+; SSE: # %bb.0:
+; SSE-NEXT: pslld $31, %xmm0
+; SSE-NEXT: psrad $31, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vsext_v4i64_v4i1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vsext_v4i64_v4i1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vsext_v4i64_v4i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX512-NEXT: retq
+  %v = call <4 x i64> @llvm.vp.sext.v4i64.v4i1(<4 x i1> %va, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+define <4 x i32> @vzext_v4i32_v4i1(<4 x i1> %va, <4 x i1> %m, i32 zeroext %evl) {
+; X86-LABEL: vzext_v4i32_v4i1:
+; X86: # %bb.0:
+; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT: retl
+;
+; SSE-LABEL: vzext_v4i32_v4i1:
+; SSE: # %bb.0:
+; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vzext_v4i32_v4i1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vzext_v4i32_v4i1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
+; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vzext_v4i32_v4i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: retq
+  %v = call <4 x i32> @llvm.vp.zext.v4i32.v4i1(<4 x i1> %va, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+define <4 x i64> @vzext_v4i64_v4i1(<4 x i1> %va, <4 x i1> %m, i32 zeroext %evl) {
+; X86-LABEL: vzext_v4i64_v4i1:
+; X86: # %bb.0:
+; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-NEXT: retl
+;
+; SSE-LABEL: vzext_v4i64_v4i1:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps %xmm0, %xmm1
+; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vzext_v4i64_v4i1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vzext_v4i64_v4i1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vzext_v4i64_v4i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT: retq
+  %v = call <4 x i64> @llvm.vp.zext.v4i64.v4i1(<4 x i1> %va, <4 x i1> %m, i32 %evl)
+  ret <4 x i64> %v
+}
+
+define <4 x i1> @vtrunc_v4i1_v4i32(<4 x i32> %a, <4 x i1> %m, i32 zeroext %vl) {
+; X86-LABEL: vtrunc_v4i1_v4i32:
+; X86: # %bb.0:
+; X86-NEXT: retl
+;
+; SSE-LABEL: vtrunc_v4i1_v4i32:
+; SSE: # %bb.0:
+; SSE-NEXT: retq
+;
+; AVX-LABEL: vtrunc_v4i1_v4i32:
+; AVX: # %bb.0:
+; AVX-NEXT: retq
+  %v = call <4 x i1> @llvm.vp.trunc.v4i1.v4i32(<4 x i32> %a, <4 x i1> %m, i32 %vl)
+  ret <4 x i1> %v
+}
+
+define <4 x i1> @vtrunc_v4i1_v4i64(<4 x i64> %a, <4 x i1> %m, i32 zeroext %vl) {
+; X86-LABEL: vtrunc_v4i1_v4i64:
+; X86: # %bb.0:
+; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X86-NEXT: vzeroupper
+; X86-NEXT: retl
+;
+; SSE-LABEL: vtrunc_v4i1_v4i64:
+; SSE: # %bb.0:
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vtrunc_v4i1_v4i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vtrunc_v4i1_v4i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vtrunc_v4i1_v4i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovqd %ymm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+  %v = call <4 x i1> @llvm.vp.trunc.v4i1.v4i64(<4 x i64> %a, <4 x i1> %m, i32 %vl)
+  ret <4 x i1> %v
+}
+
+define <4 x i32> @vfptoui_v4i32_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext %evl) {
+; X86-LABEL: vfptoui_v4i32_v4f32:
+; X86: # %bb.0:
+; X86-NEXT: vcvttps2dq %xmm0, %xmm1
+; X86-NEXT: vpsrad $31, %xmm1, %xmm2
+; X86-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT: vcvttps2dq %xmm0, %xmm0
+; X86-NEXT: vpand %xmm2, %xmm0, %xmm0
+; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; SSE-LABEL: vfptoui_v4i32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: cvttps2dq %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: psrad $31, %xmm2
+; SSE-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: cvttps2dq %xmm0, %xmm0
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vfptoui_v4i32_v4f32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcvttps2dq %xmm0, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vfptoui_v4i32_v4f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; AVX2-NEXT: vsubps %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vcvttps2dq %xmm1, %xmm1
+; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vfptoui_v4i32_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcvttps2udq %xmm0, %xmm0
+; AVX512-NEXT: retq
+  %v = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f32(<4 x float> %va, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @vfptosi_v4i32_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext %evl) {
+; X86-LABEL: vfptosi_v4i32_v4f32:
+; X86: # %bb.0:
+; X86-NEXT: vcvttps2dq %xmm0, %xmm0
+; X86-NEXT: retl
+;
+; SSE-LABEL: vfptosi_v4i32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: cvttps2dq %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: vfptosi_v4i32_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX-NEXT: retq
+  %v = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f32(<4 x float> %va, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+
+define <4 x float> @vuitofp_v4f32_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
+; X86-LABEL: vuitofp_v4f32_v4i32:
+; X86: # %bb.0:
+; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; X86-NEXT: vpsrld $16, %xmm0, %xmm0
+; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; X86-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; SSE-LABEL: vuitofp_v4f32_v4i32:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: addps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vuitofp_v4f32_v4i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; AVX1-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vuitofp_v4f32_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
+; AVX2-NEXT: vsubps %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vuitofp_v4f32_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcvtudq2ps %xmm0, %xmm0
+; AVX512-NEXT: retq
+  %v = call <4 x float> @llvm.vp.uitofp.v4f32.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl)
+  ret <4 x float> %v
+}
+
+define <4 x float> @vsitofp_v4f32_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
+; X86-LABEL: vsitofp_v4f32_v4i32:
+; X86: # %bb.0:
+; X86-NEXT: vcvtdq2ps %xmm0, %xmm0
+; X86-NEXT: retl
+;
+; SSE-LABEL: vsitofp_v4f32_v4i32:
+; SSE: # %bb.0:
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: vsitofp_v4f32_v4i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT: retq
+  %v = call <4 x float> @llvm.vp.sitofp.v4f32.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl)
+  ret <4 x float> %v
+}
+
+define <2 x half> @vfptrunc_v2f16_v2f64(<2 x double> %a, <2 x i1> %m, i32 zeroext %vl) {
+; X86-LABEL: vfptrunc_v2f16_v2f64:
+; X86: # %bb.0:
+; X86-NEXT: subl $40, %esp
+; X86-NEXT: .cfi_def_cfa_offset 44
+; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: vmovlps %xmm0, (%esp)
+; X86-NEXT: calll __truncdfhf2
+; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: vmovhps %xmm0, (%esp)
+; X86-NEXT: calll __truncdfhf2
+; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-NEXT: addl $40, %esp
+; X86-NEXT: .cfi_def_cfa_offset 4
+; X86-NEXT: retl
+;
+; SSE-LABEL: vfptrunc_v2f16_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: subq $40, %rsp
+; SSE-NEXT: .cfi_def_cfa_offset 48
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: callq __truncdfhf2@PLT
+; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: callq __truncdfhf2@PLT
+; SSE-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; SSE-NEXT: addq $40, %rsp
+; SSE-NEXT: .cfi_def_cfa_offset 8
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vfptrunc_v2f16_v2f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: subq $40, %rsp
+; AVX1-NEXT: .cfi_def_cfa_offset 48
+; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: callq __truncdfhf2@PLT
+; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq __truncdfhf2@PLT
+; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX1-NEXT: addq $40, %rsp
+; AVX1-NEXT: .cfi_def_cfa_offset 8
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vfptrunc_v2f16_v2f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: subq $40, %rsp
+; AVX2-NEXT: .cfi_def_cfa_offset 48
+; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: callq __truncdfhf2@PLT
+; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-NEXT: callq __truncdfhf2@PLT
+; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX2-NEXT: addq $40, %rsp
+; AVX2-NEXT: .cfi_def_cfa_offset 8
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vfptrunc_v2f16_v2f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: subq $40, %rsp
+; AVX512-NEXT: .cfi_def_cfa_offset 48
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: callq __truncdfhf2@PLT
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: callq __truncdfhf2@PLT
+; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: callq __truncdfhf2@PLT
+; AVX512-NEXT: vpbroadcastw %xmm0, %xmm1
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4,0,0,0]
+; AVX512-NEXT: vpermi2ps (%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: addq $40, %rsp
+; AVX512-NEXT: .cfi_def_cfa_offset 8
+; AVX512-NEXT: retq
+  %v = call <2 x half> @llvm.vp.fptrunc.v2f16.v2f64(<2 x double> %a, <2 x i1> %m, i32 %vl)
+  ret <2 x half> %v
+}
+
+define <2 x double> @vfpext_v2f32_v2f64(<2 x float> %a, <2 x i1> %m, i32 zeroext %vl) {
+; X86-LABEL: vfpext_v2f32_v2f64:
+; X86: # %bb.0:
+; X86-NEXT: vcvtps2pd %xmm0, %xmm0
+; X86-NEXT: retl
+;
+; SSE-LABEL: vfpext_v2f32_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: cvtps2pd %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: vfpext_v2f32_v2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvtps2pd %xmm0, %xmm0
+; AVX-NEXT: retq
+  %v = call <2 x double> @llvm.vp.fpext.v2f64.v2f32(<2 x float> %a, <2 x i1> %m, i32 %vl)
+  ret <2 x double> %v
+}
+
+declare <4 x i32> @llvm.vp.sext.v4i32.v4i1(<4 x i1>, <4 x i1>, i32)
+declare <4 x i64> @llvm.vp.sext.v4i64.v4i1(<4 x i1>, <4 x i1>, i32)
+declare <4 x i32> @llvm.vp.zext.v4i32.v4i1(<4 x i1>, <4 x i1>, i32)
+declare <4 x i64> @llvm.vp.zext.v4i64.v4i1(<4 x i1>, <4 x i1>, i32)
+declare <4 x i1> @llvm.vp.trunc.v4i1.v4i32(<4 x i32>, <4 x i1>, i32)
+declare <4 x i1> @llvm.vp.trunc.v4i1.v4i64(<4 x i64>, <4 x i1>, i32)
+declare <4 x i32> @llvm.vp.fptoui.v4i32.v4f32(<4 x float>, <4 x i1>, i32)
+declare <4 x i32> @llvm.vp.fptosi.v4i32.v4f32(<4 x float>, <4 x i1>, i32)
+declare <4 x float> @llvm.vp.uitofp.v4f32.v4i32(<4 x i32>, <4 x i1>, i32)
+declare <4 x float> @llvm.vp.sitofp.v4f32.v4i32(<4 x i32>, <4 x i1>, i32)
+declare <2 x half> @llvm.vp.fptrunc.v2f16.v2f64(<2 x double>, <2 x i1>, i32)
+declare <2 x double> @llvm.vp.fpext.v2f64.v2f32(<2 x float>, <2 x i1>, i32)