diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -10327,6 +10327,46 @@
   return EmitX86CpuIs(CPUStr);
 }
 
+// Convert F16 halves to floats. Ops is {src}, {src, passthru, mask} or
+// {src, passthru, mask, rounding}; the mask selects result vs. passthru lanes.
+static Value *EmitX86CvtF16ToFloatExpr(CodeGenFunction &CGF,
+                                       ArrayRef<Value *> Ops,
+                                       llvm::Type *DstTy) {
+  assert((Ops.size() == 1 || Ops.size() == 3 || Ops.size() == 4) &&
+         "Unknown cvtph2ps intrinsic");
+
+  // If the SAE intrinsic doesn't use default rounding then we can't upgrade.
+  if (Ops.size() == 4 && cast<llvm::ConstantInt>(Ops[3])->getZExtValue() != 4) {
+    // This intrinsic is not overloaded, so it takes no overload type list.
+    Intrinsic::ID IID = Intrinsic::x86_avx512_mask_vcvtph2ps_512;
+    Function *F = CGF.CGM.getIntrinsic(IID);
+    return CGF.Builder.CreateCall(F, {Ops[0], Ops[1], Ops[2], Ops[3]});
+  }
+
+  unsigned NumDstElts = DstTy->getVectorNumElements();
+  Value *Src = Ops[0];
+
+  // Extract the subvector.
+  if (NumDstElts != Src->getType()->getVectorNumElements()) {
+    assert(NumDstElts == 4 && "Unexpected vector size");
+    uint32_t ShuffleMask[4] = {0, 1, 2, 3};
+    Src = CGF.Builder.CreateShuffleVector(Src, UndefValue::get(Src->getType()),
+                                          ShuffleMask);
+  }
+
+  // Bitcast from vXi16 to vXf16.
+  llvm::Type *HalfTy = llvm::VectorType::get(
+      llvm::Type::getHalfTy(CGF.getLLVMContext()), NumDstElts);
+  Src = CGF.Builder.CreateBitCast(Src, HalfTy);
+
+  // Perform the fp-extension.
+  Value *Res = CGF.Builder.CreateFPExt(Src, DstTy, "cvtph2ps");
+
+  if (Ops.size() >= 3)
+    Res = EmitX86Select(CGF, Ops[2], Res, Ops[1]);
+  return Res;
+}
+
 // Convert a BF16 to a float.
static Value *EmitX86CvtBF16ToFloatExpr(CodeGenFunction &CGF, const CallExpr *E, @@ -12531,6 +12571,14 @@ case X86::BI__builtin_ia32_cmpordsd: return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 7); + // f16c half2float intrinsics + case X86::BI__builtin_ia32_vcvtph2ps: + case X86::BI__builtin_ia32_vcvtph2ps256: + case X86::BI__builtin_ia32_vcvtph2ps_mask: + case X86::BI__builtin_ia32_vcvtph2ps256_mask: + case X86::BI__builtin_ia32_vcvtph2ps512_mask: + return EmitX86CvtF16ToFloatExpr(*this, Ops, ConvertType(E->getType())); + // AVX512 bf16 intrinsics case X86::BI__builtin_ia32_cvtneps2bf16_128_mask: { Ops[2] = getMaskVecValue(*this, Ops[2], diff --git a/clang/test/CodeGen/avx512f-builtins-constrained.c b/clang/test/CodeGen/avx512f-builtins-constrained.c --- a/clang/test/CodeGen/avx512f-builtins-constrained.c +++ b/clang/test/CodeGen/avx512f-builtins-constrained.c @@ -171,21 +171,32 @@ __m512 test_mm512_cvtph_ps (__m256i __A) { // COMMON-LABEL: test_mm512_cvtph_ps - // COMMONIR: @llvm.x86.avx512.mask.vcvtph2ps.512 + // COMMONIR: bitcast <4 x i64> %{{.*}} to <16 x i16> + // COMMONIR: bitcast <16 x i16> %{{.*}} to <16 x half> + // UNCONSTRAINED: fpext <16 x half> %{{.*}} to <16 x float> + // CONSTRAINED: call <16 x float> @llvm.experimental.constrained.fpext.v16f32.v16f16(<16 x half> %{{.*}}, metadata !"fpexcept.strict") return _mm512_cvtph_ps (__A); } __m512 test_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A) { // COMMON-LABEL: test_mm512_mask_cvtph_ps - // COMMONIR: @llvm.x86.avx512.mask.vcvtph2ps.512 + // COMMONIR: bitcast <4 x i64> %{{.*}} to <16 x i16> + // COMMONIR: bitcast <16 x i16> %{{.*}} to <16 x half> + // UNCONSTRAINED: fpext <16 x half> %{{.*}} to <16 x float> + // CONSTRAINED: call <16 x float> @llvm.experimental.constrained.fpext.v16f32.v16f16(<16 x half> %{{.*}}, metadata !"fpexcept.strict") + // COMMONIR: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_cvtph_ps (__W,__U,__A); } __m512 
test_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A) { // COMMON-LABEL: test_mm512_maskz_cvtph_ps - // COMMONIR: @llvm.x86.avx512.mask.vcvtph2ps.512 + // COMMONIR: bitcast <4 x i64> %{{.*}} to <16 x i16> + // COMMONIR: bitcast <16 x i16> %{{.*}} to <16 x half> + // UNCONSTRAINED: fpext <16 x half> %{{.*}} to <16 x float> + // CONSTRAINED: call <16 x float> @llvm.experimental.constrained.fpext.v16f32.v16f16(<16 x half> %{{.*}}, metadata !"fpexcept.strict") + // COMMONIR: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_cvtph_ps (__U,__A); } diff --git a/clang/test/CodeGen/avx512f-builtins.c b/clang/test/CodeGen/avx512f-builtins.c --- a/clang/test/CodeGen/avx512f-builtins.c +++ b/clang/test/CodeGen/avx512f-builtins.c @@ -9463,21 +9463,29 @@ __m512 test_mm512_cvtph_ps (__m256i __A) { // CHECK-LABEL: @test_mm512_cvtph_ps - // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.512 + // CHECK: bitcast <4 x i64> %{{.*}} to <16 x i16> + // CHECK: bitcast <16 x i16> %{{.*}} to <16 x half> + // CHECK: fpext <16 x half> %{{.*}} to <16 x float> return _mm512_cvtph_ps (__A); } __m512 test_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A) { // CHECK-LABEL: @test_mm512_mask_cvtph_ps - // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.512 + // CHECK: bitcast <4 x i64> %{{.*}} to <16 x i16> + // CHECK: bitcast <16 x i16> %{{.*}} to <16 x half> + // CHECK: fpext <16 x half> %{{.*}} to <16 x float> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_cvtph_ps (__W,__U,__A); } __m512 test_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A) { // CHECK-LABEL: @test_mm512_maskz_cvtph_ps - // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.512 + // CHECK: bitcast <4 x i64> %{{.*}} to <16 x i16> + // CHECK: bitcast <16 x i16> %{{.*}} to <16 x half> + // CHECK: fpext <16 x half> %{{.*}} to <16 x float> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_cvtph_ps (__U,__A); } 
diff --git a/clang/test/CodeGen/avx512vl-builtins-constrained.c b/clang/test/CodeGen/avx512vl-builtins-constrained.c --- a/clang/test/CodeGen/avx512vl-builtins-constrained.c +++ b/clang/test/CodeGen/avx512vl-builtins-constrained.c @@ -8,25 +8,43 @@ __m128 test_mm_mask_cvtph_ps(__m128 __W, __mmask8 __U, __m128i __A) { // COMMON-LABEL: @test_mm_mask_cvtph_ps - // COMMONIR: @llvm.x86.avx512.mask.vcvtph2ps.128 + // COMMONIR: bitcast <2 x i64> %{{.*}} to <8 x i16> + // COMMONIR: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <4 x i32> + // COMMONIR: bitcast <4 x i16> %{{.*}} to <4 x half> + // UNCONSTRAINED: fpext <4 x half> %{{.*}} to <4 x float> + // CONSTRAINED: call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %{{.*}}, metadata !"fpexcept.strict") + // COMMONIR: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_cvtph_ps(__W, __U, __A); } __m128 test_mm_maskz_cvtph_ps(__mmask8 __U, __m128i __A) { // COMMON-LABEL: @test_mm_maskz_cvtph_ps - // COMMONIR: @llvm.x86.avx512.mask.vcvtph2ps.128 + // COMMONIR: bitcast <2 x i64> %{{.*}} to <8 x i16> + // COMMONIR: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <4 x i32> + // COMMONIR: bitcast <4 x i16> %{{.*}} to <4 x half> + // UNCONSTRAINED: fpext <4 x half> %{{.*}} to <4 x float> + // CONSTRAINED: call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %{{.*}}, metadata !"fpexcept.strict") + // COMMONIR: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_cvtph_ps(__U, __A); } __m256 test_mm256_mask_cvtph_ps(__m256 __W, __mmask8 __U, __m128i __A) { // COMMON-LABEL: @test_mm256_mask_cvtph_ps - // COMMONIR: @llvm.x86.avx512.mask.vcvtph2ps.256 + // COMMONIR: bitcast <2 x i64> %{{.*}} to <8 x i16> + // COMMONIR: bitcast <8 x i16> %{{.*}} to <8 x half> + // UNCONSTRAINED: fpext <8 x half> %{{.*}} to <8 x float> + // CONSTRAINED: call <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16(<8 x half> %{{.*}}, 
metadata !"fpexcept.strict") + // COMMONIR: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_cvtph_ps(__W, __U, __A); } __m256 test_mm256_maskz_cvtph_ps(__mmask8 __U, __m128i __A) { // COMMON-LABEL: @test_mm256_maskz_cvtph_ps - // COMMONIR: @llvm.x86.avx512.mask.vcvtph2ps.256 + // COMMONIR: bitcast <2 x i64> %{{.*}} to <8 x i16> + // COMMONIR: bitcast <8 x i16> %{{.*}} to <8 x half> + // UNCONSTRAINED: fpext <8 x half> %{{.*}} to <8 x float> + // CONSTRAINED: call <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16(<8 x half> %{{.*}}, metadata !"fpexcept.strict") + // COMMONIR: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_cvtph_ps(__U, __A); } diff --git a/clang/test/CodeGen/avx512vl-builtins.c b/clang/test/CodeGen/avx512vl-builtins.c --- a/clang/test/CodeGen/avx512vl-builtins.c +++ b/clang/test/CodeGen/avx512vl-builtins.c @@ -9692,25 +9692,39 @@ __m128 test_mm_mask_cvtph_ps(__m128 __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm_mask_cvtph_ps - // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.128 + // CHECK: bitcast <2 x i64> %{{.*}} to <8 x i16> + // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <4 x i32> + // CHECK: bitcast <4 x i16> %{{.*}} to <4 x half> + // CHECK: fpext <4 x half> %{{.*}} to <4 x float> + // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_cvtph_ps(__W, __U, __A); } __m128 test_mm_maskz_cvtph_ps(__mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm_maskz_cvtph_ps - // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.128 + // CHECK: bitcast <2 x i64> %{{.*}} to <8 x i16> + // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <4 x i32> + // CHECK: bitcast <4 x i16> %{{.*}} to <4 x half> + // CHECK: fpext <4 x half> %{{.*}} to <4 x float> + // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_cvtph_ps(__U, __A); } __m256 test_mm256_mask_cvtph_ps(__m256 __W, __mmask8 __U, 
__m128i __A) { // CHECK-LABEL: @test_mm256_mask_cvtph_ps - // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.256 + // CHECK: bitcast <2 x i64> %{{.*}} to <8 x i16> + // CHECK: bitcast <8 x i16> %{{.*}} to <8 x half> + // CHECK: fpext <8 x half> %{{.*}} to <8 x float> + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_cvtph_ps(__W, __U, __A); } __m256 test_mm256_maskz_cvtph_ps(__mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm256_maskz_cvtph_ps - // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.256 + // CHECK: bitcast <2 x i64> %{{.*}} to <8 x i16> + // CHECK: bitcast <8 x i16> %{{.*}} to <8 x half> + // CHECK: fpext <8 x half> %{{.*}} to <8 x float> + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_cvtph_ps(__U, __A); } diff --git a/clang/test/CodeGen/f16c-builtins-constrained.c b/clang/test/CodeGen/f16c-builtins-constrained.c --- a/clang/test/CodeGen/f16c-builtins-constrained.c +++ b/clang/test/CodeGen/f16c-builtins-constrained.c @@ -13,7 +13,9 @@ // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 5 // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 6 // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 7 - // CHECK: call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %{{.*}}) + // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <4 x i32> + // CHECK: bitcast <4 x i16> %{{.*}} to <4 x half> + // CHECK: call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %{{.*}}, metadata !"fpexcept.strict") // CHECK: extractelement <4 x float> %{{.*}}, i32 0 return _cvtsh_ss(a); } @@ -34,13 +36,18 @@ __m128 test_mm_cvtph_ps(__m128i a) { // CHECK-LABEL: test_mm_cvtph_ps - // CHECK: call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %{{.*}}) + // CHECK: bitcast <2 x i64> %{{.*}} to <8 x i16> + // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <4 x i32> + // CHECK: bitcast <4 x i16> %{{.*}} to <4 x half> + // CHECK: call <4 x float> 
@llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %{{.*}}, metadata !"fpexcept.strict") return _mm_cvtph_ps(a); } __m256 test_mm256_cvtph_ps(__m128i a) { // CHECK-LABEL: test_mm256_cvtph_ps - // CHECK: call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %{{.*}}) + // CHECK: bitcast <2 x i64> %{{.*}} to <8 x i16> + // CHECK: bitcast <8 x i16> %{{.*}} to <8 x half> + // CHECK: call <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16(<8 x half> %{{.*}}, metadata !"fpexcept.strict") return _mm256_cvtph_ps(a); } diff --git a/clang/test/CodeGen/f16c-builtins.c b/clang/test/CodeGen/f16c-builtins.c --- a/clang/test/CodeGen/f16c-builtins.c +++ b/clang/test/CodeGen/f16c-builtins.c @@ -13,7 +13,9 @@ // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 5 // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 6 // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 7 - // CHECK: call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %{{.*}}) + // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <4 x i32> + // CHECK: bitcast <4 x i16> %{{.*}} to <4 x half> + // CHECK: fpext <4 x half> %{{.*}} to <4 x float> // CHECK: extractelement <4 x float> %{{.*}}, i32 0 return _cvtsh_ss(a); } @@ -31,13 +33,18 @@ __m128 test_mm_cvtph_ps(__m128i a) { // CHECK-LABEL: test_mm_cvtph_ps - // CHECK: call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %{{.*}}) + // CHECK: bitcast <2 x i64> %{{.*}} to <8 x i16> + // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <4 x i32> + // CHECK: bitcast <4 x i16> %{{.*}} to <4 x half> + // CHECK: fpext <4 x half> %{{.*}} to <4 x float> return _mm_cvtph_ps(a); } __m256 test_mm256_cvtph_ps(__m128i a) { // CHECK-LABEL: test_mm256_cvtph_ps - // CHECK: call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %{{.*}}) + // CHECK: bitcast <2 x i64> %{{.*}} to <8 x i16> + // CHECK: bitcast <8 x i16> %{{.*}} to <8 x half> + // CHECK: fpext <8 x half> %{{.*}} to <8 x float> return _mm256_cvtph_ps(a); } diff --git 
a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -2546,26 +2546,16 @@ // Half float conversion let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_vcvtph2ps_128 : GCCBuiltin<"__builtin_ia32_vcvtph2ps">, - Intrinsic<[llvm_v4f32_ty], [llvm_v8i16_ty], [IntrNoMem]>; - def int_x86_vcvtph2ps_256 : GCCBuiltin<"__builtin_ia32_vcvtph2ps256">, - Intrinsic<[llvm_v8f32_ty], [llvm_v8i16_ty], [IntrNoMem]>; def int_x86_vcvtps2ph_128 : GCCBuiltin<"__builtin_ia32_vcvtps2ph">, Intrinsic<[llvm_v8i16_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>; def int_x86_vcvtps2ph_256 : GCCBuiltin<"__builtin_ia32_vcvtps2ph256">, Intrinsic<[llvm_v8i16_ty], [llvm_v8f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>; - def int_x86_avx512_mask_vcvtph2ps_512 : GCCBuiltin<"__builtin_ia32_vcvtph2ps512_mask">, + def int_x86_avx512_mask_vcvtph2ps_512 : Intrinsic<[llvm_v16f32_ty], [llvm_v16i16_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<3>]>; - def int_x86_avx512_mask_vcvtph2ps_256 : GCCBuiltin<"__builtin_ia32_vcvtph2ps256_mask">, - Intrinsic<[llvm_v8f32_ty], [llvm_v8i16_ty, llvm_v8f32_ty, - llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_vcvtph2ps_128 : GCCBuiltin<"__builtin_ia32_vcvtph2ps_mask">, - Intrinsic<[llvm_v4f32_ty], [llvm_v8i16_ty, llvm_v4f32_ty, - llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_vcvtps2ph_512 : GCCBuiltin<"__builtin_ia32_vcvtps2ph512_mask">, Intrinsic<[llvm_v16i16_ty], [llvm_v16f32_ty, llvm_i32_ty, llvm_v16i16_ty, llvm_i16_ty], diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -204,6 +204,8 @@ Name.startswith("avx512.mask.cvtqq2pd.") || // Added in 7.0 updated 9.0 Name.startswith("avx512.mask.cvtuqq2pd.") || // Added in 7.0 updated 9.0 Name.startswith("avx512.mask.cvtdq2ps.") || // Added in 7.0 
updated 9.0 + Name == "avx512.mask.vcvtph2ps.128" || // Added in 11.0 + Name == "avx512.mask.vcvtph2ps.256" || // Added in 11.0 Name == "avx512.mask.cvtqq2ps.256" || // Added in 9.0 Name == "avx512.mask.cvtqq2ps.512" || // Added in 9.0 Name == "avx512.mask.cvtuqq2ps.256" || // Added in 9.0 @@ -316,6 +318,7 @@ Name == "avx.cvtdq2.pd.256" || // Added in 3.9 Name == "avx.cvtdq2.ps.256" || // Added in 7.0 Name == "avx.cvt.ps2.pd.256" || // Added in 3.9 + Name.startswith("vcvtph2ps.") || // Added in 11.0 Name.startswith("avx.vinsertf128.") || // Added in 3.7 Name == "avx2.vinserti128" || // Added in 3.7 Name.startswith("avx512.mask.insert") || // Added in 4.0 @@ -2132,6 +2135,23 @@ : Builder.CreateSIToFP(Rep, DstTy, "cvt"); } + if (CI->getNumArgOperands() >= 3) + Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, + CI->getArgOperand(1)); + } else if (IsX86 && (Name.startswith("avx512.mask.vcvtph2ps.") || + Name.startswith("vcvtph2ps."))) { + Type *DstTy = CI->getType(); + Rep = CI->getArgOperand(0); + Type *SrcTy = Rep->getType(); + unsigned NumDstElts = DstTy->getVectorNumElements(); + if (NumDstElts != SrcTy->getVectorNumElements()) { + assert(NumDstElts == 4 && "Unexpected vector size"); + uint32_t ShuffleMask[4] = {0, 1, 2, 3}; + Rep = Builder.CreateShuffleVector(Rep, Rep, ShuffleMask); + } + Rep = Builder.CreateBitCast( + Rep, VectorType::get(Type::getHalfTy(C), NumDstElts)); + Rep = Builder.CreateFPExt(Rep, DstTy, "cvtph2ps"); if (CI->getNumArgOperands() >= 3) Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -783,10 +783,6 @@ X86ISD::FSUBS, X86ISD::FSUBS_RND), X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK, X86ISD::FSUBS, X86ISD::FSUBS_RND), - X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK, - 
X86ISD::CVTPH2PS, 0), - X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK, - X86ISD::CVTPH2PS, 0), X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_SAE, X86ISD::CVTPH2PS, X86ISD::CVTPH2PS_SAE), X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, CVTPS2PH_MASK, @@ -1108,8 +1104,6 @@ X86_INTRINSIC_DATA(subborrow_64, ADX, X86ISD::SBB, X86ISD::SUB), X86_INTRINSIC_DATA(tbm_bextri_u32, BEXTRI, X86ISD::BEXTR, 0), X86_INTRINSIC_DATA(tbm_bextri_u64, BEXTRI, X86ISD::BEXTR, 0), - X86_INTRINSIC_DATA(vcvtph2ps_128, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0), - X86_INTRINSIC_DATA(vcvtph2ps_256, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0), X86_INTRINSIC_DATA(vcvtps2ph_128, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0), X86_INTRINSIC_DATA(vcvtps2ph_256, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0), diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -2543,50 +2543,6 @@ } break; - case Intrinsic::x86_vcvtph2ps_128: - case Intrinsic::x86_vcvtph2ps_256: { - auto Arg = II->getArgOperand(0); - auto ArgType = cast(Arg->getType()); - auto RetType = cast(II->getType()); - unsigned ArgWidth = ArgType->getNumElements(); - unsigned RetWidth = RetType->getNumElements(); - assert(RetWidth <= ArgWidth && "Unexpected input/return vector widths"); - assert(ArgType->isIntOrIntVectorTy() && - ArgType->getScalarSizeInBits() == 16 && - "CVTPH2PS input type should be 16-bit integer vector"); - assert(RetType->getScalarType()->isFloatTy() && - "CVTPH2PS output type should be 32-bit float vector"); - - // Constant folding: Convert to generic half to single conversion. 
- if (isa(Arg)) - return replaceInstUsesWith(*II, ConstantAggregateZero::get(RetType)); - - if (isa(Arg)) { - auto VectorHalfAsShorts = Arg; - if (RetWidth < ArgWidth) { - SmallVector SubVecMask; - for (unsigned i = 0; i != RetWidth; ++i) - SubVecMask.push_back((int)i); - VectorHalfAsShorts = Builder.CreateShuffleVector( - Arg, UndefValue::get(ArgType), SubVecMask); - } - - auto VectorHalfType = - VectorType::get(Type::getHalfTy(II->getContext()), RetWidth); - auto VectorHalfs = - Builder.CreateBitCast(VectorHalfAsShorts, VectorHalfType); - auto VectorFloats = Builder.CreateFPExt(VectorHalfs, RetType); - return replaceInstUsesWith(*II, VectorFloats); - } - - // We only use the lowest lanes of the argument. - if (Value *V = SimplifyDemandedVectorEltsLow(Arg, ArgWidth, RetWidth)) { - II->setArgOperand(0, V); - return II; - } - break; - } - case Intrinsic::x86_sse_cvtss2si: case Intrinsic::x86_sse_cvtss2si64: case Intrinsic::x86_sse_cvttss2si: diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -1907,6 +1907,62 @@ ret <4 x float> %vecins.i } +define <16 x float> @test_mm512_cvtph_ps(<4 x i64> %__A) { +; CHECK-LABEL: test_mm512_cvtph_ps: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtph2ps %ymm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} +entry: + %0 = bitcast <4 x i64> %__A to <16 x i16> + %1 = bitcast <16 x i16> %0 to <16 x half> + %2 = fpext <16 x half> %1 to <16 x float> + ret <16 x float> %2 +} + +define <16 x float> @test_mm512_mask_cvtph_ps(<16 x float> %__W, i16 zeroext %__U, <4 x i64> %__A) { +; X86-LABEL: test_mm512_mask_cvtph_ps: +; X86: # %bb.0: # %entry +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vcvtph2ps %ymm1, %zmm0 {%k1} +; X86-NEXT: retl +; +; X64-LABEL: test_mm512_mask_cvtph_ps: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, 
%k1 +; X64-NEXT: vcvtph2ps %ymm1, %zmm0 {%k1} +; X64-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__A to <16 x i16> + %1 = bitcast <16 x i16> %0 to <16 x half> + %2 = bitcast i16 %__U to <16 x i1> + %3 = fpext <16 x half> %1 to <16 x float> + %4 = select <16 x i1> %2, <16 x float> %3, <16 x float> %__W + ret <16 x float> %4 +} + +define <16 x float> @test_mm512_maskz_cvtph_ps(i16 zeroext %__U, <4 x i64> %__A) { +; X86-LABEL: test_mm512_maskz_cvtph_ps: +; X86: # %bb.0: # %entry +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z} +; X86-NEXT: retl +; +; X64-LABEL: test_mm512_maskz_cvtph_ps: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z} +; X64-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__A to <16 x i16> + %1 = bitcast <16 x i16> %0 to <16 x half> + %2 = bitcast i16 %__U to <16 x i1> + %3 = fpext <16 x half> %1 to <16 x float> + %4 = select <16 x i1> %2, <16 x float> %3, <16 x float> zeroinitializer + ret <16 x float> %4 +} + define <8 x double> @test_mm512_cvtps_pd(<8 x float> %__A) { ; CHECK-LABEL: test_mm512_cvtps_pd: ; CHECK: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -4512,6 +4512,76 @@ ret <8 x double> %res2 } +define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) { +; CHECK-LABEL: test_x86_vcvtph2ps_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vcvtph2ps %ymm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x13,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_x86_vcvtph2ps_512_sae(<16 x i16> %a0) { +; CHECK-LABEL: test_x86_vcvtph2ps_512_sae: +; CHECK: ## %bb.0: 
+; CHECK-NEXT: vcvtph2ps {sae}, %ymm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x13,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_x86_vcvtph2ps_512_rrk(<16 x i16> %a0,<16 x float> %a1, i16 %mask) { +; X86-LABEL: test_x86_vcvtph2ps_512_rrk: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2ps %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x13,0xc8] +; X86-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_x86_vcvtph2ps_512_rrk: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vcvtph2ps %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x13,0xc8] +; X64-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> %a1, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_x86_vcvtph2ps_512_sae_rrkz(<16 x i16> %a0, i16 %mask) { +; X86-LABEL: test_x86_vcvtph2ps_512_sae_rrkz: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2ps {sae}, %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x99,0x13,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_x86_vcvtph2ps_512_sae_rrkz: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vcvtph2ps {sae}, %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x99,0x13,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 8) + ret <16 x float> %res 
+} + +define <16 x float> @test_x86_vcvtph2ps_512_rrkz(<16 x i16> %a0, i16 %mask) { +; X86-LABEL: test_x86_vcvtph2ps_512_rrkz: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x13,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_x86_vcvtph2ps_512_rrkz: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x13,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4) + ret <16 x float> %res +} + +declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly + define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) { ; CHECK-LABEL: test_valign_q: ; CHECK: ## %bb.0: @@ -4633,14 +4703,14 @@ ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15] ; X86-NEXT: ## encoding: [0x62,0xf2,0x7d,0x49,0x0c,0x15,A,A,A,A] -; X86-NEXT: ## fixup A - offset: 6, value: LCPI211_0, kind: FK_Data_4 +; X86-NEXT: ## fixup A - offset: 6, value: LCPI216_0, kind: FK_Data_4 ; X86-NEXT: vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] ; X86-NEXT: ## encoding: [0x62,0xf2,0x7d,0xc9,0x0c,0x0d,A,A,A,A] -; X86-NEXT: ## fixup A - offset: 6, value: LCPI211_1, kind: FK_Data_4 +; X86-NEXT: ## fixup A - offset: 6, value: LCPI216_1, kind: FK_Data_4 ; X86-NEXT: vaddps %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc9] ; X86-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12] ; X86-NEXT: ## encoding: [0x62,0xf2,0x7d,0x48,0x0c,0x05,A,A,A,A] -; X86-NEXT: ## fixup A - offset: 6, value: LCPI211_2, kind: 
FK_Data_4 +; X86-NEXT: ## fixup A - offset: 6, value: LCPI216_2, kind: FK_Data_4 ; X86-NEXT: vaddps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; @@ -4649,14 +4719,14 @@ ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15] ; X64-NEXT: ## encoding: [0x62,0xf2,0x7d,0x49,0x0c,0x15,A,A,A,A] -; X64-NEXT: ## fixup A - offset: 6, value: LCPI211_0-4, kind: reloc_riprel_4byte +; X64-NEXT: ## fixup A - offset: 6, value: LCPI216_0-4, kind: reloc_riprel_4byte ; X64-NEXT: vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] ; X64-NEXT: ## encoding: [0x62,0xf2,0x7d,0xc9,0x0c,0x0d,A,A,A,A] -; X64-NEXT: ## fixup A - offset: 6, value: LCPI211_1-4, kind: reloc_riprel_4byte +; X64-NEXT: ## fixup A - offset: 6, value: LCPI216_1-4, kind: reloc_riprel_4byte ; X64-NEXT: vaddps %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc9] ; X64-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12] ; X64-NEXT: ## encoding: [0x62,0xf2,0x7d,0x48,0x0c,0x05,A,A,A,A] -; X64-NEXT: ## fixup A - offset: 6, value: LCPI211_2-4, kind: reloc_riprel_4byte +; X64-NEXT: ## fixup A - offset: 6, value: LCPI216_2-4, kind: reloc_riprel_4byte ; X64-NEXT: vaddps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> %x2, i16 %x3) diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -1007,76 +1007,6 @@ } declare i32 @llvm.x86.avx512.vcvtss2si32(<4 x float>, i32) nounwind readnone -define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) { -; CHECK-LABEL: test_x86_vcvtph2ps_512: -; CHECK: # %bb.0: -; 
CHECK-NEXT: vcvtph2ps %ymm0, %zmm0 -; CHECK-NEXT: ret{{[l|q]}} - %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 4) - ret <16 x float> %res -} - -define <16 x float> @test_x86_vcvtph2ps_512_sae(<16 x i16> %a0) { -; CHECK-LABEL: test_x86_vcvtph2ps_512_sae: -; CHECK: # %bb.0: -; CHECK-NEXT: vcvtph2ps {sae}, %ymm0, %zmm0 -; CHECK-NEXT: ret{{[l|q]}} - %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) - ret <16 x float> %res -} - -define <16 x float> @test_x86_vcvtph2ps_512_rrk(<16 x i16> %a0,<16 x float> %a1, i16 %mask) { -; X64-LABEL: test_x86_vcvtph2ps_512_rrk: -; X64: # %bb.0: -; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vcvtph2ps %ymm0, %zmm1 {%k1} -; X64-NEXT: vmovaps %zmm1, %zmm0 -; X64-NEXT: retq -; -; X86-LABEL: test_x86_vcvtph2ps_512_rrk: -; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vcvtph2ps %ymm0, %zmm1 {%k1} -; X86-NEXT: vmovaps %zmm1, %zmm0 -; X86-NEXT: retl - %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> %a1, i16 %mask, i32 4) - ret <16 x float> %res -} - -define <16 x float> @test_x86_vcvtph2ps_512_sae_rrkz(<16 x i16> %a0, i16 %mask) { -; X64-LABEL: test_x86_vcvtph2ps_512_sae_rrkz: -; X64: # %bb.0: -; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vcvtph2ps {sae}, %ymm0, %zmm0 {%k1} {z} -; X64-NEXT: retq -; -; X86-LABEL: test_x86_vcvtph2ps_512_sae_rrkz: -; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vcvtph2ps {sae}, %ymm0, %zmm0 {%k1} {z} -; X86-NEXT: retl - %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 8) - ret <16 x float> %res -} - -define <16 x float> @test_x86_vcvtph2ps_512_rrkz(<16 x i16> %a0, i16 %mask) { -; X64-LABEL: test_x86_vcvtph2ps_512_rrkz: -; X64: # %bb.0: -; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z} -; X64-NEXT: 
retq -; -; X86-LABEL: test_x86_vcvtph2ps_512_rrkz: -; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z} -; X86-NEXT: retl - %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4) - ret <16 x float> %res -} - -declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly - define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 %mask, <16 x i16> * %dst) { ; X64-LABEL: test_x86_vcvtps2ph_256: ; X64: # %bb.0: diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll @@ -365,6 +365,98 @@ ret <2 x i64> %1 } +define <4 x float> @test_mm_mask_cvtph_ps(<4 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) { +; X86-LABEL: test_mm_mask_cvtph_ps: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vcvtph2ps %xmm1, %xmm0 {%k1} +; X86-NEXT: retl +; +; X64-LABEL: test_mm_mask_cvtph_ps: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vcvtph2ps %xmm1, %xmm0 {%k1} +; X64-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__A to <8 x i16> + %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <4 x i32> + %2 = bitcast <4 x i16> %1 to <4 x half> + %3 = bitcast i8 %__U to <8 x i1> + %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %5 = fpext <4 x half> %2 to <4 x float> + %6 = select <4 x i1> %4, <4 x float> %5, <4 x float> %__W + ret <4 x float> %6 +} + +define <4 x float> @test_mm_maskz_cvtph_ps(i8 zeroext %__U, <2 x i64> %__A) { +; X86-LABEL: test_mm_maskz_cvtph_ps: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vcvtph2ps %xmm0, %xmm0 {%k1} {z} +; X86-NEXT: retl +; +; 
X64-LABEL: test_mm_maskz_cvtph_ps: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vcvtph2ps %xmm0, %xmm0 {%k1} {z} +; X64-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__A to <8 x i16> + %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <4 x i32> + %2 = bitcast <4 x i16> %1 to <4 x half> + %3 = bitcast i8 %__U to <8 x i1> + %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %5 = fpext <4 x half> %2 to <4 x float> + %6 = select <4 x i1> %4, <4 x float> %5, <4 x float> zeroinitializer + ret <4 x float> %6 +} + +define <8 x float> @test_mm256_mask_cvtph_ps(<8 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) { +; X86-LABEL: test_mm256_mask_cvtph_ps: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vcvtph2ps %xmm1, %ymm0 {%k1} +; X86-NEXT: retl +; +; X64-LABEL: test_mm256_mask_cvtph_ps: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vcvtph2ps %xmm1, %ymm0 {%k1} +; X64-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__A to <8 x i16> + %1 = bitcast <8 x i16> %0 to <8 x half> + %2 = bitcast i8 %__U to <8 x i1> + %3 = fpext <8 x half> %1 to <8 x float> + %4 = select <8 x i1> %2, <8 x float> %3, <8 x float> %__W + ret <8 x float> %4 +} + +define <8 x float> @test_mm256_maskz_cvtph_ps(i8 zeroext %__U, <2 x i64> %__A) { +; X86-LABEL: test_mm256_maskz_cvtph_ps: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vcvtph2ps %xmm0, %ymm0 {%k1} {z} +; X86-NEXT: retl +; +; X64-LABEL: test_mm256_maskz_cvtph_ps: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vcvtph2ps %xmm0, %ymm0 {%k1} {z} +; X64-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__A to <8 x i16> + %1 = bitcast <8 x i16> %0 to <8 x half> + %2 = bitcast i8 %__U to <8 x i1> + %3 = fpext <8 x half> %1 to <8 x float> + %4 = select <8 x i1> %2, <8 x float> %3, <8 x float> zeroinitializer + ret <8 x float> %4 +} + define <2 x i64> 
@test_mm_mask_cvtps_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) { ; X86-LABEL: test_mm_mask_cvtps_epi32: ; X86: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -10247,6 +10247,100 @@ ret <8 x float> %res2 } +define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) { +; CHECK-LABEL: test_x86_vcvtph2ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> zeroinitializer, i8 -1) + ret <4 x float> %res +} + +define <4 x float> @test_x86_vcvtph2ps_128_rrk(<8 x i16> %a0,<4 x float> %a1, i8 %mask) { +; X86-LABEL: test_x86_vcvtph2ps_128_rrk: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vcvtph2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x13,0xc8] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_vcvtph2ps_128_rrk: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vcvtph2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x13,0xc8] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> %a1, i8 %mask) + ret <4 x float> %res +} + +define <4 x float> @test_x86_vcvtph2ps_128_rrkz(<8 x i16> %a0, i8 %mask) { +; X86-LABEL: test_x86_vcvtph2ps_128_rrkz: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), 
%eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vcvtph2ps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x13,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_vcvtph2ps_128_rrkz: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vcvtph2ps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x13,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> zeroinitializer, i8 %mask) + ret <4 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16>, <4 x float>, i8) nounwind readonly + +define <8 x float> @test_x86_vcvtph2ps_256(<8 x i16> %a0) { +; CHECK-LABEL: test_x86_vcvtph2ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2ps %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> zeroinitializer, i8 -1) + ret <8 x float> %res +} + +define <8 x float> @test_x86_vcvtph2ps_256_rrk(<8 x i16> %a0,<8 x float> %a1, i8 %mask) { +; X86-LABEL: test_x86_vcvtph2ps_256_rrk: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vcvtph2ps %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x13,0xc8] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_vcvtph2ps_256_rrk: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vcvtph2ps %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x13,0xc8] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + 
%res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> %a1, i8 %mask) + ret <8 x float> %res +} + +define <8 x float> @test_x86_vcvtph2ps_256_rrkz(<8 x i16> %a0, i8 %mask) { +; X86-LABEL: test_x86_vcvtph2ps_256_rrkz: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vcvtph2ps %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x13,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_vcvtph2ps_256_rrkz: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vcvtph2ps %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x13,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> zeroinitializer, i8 %mask) + ret <8 x float> %res +} + +declare <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16>, <8 x float>, i8) nounwind readonly + declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.256(<4 x double>, <4 x i32>, i8) define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) { diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -4214,101 +4214,6 @@ ret <4 x i64> %res2 } -define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) { -; CHECK-LABEL: test_x86_vcvtph2ps_128: -; CHECK: # %bb.0: -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> zeroinitializer, i8 -1) - ret <4 x float> %res -} - -define <4 x float> @test_x86_vcvtph2ps_128_rrk(<8 x i16> %a0,<4 x float> %a1, i8 %mask) { -; X86-LABEL: 
test_x86_vcvtph2ps_128_rrk: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vcvtph2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x13,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_x86_vcvtph2ps_128_rrk: -; X64: # %bb.0: -; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvtph2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x13,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> %a1, i8 %mask) - ret <4 x float> %res -} - - -define <4 x float> @test_x86_vcvtph2ps_128_rrkz(<8 x i16> %a0, i8 %mask) { -; X86-LABEL: test_x86_vcvtph2ps_128_rrkz: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vcvtph2ps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x13,0xc0] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_x86_vcvtph2ps_128_rrkz: -; X64: # %bb.0: -; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvtph2ps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x13,0xc0] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> zeroinitializer, i8 %mask) - ret <4 x float> %res -} - -declare <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16>, <4 x float>, i8) nounwind readonly - -define <8 x float> @test_x86_vcvtph2ps_256(<8 x i16> %a0) { -; CHECK-LABEL: test_x86_vcvtph2ps_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vcvtph2ps %xmm0, %ymm0 # EVEX TO VEX Compression encoding: 
[0xc4,0xe2,0x7d,0x13,0xc0] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> zeroinitializer, i8 -1) - ret <8 x float> %res -} - -define <8 x float> @test_x86_vcvtph2ps_256_rrk(<8 x i16> %a0,<8 x float> %a1, i8 %mask) { -; X86-LABEL: test_x86_vcvtph2ps_256_rrk: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vcvtph2ps %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x13,0xc8] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_x86_vcvtph2ps_256_rrk: -; X64: # %bb.0: -; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvtph2ps %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x13,0xc8] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> %a1, i8 %mask) - ret <8 x float> %res -} - -define <8 x float> @test_x86_vcvtph2ps_256_rrkz(<8 x i16> %a0, i8 %mask) { -; X86-LABEL: test_x86_vcvtph2ps_256_rrkz: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vcvtph2ps %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x13,0xc0] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_x86_vcvtph2ps_256_rrkz: -; X64: # %bb.0: -; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvtph2ps %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x13,0xc0] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> zeroinitializer, i8 %mask) - ret <8 x float> %res -} 
- -declare <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16>, <8 x float>, i8) nounwind readonly - define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0, i8 %mask, <8 x i16> %src) { ; X86-LABEL: test_x86_vcvtps2ph_128: ; X86: # %bb.0: diff --git a/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll @@ -30,7 +30,9 @@ %ins5 = insertelement <8 x i16> %ins4, i16 0, i32 5 %ins6 = insertelement <8 x i16> %ins5, i16 0, i32 6 %ins7 = insertelement <8 x i16> %ins6, i16 0, i32 7 - %cvt = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %ins7) + %shuffle = shufflevector <8 x i16> %ins7, <8 x i16> undef, <4 x i32> + %bc = bitcast <4 x i16> %shuffle to <4 x half> + %cvt = fpext <4 x half> %bc to <4 x float> %res = extractelement <4 x float> %cvt, i32 0 ret float %res } @@ -74,7 +76,9 @@ ; X64-NEXT: vcvtph2ps %xmm0, %xmm0 ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <8 x i16> - %res = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %arg0) + %shuffle = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> + %bc = bitcast <4 x i16> %shuffle to <4 x half> + %res = fpext <4 x half> %bc to <4 x float> ret <4 x float> %res } @@ -89,7 +93,8 @@ ; X64-NEXT: vcvtph2ps %xmm0, %ymm0 ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <8 x i16> - %res = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %arg0) + %bc = bitcast <8 x i16> %arg0 to <8 x half> + %res = fpext <8 x half> %bc to <8 x float> ret <8 x float> %res } diff --git a/llvm/test/Transforms/InstCombine/X86/x86-f16c.ll b/llvm/test/Transforms/InstCombine/X86/x86-f16c.ll --- a/llvm/test/Transforms/InstCombine/X86/x86-f16c.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-f16c.ll @@ -5,14 +5,16 @@ declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) ; -; Vector Demanded Bits +; Vector Demanded Elts ; ; Only bottom 4 elements required. 
define <4 x float> @demand_vcvtph2ps_128(<8 x i16> %A) { ; CHECK-LABEL: @demand_vcvtph2ps_128( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> [[A:%.*]]) -; CHECK-NEXT: ret <4 x float> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <4 x half> +; CHECK-NEXT: [[CVTPH2PS:%.*]] = fpext <4 x half> [[TMP2]] to <4 x float> +; CHECK-NEXT: ret <4 x float> [[CVTPH2PS]] ; %1 = shufflevector <8 x i16> %A, <8 x i16> undef, <8 x i32> %2 = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %1) @@ -23,8 +25,9 @@ define <8 x float> @demand_vcvtph2ps_256(<8 x i16> %A) { ; CHECK-LABEL: @demand_vcvtph2ps_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> undef, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> [[TMP1]]) -; CHECK-NEXT: ret <8 x float> [[TMP2]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <8 x half> +; CHECK-NEXT: [[CVTPH2PS:%.*]] = fpext <8 x half> [[TMP2]] to <8 x float> +; CHECK-NEXT: ret <8 x float> [[CVTPH2PS]] ; %1 = shufflevector <8 x i16> %A, <8 x i16> undef, <8 x i32> %2 = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %1)