Index: lib/CodeGen/CGBuiltin.cpp
===================================================================
--- lib/CodeGen/CGBuiltin.cpp
+++ lib/CodeGen/CGBuiltin.cpp
@@ -8264,6 +8264,26 @@
   return EmitX86Select(CGF, Ops[3], Res, Ops[2]);
 }
 
+static Value *EmitX86Muldq(CodeGenFunction &CGF, bool IsSigned,
+                           ArrayRef<Value *> Ops) {
+  llvm::Type *Ty = Ops[0]->getType();
+  // Arguments have a vXi32 type so cast to vXi64.
+  llvm::Type *CastTy = llvm::VectorType::get(CGF.Int64Ty,
+                                             Ty->getVectorNumElements() / 2);
+  Value *LHS = CGF.Builder.CreateBitCast(Ops[0], CastTy);
+  Value *RHS = CGF.Builder.CreateBitCast(Ops[1], CastTy);
+
+  // Truncate and then extend.
+  llvm::Type *TruncTy = llvm::VectorType::get(CGF.Int32Ty,
+                                              Ty->getVectorNumElements() / 2);
+  LHS = CGF.Builder.CreateTrunc(LHS, TruncTy);
+  RHS = CGF.Builder.CreateTrunc(RHS, TruncTy);
+  LHS = CGF.Builder.CreateIntCast(LHS, CastTy, IsSigned);
+  RHS = CGF.Builder.CreateIntCast(RHS, CastTy, IsSigned);
+
+  return CGF.Builder.CreateMul(LHS, RHS);
+}
+
 static Value *EmitX86SExtMask(CodeGenFunction &CGF, Value *Op,
                               llvm::Type *DstTy) {
   unsigned NumberOfElements = DstTy->getVectorNumElements();
@@ -8968,6 +8988,16 @@
   case X86::BI__builtin_ia32_pminuq512_mask:
     return EmitX86MinMax(*this, ICmpInst::ICMP_ULT, Ops);
 
+  case X86::BI__builtin_ia32_pmuludq128:
+  case X86::BI__builtin_ia32_pmuludq256:
+  case X86::BI__builtin_ia32_pmuludq512:
+    return EmitX86Muldq(*this, /*IsSigned*/false, Ops);
+
+  case X86::BI__builtin_ia32_pmuldq128:
+  case X86::BI__builtin_ia32_pmuldq256:
+  case X86::BI__builtin_ia32_pmuldq512:
+    return EmitX86Muldq(*this, /*IsSigned*/true, Ops);
+
   // 3DNow!
   case X86::BI__builtin_ia32_pswapdsf:
   case X86::BI__builtin_ia32_pswapdsi: {
Index: test/CodeGen/avx2-builtins.c
===================================================================
--- test/CodeGen/avx2-builtins.c
+++ test/CodeGen/avx2-builtins.c
@@ -835,13 +835,21 @@
 
 __m256i test_mm256_mul_epi32(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_mul_epi32
-  // CHECK: call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+  // CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+  // CHECK: sext <4 x i32> %{{.*}} to <4 x i64>
+  // CHECK: sext <4 x i32> %{{.*}} to <4 x i64>
+  // CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
   return _mm256_mul_epi32(a, b);
 }
 
 __m256i test_mm256_mul_epu32(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_mul_epu32
-  // CHECK: call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+  // CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+  // CHECK: zext <4 x i32> %{{.*}} to <4 x i64>
+  // CHECK: zext <4 x i32> %{{.*}} to <4 x i64>
+  // CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
   return _mm256_mul_epu32(a, b);
 }
 
Index: test/CodeGen/avx512f-builtins.c
===================================================================
--- test/CodeGen/avx512f-builtins.c
+++ test/CodeGen/avx512f-builtins.c
@@ -1874,13 +1874,21 @@
 
 __m512i test_mm512_mul_epi32(__m512i __A, __m512i __B) {
   //CHECK-LABEL: @test_mm512_mul_epi32
-  //CHECK: @llvm.x86.avx512.pmul.dq.512
+  //CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+  //CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+  //CHECK: sext <8 x i32> %{{.*}} to <8 x i64>
+  //CHECK: sext <8 x i32> %{{.*}} to <8 x i64>
+  //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
   return _mm512_mul_epi32(__A,__B);
 }
 
 __m512i test_mm512_maskz_mul_epi32 (__mmask16 __k,__m512i __A, __m512i __B) {
   //CHECK-LABEL: @test_mm512_maskz_mul_epi32
-  //CHECK: @llvm.x86.avx512.pmul.dq.512
+  //CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+  //CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+  //CHECK: sext <8 x i32> %{{.*}} to <8 x i64>
+  //CHECK: sext <8 x i32> %{{.*}} to <8 x i64>
+  //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
   //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_maskz_mul_epi32(__k,__A,__B);
 }
@@ -1888,20 +1896,32 @@
 __m512i test_mm512_mask_mul_epi32 (__mmask16 __k,__m512i __A, __m512i __B,
                                    __m512i __src) {
   //CHECK-LABEL: @test_mm512_mask_mul_epi32
-  //CHECK: @llvm.x86.avx512.pmul.dq.512
+  //CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+  //CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+  //CHECK: sext <8 x i32> %{{.*}} to <8 x i64>
+  //CHECK: sext <8 x i32> %{{.*}} to <8 x i64>
+  //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
   //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_mask_mul_epi32(__src,__k,__A,__B);
 }
 
 __m512i test_mm512_mul_epu32 (__m512i __A, __m512i __B) {
   //CHECK-LABEL: @test_mm512_mul_epu32
-  //CHECK: @llvm.x86.avx512.pmulu.dq.512
+  //CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+  //CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+  //CHECK: zext <8 x i32> %{{.*}} to <8 x i64>
+  //CHECK: zext <8 x i32> %{{.*}} to <8 x i64>
+  //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
   return _mm512_mul_epu32(__A,__B);
 }
 
 __m512i test_mm512_maskz_mul_epu32 (__mmask16 __k,__m512i __A, __m512i __B) {
   //CHECK-LABEL: @test_mm512_maskz_mul_epu32
-  //CHECK: @llvm.x86.avx512.pmulu.dq.512
+  //CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+  //CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+  //CHECK: zext <8 x i32> %{{.*}} to <8 x i64>
+  //CHECK: zext <8 x i32> %{{.*}} to <8 x i64>
+  //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
   //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_maskz_mul_epu32(__k,__A,__B);
 }
@@ -1909,7 +1929,11 @@
 __m512i test_mm512_mask_mul_epu32 (__mmask16 __k,__m512i __A, __m512i __B,
                                    __m512i __src) {
   //CHECK-LABEL: @test_mm512_mask_mul_epu32
-  //CHECK: @llvm.x86.avx512.pmulu.dq.512
+  //CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+  //CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+  //CHECK: zext <8 x i32> %{{.*}} to <8 x i64>
+  //CHECK: zext <8 x i32> %{{.*}} to <8 x i64>
+  //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
   //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_mask_mul_epu32(__src,__k,__A,__B);
 }
Index: test/CodeGen/avx512vl-builtins.c
===================================================================
--- test/CodeGen/avx512vl-builtins.c
+++ test/CodeGen/avx512vl-builtins.c
@@ -727,14 +727,22 @@
 __m256i test_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
                                    __m256i __Y) {
   //CHECK-LABEL: @test_mm256_mask_mul_epi32
-  //CHECK: @llvm.x86.avx2.pmul.dq
+  //CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+  //CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+  //CHECK: sext <4 x i32> %{{.*}} to <4 x i64>
+  //CHECK: sext <4 x i32> %{{.*}} to <4 x i64>
+  //CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
   //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_mask_mul_epi32(__W, __M, __X, __Y);
 }
 
 __m256i test_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) {
   //CHECK-LABEL: @test_mm256_maskz_mul_epi32
-  //CHECK: @llvm.x86.avx2.pmul.dq
+  //CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+  //CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+  //CHECK: sext <4 x i32> %{{.*}} to <4 x i64>
+  //CHECK: sext <4 x i32> %{{.*}} to <4 x i64>
+  //CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
   //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_maskz_mul_epi32(__M, __X, __Y);
 }
@@ -743,14 +751,22 @@
 __m128i test_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X,
                                 __m128i __Y) {
   //CHECK-LABEL: @test_mm_mask_mul_epi32
-  //CHECK: @llvm.x86.sse41.pmuldq
+  //CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  //CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  //CHECK: sext <2 x i32> %{{.*}} to <2 x i64>
+  //CHECK: sext <2 x i32> %{{.*}} to <2 x i64>
+  //CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
   //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_mask_mul_epi32(__W, __M, __X, __Y);
 }
 
 __m128i test_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y) {
   //CHECK-LABEL: @test_mm_maskz_mul_epi32
-  //CHECK: @llvm.x86.sse41.pmuldq
+  //CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  //CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  //CHECK: sext <2 x i32> %{{.*}} to <2 x i64>
+  //CHECK: sext <2 x i32> %{{.*}} to <2 x i64>
+  //CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
   //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_maskz_mul_epi32(__M, __X, __Y);
 }
@@ -758,14 +774,22 @@
 __m256i test_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X,
                                    __m256i __Y) {
   //CHECK-LABEL: @test_mm256_mask_mul_epu32
-  //CHECK: @llvm.x86.avx2.pmulu.dq
+  //CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+  //CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+  //CHECK: zext <4 x i32> %{{.*}} to <4 x i64>
+  //CHECK: zext <4 x i32> %{{.*}} to <4 x i64>
+  //CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
   //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_mask_mul_epu32(__W, __M, __X, __Y);
 }
 
 __m256i test_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y) {
   //CHECK-LABEL: @test_mm256_maskz_mul_epu32
-  //CHECK: @llvm.x86.avx2.pmulu.dq
+  //CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+  //CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+  //CHECK: zext <4 x i32> %{{.*}} to <4 x i64>
+  //CHECK: zext <4 x i32> %{{.*}} to <4 x i64>
+  //CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
   //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_maskz_mul_epu32(__M, __X, __Y);
 }
@@ -773,14 +797,22 @@
 __m128i test_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X,
                                 __m128i __Y) {
   //CHECK-LABEL: @test_mm_mask_mul_epu32
-  //CHECK: @llvm.x86.sse2.pmulu.dq
+  //CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  //CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  //CHECK: zext <2 x i32> %{{.*}} to <2 x i64>
+  //CHECK: zext <2 x i32> %{{.*}} to <2 x i64>
+  //CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
   //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_mask_mul_epu32(__W, __M, __X, __Y);
 }
 
 __m128i test_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y) {
   //CHECK-LABEL: @test_mm_maskz_mul_epu32
-  //CHECK: @llvm.x86.sse2.pmulu.dq
+  //CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  //CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  //CHECK: zext <2 x i32> %{{.*}} to <2 x i64>
+  //CHECK: zext <2 x i32> %{{.*}} to <2 x i64>
+  //CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
   //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_maskz_mul_epu32(__M, __X, __Y);
 }
Index: test/CodeGen/sse2-builtins.c
===================================================================
--- test/CodeGen/sse2-builtins.c
+++ test/CodeGen/sse2-builtins.c
@@ -816,7 +816,11 @@
 
 __m128i test_mm_mul_epu32(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_mul_epu32
-  // CHECK: call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  // CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  // CHECK: zext <2 x i32> %{{.*}} to <2 x i64>
+  // CHECK: zext <2 x i32> %{{.*}} to <2 x i64>
+  // CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
   return _mm_mul_epu32(A, B);
 }
 
Index: test/CodeGen/sse41-builtins.c
===================================================================
--- test/CodeGen/sse41-builtins.c
+++ test/CodeGen/sse41-builtins.c
@@ -312,7 +312,11 @@
 
 __m128i test_mm_mul_epi32(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_mul_epi32
-  // CHECK: call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  // CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  // CHECK: sext <2 x i32> %{{.*}} to <2 x i64>
+  // CHECK: sext <2 x i32> %{{.*}} to <2 x i64>
+  // CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
   return _mm_mul_epi32(x, y);
 }
 
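
Note for reviewers (not part of the patch): the rewrite relies on pmuldq/pmuludq reading only the low 32 bits of each 64-bit lane, which is why bitcasting the vXi32 operands to vXi64, truncating each lane to i32, sign- or zero-extending back to i64, and multiplying is equivalent to the old intrinsics. Below is a minimal scalar C sketch of one lane, illustrative only: the helper names are made up here, and the signed narrowing cast assumes the usual two's-complement truncation.

    #include <stdint.h>

    /* One 64-bit lane of pmuldq: keep only the low 32 bits of each operand,
       sign-extend them back to 64 bits, then multiply. This mirrors the
       trunc + sext + mul sequence EmitX86Muldq emits per vector element. */
    static int64_t pmuldq_lane(uint64_t a, uint64_t b) {
      return (int64_t)(int32_t)(uint32_t)a * (int64_t)(int32_t)(uint32_t)b;
    }

    /* One 64-bit lane of pmuludq: same idea, but zero-extend the low 32 bits
       (trunc + zext + mul). The product always fits in 64 bits. */
    static uint64_t pmuludq_lane(uint64_t a, uint64_t b) {
      return (uint64_t)(uint32_t)a * (uint64_t)(uint32_t)b;
    }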