Index: lib/CodeGen/CGBuiltin.cpp =================================================================== --- lib/CodeGen/CGBuiltin.cpp +++ lib/CodeGen/CGBuiltin.cpp @@ -8456,6 +8456,28 @@ Builder.getInt16Ty()); } + case X86::BI__builtin_ia32_kunpckdi: + case X86::BI__builtin_ia32_kunpcksi: + case X86::BI__builtin_ia32_kunpckhi: { + unsigned NumElts = Ops[0]->getType()->getScalarSizeInBits(); + Value *LHS = getMaskVecValue(*this, Ops[0], NumElts); + Value *RHS = getMaskVecValue(*this, Ops[1], NumElts); + uint32_t Indices[64]; + for (unsigned i = 0; i != NumElts; ++i) + Indices[i] = i; + + // First extract half of each vector. This gives better codegen than + // doing it in a single shuffle. + LHS = Builder.CreateShuffleVector(LHS, LHS, + makeArrayRef(Indices, NumElts / 2)); + RHS = Builder.CreateShuffleVector(RHS, RHS, + makeArrayRef(Indices, NumElts / 2)); + // Concat the vectors. + Value *Res = Builder.CreateShuffleVector(LHS, RHS, + makeArrayRef(Indices, NumElts)); + return Builder.CreateBitCast(Res, Ops[0]->getType()); + } + case X86::BI__builtin_ia32_vplzcntd_128_mask: case X86::BI__builtin_ia32_vplzcntd_256_mask: case X86::BI__builtin_ia32_vplzcntd_512_mask: Index: lib/Headers/avx512bwintrin.h =================================================================== --- lib/Headers/avx512bwintrin.h +++ lib/Headers/avx512bwintrin.h @@ -1854,13 +1854,15 @@ static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_kunpackd (__mmask64 __A, __mmask64 __B) { - return (__mmask64) (( __A & 0xFFFFFFFF) | ( __B << 32)); + return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, + (__mmask64) __B); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm512_kunpackw (__mmask32 __A, __mmask32 __B) { -return (__mmask32) (( __A & 0xFFFF) | ( __B << 16)); + return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A, + (__mmask32) __B); } static __inline__ __m512i __DEFAULT_FN_ATTRS Index: lib/Headers/avx512fintrin.h =================================================================== --- lib/Headers/avx512fintrin.h +++ lib/Headers/avx512fintrin.h @@ -8787,7 +8787,7 @@ static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm512_kunpackb (__mmask16 __A, __mmask16 __B) { - return (__mmask16) (( __A & 0xFF) | ( __B << 8)); + return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); } static __inline__ __mmask16 __DEFAULT_FN_ATTRS Index: test/CodeGen/avx512bw-builtins.c =================================================================== --- test/CodeGen/avx512bw-builtins.c +++ test/CodeGen/avx512bw-builtins.c @@ -1628,23 +1628,22 @@ __mmask64 test_mm512_kunpackd(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { // CHECK-LABEL: @test_mm512_kunpackd - // CHECK: bitcast <64 x i1> %{{.*}} to i64 - // CHECK: bitcast <64 x i1> %{{.*}} to i64 - // CHECK: and i64 %{{.*}}, 4294967295 - // CHECK: shl i64 %{{.*}}, 32 - // CHECK: or i64 %{{.*}}, %{{.*}} - // CHECK: bitcast i64 %{{.*}} to <64 x i1> + // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1> + // CHECK: [[RHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1> + // CHECK: [[LHS2:%.*]] = shufflevector <64 x i1> [[LHS]], <64 x i1> [[LHS]], <32 x i32> + // CHECK: [[RHS2:%.*]] = shufflevector <64 x i1> [[RHS]], <64 x i1> [[RHS]], <32 x i32> + // CHECK: [[CONCAT:%.*]] = shufflevector <32 x i1> [[LHS2]], <32 x i1> [[RHS2]], <64 x i32> + // CHECK: bitcast <64 x i1> [[CONCAT]] to i64 return _mm512_mask_cmpneq_epu8_mask(_mm512_kunpackd(_mm512_cmpneq_epu8_mask(__B, __A),_mm512_cmpneq_epu8_mask(__C, __D)), __E, __F); } __mmask32 test_mm512_kunpackw(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { // CHECK-LABEL: @test_mm512_kunpackw - // CHECK: bitcast <32 x i1> %{{.*}} to i32 - // CHECK: bitcast <32 x i1> %{{.*}} to i32 - // CHECK: and i32 %{{.*}}, 65535 - // CHECK: shl i32 %{{.*}}, 16 - // CHECK: or i32 %{{.*}}, %{{.*}} - // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1> + // CHECK: [[RHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1> + // CHECK: [[LHS2:%.*]] = shufflevector <32 x i1> [[LHS]], <32 x i1> [[LHS]], <16 x i32> + // CHECK: [[RHS2:%.*]] = shufflevector <32 x i1> [[RHS]], <32 x i1> [[RHS]], <16 x i32> + // CHECK: [[CONCAT:%.*]] = shufflevector <16 x i1> [[LHS2]], <16 x i1> [[RHS2]], <32 x i32> return _mm512_mask_cmpneq_epu16_mask(_mm512_kunpackw(_mm512_cmpneq_epu16_mask(__B, __A),_mm512_cmpneq_epu16_mask(__C, __D)), __E, __F); } Index: test/CodeGen/avx512f-builtins.c =================================================================== --- test/CodeGen/avx512f-builtins.c +++ test/CodeGen/avx512f-builtins.c @@ -6261,12 +6261,12 @@ __mmask16 test_mm512_kunpackb(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { // CHECK-LABEL: @test_mm512_kunpackb - // CHECK: bitcast <16 x i1> %{{.*}} to i16 - // CHECK: bitcast <16 x i1> %{{.*}} to i16 - // CHECK: and i32 %{{.*}}, 255 - // CHECK: shl i32 %{{.*}}, 8 - // CHECK: or i32 %{{.*}}, %{{.*}} - // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: [[LHS2:%.*]] = shufflevector <16 x i1> [[LHS]], <16 x i1> [[LHS]], <8 x i32> + // CHECK: [[RHS2:%.*]] = shufflevector <16 x i1> [[RHS]], <16 x i1> [[RHS]], <8 x i32> + // CHECK: [[CONCAT:%.*]] = shufflevector <8 x i1> [[LHS2]], <8 x i1> [[RHS2]], <16 x i32> + // CHECK: bitcast <16 x i1> [[CONCAT]] to i16 return _mm512_mask_cmpneq_epu32_mask(_mm512_kunpackb(_mm512_cmpneq_epu32_mask(__A, __B), _mm512_cmpneq_epu32_mask(__C, __D)), __E, __F);