Index: cfe/trunk/lib/CodeGen/CGBuiltin.cpp =================================================================== --- cfe/trunk/lib/CodeGen/CGBuiltin.cpp +++ cfe/trunk/lib/CodeGen/CGBuiltin.cpp @@ -7923,6 +7923,45 @@ return EmitX86Select(*this, Ops[4], Align, Ops[3]); } + case X86::BI__builtin_ia32_vperm2f128_pd256: + case X86::BI__builtin_ia32_vperm2f128_ps256: + case X86::BI__builtin_ia32_vperm2f128_si256: + case X86::BI__builtin_ia32_permti256: { + unsigned Imm = cast(Ops[2])->getZExtValue(); + unsigned NumElts = Ops[0]->getType()->getVectorNumElements(); + + // This takes a very simple approach since there are two lanes and a + // shuffle can have 2 inputs. So we reserve the first input for the first + // lane and the second input for the second lane. This may result in + // duplicate sources, but this can be dealt with in the backend. + + Value *OutOps[2]; + uint32_t Indices[8]; + for (unsigned l = 0; l != 2; ++l) { + // Determine the source for this lane. + if (Imm & (1 << ((l * 4) + 3))) + OutOps[l] = llvm::ConstantAggregateZero::get(Ops[0]->getType()); + else if (Imm & (1 << ((l * 4) + 1))) + OutOps[l] = Ops[1]; + else + OutOps[l] = Ops[0]; + + for (unsigned i = 0; i != NumElts/2; ++i) { + // Start with ith element of the source for this lane. + unsigned Idx = (l * NumElts) + i; + // If bit 0 of the immediate half is set, switch to the high half of + // the source. + if (Imm & (1 << (l * 4))) + Idx += NumElts/2; + Indices[(l * (NumElts/2)) + i] = Idx; + } + } + + return Builder.CreateShuffleVector(OutOps[0], OutOps[1], + makeArrayRef(Indices, NumElts), + "vperm"); + } + case X86::BI__builtin_ia32_movnti: case X86::BI__builtin_ia32_movnti64: case X86::BI__builtin_ia32_movntsd: Index: cfe/trunk/test/CodeGen/avx-builtins.c =================================================================== --- cfe/trunk/test/CodeGen/avx-builtins.c +++ cfe/trunk/test/CodeGen/avx-builtins.c @@ -678,19 +678,19 @@ __m256d test_mm256_permute2f128_pd(__m256d A, __m256d B) { // CHECK-LABEL: test_mm256_permute2f128_pd - // CHECK: call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i8 49) + // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> return _mm256_permute2f128_pd(A, B, 0x31); } __m256 test_mm256_permute2f128_ps(__m256 A, __m256 B) { // CHECK-LABEL: test_mm256_permute2f128_ps - // CHECK: call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i8 19) + // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> return _mm256_permute2f128_ps(A, B, 0x13); } __m256i test_mm256_permute2f128_si256(__m256i A, __m256i B) { // CHECK-LABEL: test_mm256_permute2f128_si256 - // CHECK: call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, i8 32) + // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> return _mm256_permute2f128_si256(A, B, 0x20); } Index: cfe/trunk/test/CodeGen/avx2-builtins.c =================================================================== --- cfe/trunk/test/CodeGen/avx2-builtins.c +++ cfe/trunk/test/CodeGen/avx2-builtins.c @@ -907,8 +907,8 @@ __m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_permute2x128_si256 - // CHECK: call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, i8 49) - return _mm256_permute2x128_si256(a, b, 0x31); + // CHECK: shufflevector <4 x i64> zeroinitializer, <4 x i64> %{{.*}}, <4 x i32> + return _mm256_permute2x128_si256(a, b, 0x38); } __m256i test_mm256_permute4x64_epi64(__m256i a) {