Index: include/clang/Basic/BuiltinsX86.def
===================================================================
--- include/clang/Basic/BuiltinsX86.def
+++ include/clang/Basic/BuiltinsX86.def
@@ -332,7 +332,6 @@
 BUILTIN(__builtin_ia32_monitor, "vv*UiUi", "")
 BUILTIN(__builtin_ia32_mwait, "vUiUi", "")
 BUILTIN(__builtin_ia32_lddqu, "V16ccC*", "")
-BUILTIN(__builtin_ia32_palignr128, "V16cV16cV16cIc", "")
 BUILTIN(__builtin_ia32_insertps128, "V4fV4fV4fIc", "")
 BUILTIN(__builtin_ia32_pblendvb128, "V16cV16cV16cV16c", "")
@@ -507,7 +506,6 @@
 BUILTIN(__builtin_ia32_paddusw256, "V16sV16sV16s", "")
 BUILTIN(__builtin_ia32_psubusb256, "V32cV32cV32c", "")
 BUILTIN(__builtin_ia32_psubusw256, "V16sV16sV16s", "")
-BUILTIN(__builtin_ia32_palignr256, "V32cV32cV32cIc", "")
 BUILTIN(__builtin_ia32_pavgb256, "V32cV32cV32c", "")
 BUILTIN(__builtin_ia32_pavgw256, "V16sV16sV16s", "")
 BUILTIN(__builtin_ia32_pblendvb256, "V32cV32cV32cV32c", "")
Index: lib/CodeGen/CGBuiltin.cpp
===================================================================
--- lib/CodeGen/CGBuiltin.cpp
+++ lib/CodeGen/CGBuiltin.cpp
@@ -5926,42 +5926,6 @@
     Ops[0] = Builder.CreateBitCast(Ops[0], PtrTy);
     return Builder.CreateStore(Ops[1], Ops[0]);
   }
-  case X86::BI__builtin_ia32_palignr128:
-  case X86::BI__builtin_ia32_palignr256: {
-    unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
-
-    unsigned NumElts =
-      cast<llvm::VectorType>(Ops[0]->getType())->getNumElements();
-    assert(NumElts % 16 == 0);
-    unsigned NumLanes = NumElts / 16;
-    unsigned NumLaneElts = NumElts / NumLanes;
-
-    // If palignr is shifting the pair of vectors more than the size of two
-    // lanes, emit zero.
-    if (ShiftVal >= (2 * NumLaneElts))
-      return llvm::Constant::getNullValue(ConvertType(E->getType()));
-
-    // If palignr is shifting the pair of input vectors more than one lane,
-    // but less than two lanes, convert to shifting in zeroes.
-    if (ShiftVal > NumLaneElts) {
-      ShiftVal -= NumLaneElts;
-      Ops[0] = llvm::Constant::getNullValue(Ops[0]->getType());
-    }
-
-    SmallVector<llvm::Constant*, 32> Indices;
-    // 256-bit palignr operates on 128-bit lanes so we need to handle that
-    for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
-      for (unsigned i = 0; i != NumLaneElts; ++i) {
-        unsigned Idx = ShiftVal + i;
-        if (Idx >= NumLaneElts)
-          Idx += NumElts - NumLaneElts; // End of lane, switch operand.
-        Indices.push_back(llvm::ConstantInt::get(Int32Ty, Idx + l));
-      }
-    }
-
-    Value* SV = llvm::ConstantVector::get(Indices);
-    return Builder.CreateShuffleVector(Ops[1], Ops[0], SV, "palignr");
-  }
   case X86::BI__builtin_ia32_pslldqi256: {
     // Shift value is in bits so divide by 8.
     unsigned shiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() >> 3;
Index: lib/Headers/avx2intrin.h
===================================================================
--- lib/Headers/avx2intrin.h
+++ lib/Headers/avx2intrin.h
@@ -121,10 +121,42 @@
   return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b);
 }
 
-#define _mm256_alignr_epi8(a, b, n) __extension__ ({ \
-  __m256i __a = (a); \
-  __m256i __b = (b); \
-  (__m256i)__builtin_ia32_palignr256((__v32qi)__a, (__v32qi)__b, (n)); })
+#define _mm256_alignr_epi8(a, b, imm) __extension__ ({ \
+  __m256i __a = (((imm)&0xFF) > 31 ? _mm256_setzero_si256() : (((imm)&0xFF) > 15 ? (__m256i)(a) : (__m256i)(b))); \
+  __m256i __b = (((imm)&0xFF) > 15 ? _mm256_setzero_si256() : (__m256i)(a)); \
+  (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, \
+    ( 0+((imm)&0xF)+(( 0+((imm)&0xF))&0x10)), \
+    ( 1+((imm)&0xF)+(( 1+((imm)&0xF))&0x10)), \
+    ( 2+((imm)&0xF)+(( 2+((imm)&0xF))&0x10)), \
+    ( 3+((imm)&0xF)+(( 3+((imm)&0xF))&0x10)), \
+    ( 4+((imm)&0xF)+(( 4+((imm)&0xF))&0x10)), \
+    ( 5+((imm)&0xF)+(( 5+((imm)&0xF))&0x10)), \
+    ( 6+((imm)&0xF)+(( 6+((imm)&0xF))&0x10)), \
+    ( 7+((imm)&0xF)+(( 7+((imm)&0xF))&0x10)), \
+    ( 8+((imm)&0xF)+(( 8+((imm)&0xF))&0x10)), \
+    ( 9+((imm)&0xF)+(( 9+((imm)&0xF))&0x10)), \
+    (10+((imm)&0xF)+((10+((imm)&0xF))&0x10)), \
+    (11+((imm)&0xF)+((11+((imm)&0xF))&0x10)), \
+    (12+((imm)&0xF)+((12+((imm)&0xF))&0x10)), \
+    (13+((imm)&0xF)+((13+((imm)&0xF))&0x10)), \
+    (14+((imm)&0xF)+((14+((imm)&0xF))&0x10)), \
+    (15+((imm)&0xF)+((15+((imm)&0xF))&0x10)), \
+    (16+((imm)&0xF)+(( 0+((imm)&0xF))&0x10)), \
+    (17+((imm)&0xF)+(( 1+((imm)&0xF))&0x10)), \
+    (18+((imm)&0xF)+(( 2+((imm)&0xF))&0x10)), \
+    (19+((imm)&0xF)+(( 3+((imm)&0xF))&0x10)), \
+    (20+((imm)&0xF)+(( 4+((imm)&0xF))&0x10)), \
+    (21+((imm)&0xF)+(( 5+((imm)&0xF))&0x10)), \
+    (22+((imm)&0xF)+(( 6+((imm)&0xF))&0x10)), \
+    (23+((imm)&0xF)+(( 7+((imm)&0xF))&0x10)), \
+    (24+((imm)&0xF)+(( 8+((imm)&0xF))&0x10)), \
+    (25+((imm)&0xF)+(( 9+((imm)&0xF))&0x10)), \
+    (26+((imm)&0xF)+((10+((imm)&0xF))&0x10)), \
+    (27+((imm)&0xF)+((11+((imm)&0xF))&0x10)), \
+    (28+((imm)&0xF)+((12+((imm)&0xF))&0x10)), \
+    (29+((imm)&0xF)+((13+((imm)&0xF))&0x10)), \
+    (30+((imm)&0xF)+((14+((imm)&0xF))&0x10)), \
+    (31+((imm)&0xF)+((15+((imm)&0xF))&0x10))); })
 
 static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
 _mm256_and_si256(__m256i __a, __m256i __b)
Index: lib/Headers/tmmintrin.h
===================================================================
--- lib/Headers/tmmintrin.h
+++ lib/Headers/tmmintrin.h
@@ -66,10 +66,26 @@
   return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
 }
 
-#define _mm_alignr_epi8(a, b, n) __extension__ ({ \
-  __m128i __a = (a); \
-  __m128i __b = (b); \
-  (__m128i)__builtin_ia32_palignr128((__v16qi)__a, (__v16qi)__b, (n)); })
+#define _mm_alignr_epi8(a, b, imm) __extension__ ({ \
+  __m128i __a = (((imm)&0xFF) > 31 ? _mm_setzero_si128() : (((imm)&0xFF) > 15 ? (__m128i)(a) : (__m128i)(b))); \
+  __m128i __b = (((imm)&0xFF) > 15 ? _mm_setzero_si128() : (__m128i)(a)); \
+  (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, \
+    ( 0+((imm)&0xF)), \
+    ( 1+((imm)&0xF)), \
+    ( 2+((imm)&0xF)), \
+    ( 3+((imm)&0xF)), \
+    ( 4+((imm)&0xF)), \
+    ( 5+((imm)&0xF)), \
+    ( 6+((imm)&0xF)), \
+    ( 7+((imm)&0xF)), \
+    ( 8+((imm)&0xF)), \
+    ( 9+((imm)&0xF)), \
+    (10+((imm)&0xF)), \
+    (11+((imm)&0xF)), \
+    (12+((imm)&0xF)), \
+    (13+((imm)&0xF)), \
+    (14+((imm)&0xF)), \
+    (15+((imm)&0xF))); })
 
 #define _mm_alignr_pi8(a, b, n) __extension__ ({ \
   __m64 __a = (a); \
Index: test/CodeGen/avx2-builtins.c
===================================================================
--- test/CodeGen/avx2-builtins.c
+++ test/CodeGen/avx2-builtins.c
@@ -96,7 +96,7 @@
 }
 
 __m256i test2_mm256_alignr_epi8(__m256i a, __m256i b) {
-  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> zeroinitializer, <32 x i32>
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32>
   return _mm256_alignr_epi8(a, b, 17);
 }
 
Index: test/CodeGen/builtins-x86.c
===================================================================
--- test/CodeGen/builtins-x86.c
+++ test/CodeGen/builtins-x86.c
@@ -48,8 +48,8 @@
   void* tmp_vp;
   const void* tmp_vCp;
-  char* tmp_cp;
-  const char* tmp_cCp;
+  char* tmp_cp;
+  const char* tmp_cCp;
   int* tmp_ip;
   float* tmp_fp;
   const float* tmp_fCp;
@@ -350,7 +350,6 @@
   (void) __builtin_ia32_monitor(tmp_vp, tmp_Ui, tmp_Ui);
   (void) __builtin_ia32_mwait(tmp_Ui, tmp_Ui);
   tmp_V16c = __builtin_ia32_lddqu(tmp_cCp);
-  tmp_V2LLi = __builtin_ia32_palignr128(tmp_V2LLi, tmp_V2LLi, imm_i);
   tmp_V1LLi = __builtin_ia32_palignr(tmp_V1LLi, tmp_V1LLi, imm_i);
 #ifdef USE_SSE4
   tmp_V16c = __builtin_ia32_pblendvb128(tmp_V16c, tmp_V16c, tmp_V16c);
Index: test/CodeGen/palignr.c
===================================================================
--- test/CodeGen/palignr.c
+++ test/CodeGen/palignr.c
@@ -1,31 +0,0 @@
-// REQUIRES: x86-registered-target
-// RUN: %clang_cc1 %s -triple=i686-apple-darwin -target-feature +ssse3 -O1 -S -o - | FileCheck %s
-
-#define _mm_alignr_epi8(a, b, n) (__builtin_ia32_palignr128((a), (b), (n)))
-typedef __attribute__((vector_size(16))) int int4;
-
-// CHECK: palignr
-int4 align1(int4 a, int4 b) { return _mm_alignr_epi8(a, b, 15); }
-// CHECK: ret
-// CHECK: ret
-// CHECK-NOT: palignr
-int4 align2(int4 a, int4 b) { return _mm_alignr_epi8(a, b, 16); }
-// CHECK: psrldq
-int4 align3(int4 a, int4 b) { return _mm_alignr_epi8(a, b, 17); }
-// CHECK: xor
-int4 align4(int4 a, int4 b) { return _mm_alignr_epi8(a, b, 32); }
-
-#define _mm_alignr_pi8(a, b, n) (__builtin_ia32_palignr((a), (b), (n)))
-typedef __attribute__((vector_size(8))) int int2;
-
-// CHECK: palignr
-int2 align5(int2 a, int2 b) { return _mm_alignr_pi8(a, b, 8); }
-
-// CHECK: palignr
-int2 align6(int2 a, int2 b) { return _mm_alignr_pi8(a, b, 9); }
-
-// CHECK: palignr
-int2 align7(int2 a, int2 b) { return _mm_alignr_pi8(a, b, 16); }
-
-// CHECK: palignr
-int2 align8(int2 a, int2 b) { return _mm_alignr_pi8(a, b, 7); }
Index: test/CodeGen/sse-builtins.c
===================================================================
--- test/CodeGen/sse-builtins.c
+++ test/CodeGen/sse-builtins.c
@@ -574,6 +574,6 @@
 }
 
 __m128i test2_mm_alignr_epi8(__m128i a, __m128i b) {
-  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> zeroinitializer, <16 x i32>
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32>
   return _mm_alignr_epi8(a, b, 17);
 }
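
For reference, and not part of the patch itself: a minimal standalone sketch of the byte-selection rule the header-only macros above must reproduce. PALIGNR concatenates the two operands as {b, a}, with b in the low 16 bytes and a in the high 16 bytes, and returns the 16 bytes starting at byte offset n; offsets 16..31 shift zeroes in behind a, and offsets of 32 or more give an all-zero result (the 256-bit form applies the same rule per 128-bit lane). The file name, byte values, and chosen shift amounts below are illustrative only; build with SSSE3 enabled, e.g. cc -mssse3 palignr_demo.c.

/* palignr_demo.c: print _mm_alignr_epi8 results for a few shift amounts so
 * the concatenate-and-shift semantics are easy to eyeball. */
#include <stdio.h>
#include <tmmintrin.h>

/* Print the 16 bytes of a vector, lowest byte first. */
static void dump(const char *label, __m128i v) {
  unsigned char bytes[16];
  _mm_storeu_si128((__m128i *)bytes, v);
  printf("%-6s", label);
  for (int i = 0; i < 16; ++i)
    printf(" %2d", bytes[i]);
  printf("\n");
}

int main(void) {
  unsigned char lo[16], hi[16];
  for (int i = 0; i < 16; ++i) {
    lo[i] = (unsigned char)i;        /* bytes  0..15 -> operand b (low half)  */
    hi[i] = (unsigned char)(16 + i); /* bytes 16..31 -> operand a (high half) */
  }
  __m128i a = _mm_loadu_si128((const __m128i *)hi);
  __m128i b = _mm_loadu_si128((const __m128i *)lo);

  /* Each result is 16 bytes of the concatenation {b, a} starting at offset n. */
  dump("n=5:",  _mm_alignr_epi8(a, b, 5));  /* 5 6 ... 20                   */
  dump("n=16:", _mm_alignr_epi8(a, b, 16)); /* 16 17 ... 31, i.e. exactly a */
  dump("n=20:", _mm_alignr_epi8(a, b, 20)); /* 20 ... 31, then four zeroes  */
  return 0;
}

The n=16 and n=20 rows exercise the range where the result is drawn entirely from the a operand with zero fill, which is the case the conditional operand selection in the new macros has to get right.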