Index: lib/IR/AutoUpgrade.cpp
===================================================================
--- lib/IR/AutoUpgrade.cpp
+++ lib/IR/AutoUpgrade.cpp
@@ -179,6 +179,8 @@
         Name == "x86.sse2.psrl.dq.bs" ||
         Name == "x86.avx2.psll.dq.bs" ||
         Name == "x86.avx2.psrl.dq.bs" ||
+        Name == "x86.ssse3.palign.r.128" ||
+        Name == "x86.avx2.palign.r.256" ||
         Name == "x86.sse41.pblendw" ||
         Name == "x86.sse41.blendpd" ||
         Name == "x86.sse41.blendps" ||
@@ -611,6 +613,50 @@
       unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
       Rep = UpgradeX86PSRLDQIntrinsics(Builder, C, CI->getArgOperand(0), 2,
                                        Shift);
+    } else if (Name == "llvm.x86.ssse3.palign.r.128" ||
+               Name == "llvm.x86.avx2.palign.r.256") {
+      Value *Op0 = CI->getArgOperand(0);
+      Value *Op1 = CI->getArgOperand(1);
+      unsigned ShiftVal = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
+
+      VectorType *VecTy = cast<VectorType>(CI->getType());
+      unsigned VecBitWidth = VecTy->getBitWidth();
+      unsigned NumLanes = VecBitWidth / 128;
+      unsigned NumElts = NumLanes * 16;
+      unsigned NumLaneElts = NumElts / NumLanes;
+      assert(0 == (VecBitWidth % 128) && "Illegal vector width");
+      VectorType *ShufTy = VectorType::get(Type::getInt8Ty(C), NumElts);
+
+      // If palignr is shifting the pair of vectors more than the size of two
+      // lanes, emit zero.
+      if (ShiftVal >= (2 * NumLaneElts)) {
+        Rep = llvm::Constant::getNullValue(CI->getType());
+      } else {
+        // If palignr is shifting the pair of input vectors more than one lane,
+        // but less than two lanes, convert to shifting in zeroes.
+        if (ShiftVal > NumLaneElts) {
+          ShiftVal -= NumLaneElts;
+          Op0 = llvm::Constant::getNullValue(VecTy);
+        }
+
+        Op0 = Builder.CreateBitCast(Op0, ShufTy);
+        Op1 = Builder.CreateBitCast(Op1, ShufTy);
+
+        SmallVector<llvm::Constant*, 32> Indices;
+        // 256-bit palignr operates on 128-bit lanes so we need to handle that
+        for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+          for (unsigned i = 0; i != NumLaneElts; ++i) {
+            unsigned Idx = ShiftVal + i;
+            if (Idx >= NumLaneElts)
+              Idx += NumElts - NumLaneElts; // End of lane, switch operand.
+            Indices.push_back(llvm::ConstantInt::get(Type::getInt32Ty(C), Idx + l));
+          }
+        }
+
+        Value* SV = llvm::ConstantVector::get(Indices);
+        Rep = Builder.CreateShuffleVector(Op1, Op0, SV);
+        Rep = Builder.CreateBitCast(Rep, VecTy);
+      }
     } else if (Name == "llvm.x86.sse41.pblendw" ||
                Name == "llvm.x86.sse41.blendpd" ||
                Name == "llvm.x86.sse41.blendps" ||
Index: test/CodeGen/X86/palignr-2.ll
===================================================================
--- test/CodeGen/X86/palignr-2.ll
+++ test/CodeGen/X86/palignr-2.ll
@@ -8,8 +8,8 @@
 define void @t1(<2 x i64> %a, <2 x i64> %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: t1:
-; palignr $3, %xmm1, %xmm0
-  %0 = tail call <2 x i64> @llvm.x86.ssse3.palign.r.128(<2 x i64> %a, <2 x i64> %b, i8 24) nounwind readnone
+; CHECK: palignr $3, %xmm1, %xmm0
+  %0 = tail call <2 x i64> @llvm.x86.ssse3.palign.r.128(<2 x i64> %a, <2 x i64> %b, i8 3) nounwind readnone
   store <2 x i64> %0, <2 x i64>* bitcast ([4 x i32]* @c to <2 x i64>*), align 16
   ret void
 }
@@ -19,10 +19,10 @@
 define void @t2() nounwind ssp {
 entry:
 ; CHECK-LABEL: t2:
-; palignr $4, _b, %xmm0
+; CHECK: palignr $4, _b, %xmm0
   %0 = load <2 x i64>, <2 x i64>* bitcast ([4 x i32]* @b to <2 x i64>*), align 16 ; <<2 x i64>> [#uses=1]
   %1 = load <2 x i64>, <2 x i64>* bitcast ([4 x i32]* @a to <2 x i64>*), align 16 ; <<2 x i64>> [#uses=1]
-  %2 = tail call <2 x i64> @llvm.x86.ssse3.palign.r.128(<2 x i64> %1, <2 x i64> %0, i8 32) nounwind readnone
+  %2 = tail call <2 x i64> @llvm.x86.ssse3.palign.r.128(<2 x i64> %1, <2 x i64> %0, i8 4) nounwind readnone
   store <2 x i64> %2, <2 x i64>* bitcast ([4 x i32]* @c to <2 x i64>*), align 16
   ret void
 }