Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -19325,8 +19325,12 @@
   // Merge this shuffle operation's mask into our accumulated mask. This is
   // a bit tricky as the shuffle may have a different size from the root.
   if (OpMask.size() == IncomingMask.size()) {
-    for (int M : IncomingMask)
-      Mask.push_back(OpMask[M]);
+    for (int M : IncomingMask) {
+      if (M == SM_SentinelZero)
+        Mask.push_back(SM_SentinelZero);
+      else
+        Mask.push_back(OpMask[M]);
+    }
   } else if (OpMask.size() < IncomingMask.size()) {
     assert(IncomingMask.size() % OpMask.size() == 0 &&
            "The smaller number of elements must divide the larger.");
Index: test/CodeGen/X86/pr20540.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/pr20540.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7 | FileCheck %s
+
+declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>)
+
+define void @f_fu_(float* noalias nocapture %RET, <8 x i8> %__mask) {
+  ; CHECK: pshufb
+  %mask8.i.i.i = shufflevector <8 x i8> %__mask, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %m.i.i.i = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %mask8.i.i.i)
+  %RET.i = bitcast float* %RET to i32*
+  store i32 %m.i.i.i, i32* %RET.i
+  ret void
+}
+
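
Not part of the patch, just context for reviewers: below is a standalone C++ sketch of the equal-size mask composition that the X86ISelLowering.cpp hunk fixes. The sentinel's value (-2) and the helper name composeMasks are illustrative, not LLVM's actual definitions; the point is that a sentinel marking a known-zero lane must be passed through rather than used as an index into OpMask.

#include <cassert>
#include <cstdio>
#include <vector>

// Illustrative sentinel: a mask entry meaning "this lane is known to be
// zero" (e.g. a PSHUFB control byte with its high bit set). It is not a
// valid index into another mask.
static const int SM_SentinelZero = -2;

// Compose two equal-size shuffle masks: the result of applying OpMask
// followed by IncomingMask. Mirrors the fixed loop in the patch.
std::vector<int> composeMasks(const std::vector<int> &OpMask,
                              const std::vector<int> &IncomingMask) {
  assert(OpMask.size() == IncomingMask.size() &&
         "sketch covers only the equal-size case");
  std::vector<int> Mask;
  for (int M : IncomingMask) {
    if (M == SM_SentinelZero)
      Mask.push_back(SM_SentinelZero); // zero lanes stay zero lanes
    else
      Mask.push_back(OpMask[M]); // otherwise chain through the inner mask
  }
  return Mask;
}

int main() {
  // IncomingMask reads lane 2, a forced-zero lane, lane 0, lane 1.
  // Without the sentinel check, the -2 entry would be used as an index
  // (OpMask[-2]), the kind of out-of-bounds access the patch guards against.
  std::vector<int> OpMask = {3, 2, 1, 0};
  std::vector<int> IncomingMask = {2, SM_SentinelZero, 0, 1};
  for (int M : composeMasks(OpMask, IncomingMask))
    printf("%d ", M); // prints: 1 -2 3 2
  printf("\n");
  return 0;
}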