Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -10068,6 +10068,15 @@
   // type.
   MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
 
+  // x86 allows load folding with blendvb from the 2nd source operand. But
+  // we are still using LLVM select here (see comment below), so that's V1.
+  // If V2 can be load-folded and V1 cannot be load-folded, then commute to
+  // allow that load-folding possibility.
+  if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
+    ShuffleVectorSDNode::commuteMask(Mask);
+    std::swap(V1, V2);
+  }
+
   // Compute the VSELECT mask. Note that VSELECT is really confusing in the
   // mix of LLVM's code generator and the x86 backend. We tell the code
   // generator that boolean values in the elements of an x86 vector register
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -601,17 +601,15 @@
 ; SSE41-LABEL: load_fold_pblendvb:
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa (%rdi), %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
-; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; SSE41-NEXT: pblendvb %xmm0, (%rdi), %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1OR2-LABEL: load_fold_pblendvb:
 ; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa (%rdi), %xmm1
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
-; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; AVX1OR2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
 ; AVX1OR2-NEXT: retq
 ;
 ; AVX512VL-LABEL: load_fold_pblendvb:
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1656,9 +1656,8 @@
 ;
 ; AVX2-LABEL: load_fold_pblendvb:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; AVX2-NEXT: vpblendvb %ymm1, (%rdi), %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512VL-LABEL: load_fold_pblendvb: