Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2315,10 +2315,20 @@
   case Intrinsic::x86_ssse3_pshuf_b_128:
   case Intrinsic::x86_avx2_pshuf_b:
-  case Intrinsic::x86_avx512_pshuf_b_512:
+  case Intrinsic::x86_avx512_pshuf_b_512: {
     if (Value *V = simplifyX86pshufb(*II, *Builder))
       return replaceInstUsesWith(*II, V);
+
+    unsigned VWidth = II->getType()->getVectorNumElements();
+    APInt UndefElts(VWidth, 0);
+    APInt DemandedElts = APInt::getAllOnesValue(VWidth);
+    if (Value *V = SimplifyDemandedVectorElts(II, DemandedElts, UndefElts)) {
+      if (V != II)
+        return replaceInstUsesWith(*II, V);
+      return II;
+    }
     break;
+  }

   case Intrinsic::x86_avx_vpermilvar_ps:
   case Intrinsic::x86_avx_vpermilvar_ps_256:
Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1472,6 +1472,16 @@
     break;
   }

+  case Intrinsic::x86_ssse3_pshuf_b_128:
+  case Intrinsic::x86_avx2_pshuf_b:
+  case Intrinsic::x86_avx512_pshuf_b_512: {
+    Value *Op1 = II->getArgOperand(1);
+    TmpV = SimplifyDemandedVectorElts(Op1, DemandedElts, UndefElts,
+                                      Depth + 1);
+    if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
+    break;
+  }
+
   // SSE4A instructions leave the upper 64-bits of the 128-bit result
   // in an undefined state.
   case Intrinsic::x86_sse4a_extrq:
Index: llvm/trunk/test/Transforms/InstCombine/x86-pshufb.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/x86-pshufb.ll
+++ llvm/trunk/test/Transforms/InstCombine/x86-pshufb.ll
@@ -469,15 +469,12 @@
}

; Demanded elts tests.
-; FIXME: Missed opportunities to pass demanded elts through the pshufb shuffle mask

define <16 x i8> @demanded_elts_insertion(<16 x i8> %InVec, <16 x i8> %BaseMask, i8 %M0, i8 %M15) {
; CHECK-LABEL: @demanded_elts_insertion(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> %BaseMask, i8 %M0, i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 %M15, i32 15
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> [[TMP2]])
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32>
-; CHECK-NEXT:    ret <16 x i8> [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> %BaseMask)
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32>
+; CHECK-NEXT:    ret <16 x i8> [[TMP2]]
;
  %1 = insertelement <16 x i8> %BaseMask, i8 %M0, i32 0
  %2 = insertelement <16 x i8> %1, i8 %M15, i32 15
@@ -489,9 +486,8 @@
define <32 x i8> @demanded_elts_insertion_avx2(<32 x i8> %InVec, <32 x i8> %BaseMask, i8 %M0, i8 %M22) {
; CHECK-LABEL: @demanded_elts_insertion_avx2(
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <32 x i8> %BaseMask, i8 %M0, i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <32 x i8> [[TMP1]], i8 %M22, i32 22
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> [[TMP2]])
-; CHECK-NEXT:    ret <32 x i8> [[TMP3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> [[TMP1]])
+; CHECK-NEXT:    ret <32 x i8> [[TMP2]]
;
  %1 = insertelement <32 x i8> %BaseMask, i8 %M0, i32 0
  %2 = insertelement <32 x i8> %1, i8 %M22, i32 22
@@ -502,11 +498,10 @@
define <64 x i8> @demanded_elts_insertion_avx512(<64 x i8> %InVec, <64 x i8> %BaseMask, i8 %M0, i8 %M30) {
; CHECK-LABEL: @demanded_elts_insertion_avx512(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <64 x i8> %BaseMask, i8 %M0, i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <64 x i8> [[TMP1]], i8 %M30, i32 30
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> [[TMP2]])
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <64 x i8> [[TMP3]], <64 x i8> undef, <64 x i32> zeroinitializer
-; CHECK-NEXT:    ret <64 x i8> [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <64 x i8> undef, i8 %M0, i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <64 x i8> [[TMP2]], <64 x i8> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT:    ret <64 x i8> [[TMP3]]
;
  %1 = insertelement <64 x i8> %BaseMask, i8 %M0, i32 0
  %2 = insertelement <64 x i8> %1, i8 %M30, i32 30