Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp =================================================================== --- llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -993,22 +993,23 @@ break; } + // The element inserted overwrites whatever was there, so the input demanded + // set is simpler than the output set. + unsigned IdxNo = Idx->getZExtValue(); + APInt PreInsertDemandedElts = DemandedElts; + if (IdxNo < VWidth) + PreInsertDemandedElts.clearBit(IdxNo); + TmpV = SimplifyDemandedVectorElts(I->getOperand(0), PreInsertDemandedElts, + UndefElts, Depth + 1); + if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; } + // If this is inserting an element that isn't demanded, remove this // insertelement. - unsigned IdxNo = Idx->getZExtValue(); if (IdxNo >= VWidth || !DemandedElts[IdxNo]) { Worklist.Add(I); return I->getOperand(0); } - // Otherwise, the element inserted overwrites whatever was there, so the - // input demanded set is simpler than the output set. - APInt DemandedElts2 = DemandedElts; - DemandedElts2.clearBit(IdxNo); - TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts2, - UndefElts, Depth + 1); - if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; } - // The inserted element is defined. UndefElts.clearBit(IdxNo); break; Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp =================================================================== --- llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -1165,9 +1165,7 @@ if (Value *V = SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, UndefElts)) { if (V != &SVI) return replaceInstUsesWith(SVI, V); - LHS = SVI.getOperand(0); - RHS = SVI.getOperand(1); - MadeChange = true; + return &SVI; } unsigned LHSWidth = LHS->getType()->getVectorNumElements(); Index: llvm/trunk/test/Transforms/InstCombine/X86/x86-pshufb.ll =================================================================== --- llvm/trunk/test/Transforms/InstCombine/X86/x86-pshufb.ll +++ llvm/trunk/test/Transforms/InstCombine/X86/x86-pshufb.ll @@ -485,9 +485,8 @@ define <32 x i8> @demanded_elts_insertion_avx2(<32 x i8> %InVec, <32 x i8> %BaseMask, i8 %M0, i8 %M22) { ; CHECK-LABEL: @demanded_elts_insertion_avx2( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <32 x i8> %BaseMask, i8 %M0, i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> [[TMP1]]) -; CHECK-NEXT: ret <32 x i8> [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> %BaseMask) +; CHECK-NEXT: ret <32 x i8> [[TMP1]] ; %1 = insertelement <32 x i8> %BaseMask, i8 %M0, i32 0 %2 = insertelement <32 x i8> %1, i8 %M22, i32 22 Index: llvm/trunk/test/Transforms/InstCombine/vec_demanded_elts.ll =================================================================== --- llvm/trunk/test/Transforms/InstCombine/vec_demanded_elts.ll +++ llvm/trunk/test/Transforms/InstCombine/vec_demanded_elts.ll @@ -142,13 +142,11 @@ ret <2 x i64> %result } -; FIXME: The shuffle only demands the 0th (undef) element of 'out123', so everything should fold away. +; The shuffle only demands the 0th (undef) element of 'out123', so everything should fold away. define <4 x float> @inselt_shuf_no_demand(float %a1, float %a2, float %a3) { ; CHECK-LABEL: @inselt_shuf_no_demand( -; CHECK-NEXT: [[OUT1:%.*]] = insertelement <4 x float> undef, float %a1, i32 1 -; CHECK-NEXT: [[OUT12:%.*]] = insertelement <4 x float> [[OUT1]], float %a2, i32 2 -; CHECK-NEXT: ret <4 x float> [[OUT12]] +; CHECK-NEXT: ret <4 x float> undef ; %out1 = insertelement <4 x float> undef, float %a1, i32 1 %out12 = insertelement <4 x float> %out1, float %a2, i32 2 @@ -157,13 +155,11 @@ ret <4 x float> %shuffle } -; FIXME: The shuffle only demands the 0th (undef) element of 'out123', so everything should fold away. +; The shuffle only demands the 0th (undef) element of 'out123', so everything should fold away. define <4 x float> @inselt_shuf_no_demand_commute(float %a1, float %a2, float %a3) { ; CHECK-LABEL: @inselt_shuf_no_demand_commute( -; CHECK-NEXT: [[OUT1:%.*]] = insertelement <4 x float> undef, float %a1, i32 1 -; CHECK-NEXT: [[OUT12:%.*]] = insertelement <4 x float> [[OUT1]], float %a2, i32 2 -; CHECK-NEXT: ret <4 x float> [[OUT12]] +; CHECK-NEXT: ret <4 x float> undef ; %out1 = insertelement <4 x float> undef, float %a1, i32 1 %out12 = insertelement <4 x float> %out1, float %a2, i32 2 @@ -172,15 +168,14 @@ ret <4 x float> %shuffle } -; FIXME: The add uses 'out012' giving it multiple uses after the shuffle is transformed to also +; The add uses 'out012' giving it multiple uses after the shuffle is transformed to also ; use 'out012'. The analysis should be able to see past that. define <4 x i32> @inselt_shuf_no_demand_multiuse(i32 %a0, i32 %a1, <4 x i32> %b) { ; CHECK-LABEL: @inselt_shuf_no_demand_multiuse( ; CHECK-NEXT: [[OUT0:%.*]] = insertelement <4 x i32> undef, i32 %a0, i32 0 ; CHECK-NEXT: [[OUT01:%.*]] = insertelement <4 x i32> [[OUT0]], i32 %a1, i32 1 -; CHECK-NEXT: [[OUT012:%.*]] = insertelement <4 x i32> [[OUT01]], i32 %a0, i32 2 -; CHECK-NEXT: [[FOO:%.*]] = add <4 x i32> [[OUT012]], %b +; CHECK-NEXT: [[FOO:%.*]] = add <4 x i32> [[OUT01]], %b ; CHECK-NEXT: ret <4 x i32> [[FOO]] ; %out0 = insertelement <4 x i32> undef, i32 %a0, i32 0