Index: llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp =================================================================== --- llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -1161,6 +1161,17 @@ return I->getOperand(0); } + // If we only demand the element that is being inserted and that element + // was extracted from the same index in another vector with the same type, + // replace this insert with that other vector. + Value *Vec; + if (PreInsertDemandedElts == 0 && + match(I->getOperand(1), + m_ExtractElt(m_Value(Vec), m_SpecificInt(IdxNo))) && + Vec->getType() == I->getType()) { + return Vec; + } + // The inserted element is defined. UndefElts.clearBit(IdxNo); break; Index: llvm/test/Transforms/InstCombine/vec_demanded_elts.ll =================================================================== --- llvm/test/Transforms/InstCombine/vec_demanded_elts.ll +++ llvm/test/Transforms/InstCombine/vec_demanded_elts.ll @@ -749,9 +749,7 @@ define <4 x float> @ins_of_ext(<4 x float> %x, float %y) { ; CHECK-LABEL: @ins_of_ext( -; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x float> [[X:%.*]], i32 0 -; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> undef, float [[E0]], i32 0 -; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[Y:%.*]], i32 1 +; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[X:%.*]], float [[Y:%.*]], i32 1 ; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[Y]], i32 2 ; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[Y]], i32 3 ; CHECK-NEXT: ret <4 x float> [[I3]] @@ -766,11 +764,7 @@ define <4 x float> @ins_of_ext_twice(<4 x float> %x, float %y) { ; CHECK-LABEL: @ins_of_ext_twice( -; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x float> [[X:%.*]], i32 0 -; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> undef, float [[E0]], i32 0 -; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x float> [[X]], i32 1 -; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[E1]], i32 1 -; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[Y:%.*]], i32 2 +; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[X:%.*]], float [[Y:%.*]], i32 2 ; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[Y]], i32 3 ; CHECK-NEXT: ret <4 x float> [[I3]] ; @@ -783,6 +777,9 @@ ret <4 x float> %i3 } +; Negative test - element 3 of the result must be undef to be poison safe. +; TODO: Could convert insert/extract to identity shuffle with undef mask elements. + define <4 x float> @ins_of_ext_wrong_demand(<4 x float> %x, float %y) { ; CHECK-LABEL: @ins_of_ext_wrong_demand( ; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x float> [[X:%.*]], i32 0 @@ -798,6 +795,9 @@ ret <4 x float> %i2 } +; Negative test - can't replace i0 with x. +; TODO: Could convert insert/extract to identity shuffle with undef mask elements. + define <4 x float> @ins_of_ext_wrong_type(<5 x float> %x, float %y) { ; CHECK-LABEL: @ins_of_ext_wrong_type( ; CHECK-NEXT: [[E0:%.*]] = extractelement <5 x float> [[X:%.*]], i32 0 Index: llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll @@ -51,13 +51,13 @@ ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[Z:%.*]], i32 3 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP21:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP20:%.*]], i32 1 ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ] ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] ; CHECK: for.body: -; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP21]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP20]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 ; CHECK-NEXT: [[T4:%.*]] = shl nsw i32 [[TMP5]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> undef, i32 [[T4]], i32 0 @@ -83,12 +83,11 @@ ; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP17]] ; CHECK-NEXT: [[T12:%.*]] = load i32, i32* [[ARRAYIDX15]], align 4 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP18]], i32 [[ADD11]], i32 1 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> , i32 [[T12]], i32 1 -; CHECK-NEXT: [[TMP21]] = add nsw <2 x i32> [[TMP19]], [[TMP20]] -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP21]], i32 0 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP22]], [[N]] +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[ADD11]], i32 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> , i32 [[T12]], i32 1 +; CHECK-NEXT: [[TMP20]] = add nsw <2 x i32> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP20]], i32 0 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP21]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]] ; entry: @@ -157,13 +156,13 @@ ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[Z:%.*]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP18:%.*]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP17:%.*]], i32 1 ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ] ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] ; CHECK: for.body: -; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP18]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP17]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0 ; CHECK-NEXT: [[T4:%.*]] = shl nsw i32 [[TMP4]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[T4]] to i64 @@ -188,12 +187,11 @@ ; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP14]] ; CHECK-NEXT: [[T12:%.*]] = load i32, i32* [[ARRAYIDX15]], align 4 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> undef, i32 [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[ADD11]], i32 1 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> , i32 [[T12]], i32 1 -; CHECK-NEXT: [[TMP18]] = add nsw <2 x i32> [[TMP16]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP18]], i32 0 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP19]], [[N]] +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[ADD11]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> , i32 [[T12]], i32 1 +; CHECK-NEXT: [[TMP17]] = add nsw <2 x i32> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP17]], i32 0 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP18]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]] ; entry: Index: llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll +++ llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll @@ -183,13 +183,11 @@ ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[P]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; CHECK: ext: ; CHECK-NEXT: ret void