Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7131,7 +7131,7 @@ const unsigned VF = VL.size(); InstructionsState S = getSameOpcode(VL); if (S.getOpcode()) { - if (TreeEntry *E = getTreeEntry(S.OpValue)) + if (TreeEntry *E = getTreeEntry(S.OpValue)) { if (E->isSame(VL)) { Value *V = vectorizeTree(E); if (VF != cast(V->getType())->getNumElements()) { @@ -7181,7 +7181,40 @@ } } return V; + } else { + // If the values are not the same but a re-ordering of the existing + // node, create a shuffle for the new value. + auto IsReordering = [](TreeEntry *E, ArrayRef VL, + SmallVector &Reordering) { + for (Value *V : VL) { + auto It = find(E->Scalars, V); + if (It == E->Scalars.end()) + return false; + Reordering.push_back(E->findLaneForValue(V)); + } + return true; + }; + SmallVector Reordering; + if (IsReordering(E, VL, Reordering)) { + Value *V = vectorizeTree(E); + Value *V2 = PoisonValue::get(V->getType()); + // If the vectorized value is itself a shuffle, look through it to the + // underlying values. + if (auto *SV = dyn_cast(V)) { + for (int &E : Reordering) + E = E < 0 ? UndefMaskElem : SV->getMaskValue(E); + V = SV->getOperand(0); + V2 = SV->getOperand(1); + } + V = Builder.CreateShuffleVector(V, V2, Reordering, "reorder"); + if (auto *I = dyn_cast(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + return V; + } } + } } // Can't vectorize this, so simply build a new vector with each lane Index: llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll @@ -1312,30 +1312,30 @@ ; CHECK-NEXT: [[TMP58:%.*]] = shl nsw <16 x i32> [[TMP57]], ; CHECK-NEXT: [[TMP59:%.*]] = add nsw <16 x i32> [[TMP58]], [[TMP56]] ; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> undef, <16 x i32> -; CHECK-NEXT: [[TMP62:%.*]] = add nsw <16 x i32> [[TMP60]], [[TMP61]] -; CHECK-NEXT: [[TMP63:%.*]] = sub nsw <16 x i32> [[TMP60]], [[TMP61]] -; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> -; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> -; CHECK-NEXT: [[TMP66:%.*]] = add nsw <16 x i32> [[TMP64]], [[TMP65]] -; CHECK-NEXT: [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP64]], [[TMP65]] -; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> -; CHECK-NEXT: [[TMP69:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> -; CHECK-NEXT: [[TMP70:%.*]] = add nsw <16 x i32> [[TMP68]], [[TMP69]] -; CHECK-NEXT: [[TMP71:%.*]] = sub nsw <16 x i32> [[TMP68]], [[TMP69]] -; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP71]], <16 x i32> -; CHECK-NEXT: [[TMP73:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP71]], <16 x i32> -; CHECK-NEXT: [[TMP74:%.*]] = add nsw <16 x i32> [[TMP72]], [[TMP73]] -; CHECK-NEXT: [[TMP75:%.*]] = sub nsw <16 x i32> [[TMP72]], [[TMP73]] -; CHECK-NEXT: [[TMP76:%.*]] = shufflevector <16 x i32> [[TMP74]], <16 x i32> [[TMP75]], <16 x i32> -; CHECK-NEXT: [[TMP77:%.*]] = lshr <16 x i32> [[TMP76]], -; CHECK-NEXT: [[TMP78:%.*]] = and <16 x i32> [[TMP77]], -; CHECK-NEXT: [[TMP79:%.*]] = mul nuw <16 x i32> [[TMP78]], -; CHECK-NEXT: [[TMP80:%.*]] = add <16 x i32> [[TMP79]], [[TMP76]] -; CHECK-NEXT: [[TMP81:%.*]] = xor <16 x i32> [[TMP80]], [[TMP79]] -; CHECK-NEXT: [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP81]]) -; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP82]], 65535 -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP82]], 16 +; CHECK-NEXT: [[REORDER:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP61:%.*]] = add nsw <16 x i32> [[TMP60]], [[REORDER]] +; CHECK-NEXT: [[TMP62:%.*]] = sub nsw <16 x i32> [[TMP60]], [[REORDER]] +; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> +; CHECK-NEXT: [[REORDER1:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> +; CHECK-NEXT: [[TMP64:%.*]] = add nsw <16 x i32> [[TMP63]], [[REORDER1]] +; CHECK-NEXT: [[TMP65:%.*]] = sub nsw <16 x i32> [[TMP63]], [[REORDER1]] +; CHECK-NEXT: [[TMP66:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32> +; CHECK-NEXT: [[REORDER2:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32> +; CHECK-NEXT: [[TMP67:%.*]] = add nsw <16 x i32> [[TMP66]], [[REORDER2]] +; CHECK-NEXT: [[TMP68:%.*]] = sub nsw <16 x i32> [[TMP66]], [[REORDER2]] +; CHECK-NEXT: [[TMP69:%.*]] = shufflevector <16 x i32> [[TMP67]], <16 x i32> [[TMP68]], <16 x i32> +; CHECK-NEXT: [[REORDER3:%.*]] = shufflevector <16 x i32> [[TMP67]], <16 x i32> [[TMP68]], <16 x i32> +; CHECK-NEXT: [[TMP70:%.*]] = add nsw <16 x i32> [[TMP69]], [[REORDER3]] +; CHECK-NEXT: [[TMP71:%.*]] = sub nsw <16 x i32> [[TMP69]], [[REORDER3]] +; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP71]], <16 x i32> +; CHECK-NEXT: [[TMP73:%.*]] = lshr <16 x i32> [[TMP72]], +; CHECK-NEXT: [[TMP74:%.*]] = and <16 x i32> [[TMP73]], +; CHECK-NEXT: [[TMP75:%.*]] = mul nuw <16 x i32> [[TMP74]], +; CHECK-NEXT: [[TMP76:%.*]] = add <16 x i32> [[TMP75]], [[TMP72]] +; CHECK-NEXT: [[TMP77:%.*]] = xor <16 x i32> [[TMP76]], [[TMP75]] +; CHECK-NEXT: [[TMP78:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP77]]) +; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP78]], 65535 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP78]], 16 ; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]] ; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1 ; CHECK-NEXT: ret i32 [[SHR120]] Index: llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll @@ -139,11 +139,11 @@ ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0 ; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> +; CHECK-NEXT: [[REORDER:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = sub <2 x i32> [[TMP8]], [[REORDER]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP2_31]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 Index: llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -139,11 +139,11 @@ ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0 ; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> +; CHECK-NEXT: [[REORDER:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = sub <2 x i32> [[TMP8]], [[REORDER]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP2_31]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 Index: llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -79,7 +79,7 @@ ; FORCE_REDUCTION-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP0]], i32 15 ; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]] ; FORCE_REDUCTION: loop: -; FORCE_REDUCTION-NEXT: [[TMP25:%.*]] = phi <2 x i32> [ [[TMP36:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; FORCE_REDUCTION-NEXT: [[TMP25:%.*]] = phi <2 x i32> [ [[TMP33:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] ; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP25]], <2 x i32> poison, <8 x i32> ; FORCE_REDUCTION-NEXT: [[TMP26:%.*]] = add <8 x i32> [[SHUFFLE]], ; FORCE_REDUCTION-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP24]]) @@ -91,12 +91,10 @@ ; FORCE_REDUCTION-NEXT: [[OP_RDX16:%.*]] = and i32 [[OP_RDX15]], [[TMP27]] ; FORCE_REDUCTION-NEXT: [[OP_RDX17:%.*]] = and i32 [[OP_RDX16]], [[TMP28]] ; FORCE_REDUCTION-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> , i32 [[OP_RDX17]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP31:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 -; FORCE_REDUCTION-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> poison, i32 [[TMP31]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP33:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP31]], i32 1 -; FORCE_REDUCTION-NEXT: [[TMP34:%.*]] = and <2 x i32> [[TMP30]], [[TMP33]] -; FORCE_REDUCTION-NEXT: [[TMP35:%.*]] = add <2 x i32> [[TMP30]], [[TMP33]] -; FORCE_REDUCTION-NEXT: [[TMP36]] = shufflevector <2 x i32> [[TMP34]], <2 x i32> [[TMP35]], <2 x i32> +; FORCE_REDUCTION-NEXT: [[REORDER:%.*]] = shufflevector <2 x i32> [[TMP25]], <2 x i32> poison, <2 x i32> +; FORCE_REDUCTION-NEXT: [[TMP31:%.*]] = and <2 x i32> [[TMP30]], [[REORDER]] +; FORCE_REDUCTION-NEXT: [[TMP32:%.*]] = add <2 x i32> [[TMP30]], [[REORDER]] +; FORCE_REDUCTION-NEXT: [[TMP33]] = shufflevector <2 x i32> [[TMP31]], <2 x i32> [[TMP32]], <2 x i32> ; FORCE_REDUCTION-NEXT: br label [[LOOP]] ; entry: Index: llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll +++ llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll @@ -82,23 +82,19 @@ ; CHECK-NEXT: ret float [[ADD]] ; ; THRESH1-LABEL: @f_used_twice_in_tree( -; THRESH1-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1 -; THRESH1-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; THRESH1-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 -; THRESH1-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], [[X]] -; THRESH1-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 -; THRESH1-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 -; THRESH1-NEXT: [[ADD:%.*]] = fadd float [[TMP5]], [[TMP6]] +; THRESH1-NEXT: [[REORDER:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <2 x i32> +; THRESH1-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[REORDER]], [[X]] +; THRESH1-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; THRESH1-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; THRESH1-NEXT: [[ADD:%.*]] = fadd float [[TMP2]], [[TMP3]] ; THRESH1-NEXT: ret float [[ADD]] ; ; THRESH2-LABEL: @f_used_twice_in_tree( -; THRESH2-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1 -; THRESH2-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; THRESH2-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 -; THRESH2-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], [[X]] -; THRESH2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 -; THRESH2-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 -; THRESH2-NEXT: [[ADD:%.*]] = fadd float [[TMP5]], [[TMP6]] +; THRESH2-NEXT: [[REORDER:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <2 x i32> +; THRESH2-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[REORDER]], [[X]] +; THRESH2-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; THRESH2-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; THRESH2-NEXT: [[ADD:%.*]] = fadd float [[TMP2]], [[TMP3]] ; THRESH2-NEXT: ret float [[ADD]] ; %x0 = extractelement <2 x float> %x, i32 0 Index: llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll +++ llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll @@ -58,18 +58,11 @@ ; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>* ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP7]], i32 2 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP2]], [[TMP10]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* [[TMP12]], align 4 +; CHECK-NEXT: [[REORDER:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[REORDER]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* [[TMP4]], align 4 ; CHECK-NEXT: ret i32 undef ; %in.addr = getelementptr inbounds i32, i32* %in, i64 0 Index: llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll +++ llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -722,15 +722,12 @@ ; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 ; SSE-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> ; SSE-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[SHUFFLE]] -; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 1 -; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0 -; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 0 -; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP7]], i32 1 -; SSE-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP1]], [[TMP8]] -; SSE-NEXT: [[TMP10:%.*]] = fadd <2 x double> [[TMP4]], [[TMP9]] -; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP10]], i32 0 -; SSE-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP10]], i32 1 -; SSE-NEXT: [[ADD3:%.*]] = fadd double [[TMP11]], [[TMP12]] +; SSE-NEXT: [[REORDER:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[REORDER]] +; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]] +; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 +; SSE-NEXT: [[ADD3:%.*]] = fadd double [[TMP7]], [[TMP8]] ; SSE-NEXT: ret double [[ADD3]] ; ; AVX-LABEL: @splat_loads( @@ -791,17 +788,14 @@ ; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 ; SSE-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> ; SSE-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[SHUFFLE]] -; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 1 -; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0 -; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 0 -; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP7]], i32 1 -; SSE-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP1]], [[TMP8]] -; SSE-NEXT: [[TMP10:%.*]] = fadd <2 x double> [[TMP4]], [[TMP9]] -; SSE-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP5]], i32 1 -; SSE-NEXT: [[TMP12:%.*]] = fsub <2 x double> [[TMP10]], [[TMP11]] -; SSE-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[TMP12]], i32 0 -; SSE-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP12]], i32 1 -; SSE-NEXT: [[RES:%.*]] = fadd double [[TMP13]], [[TMP14]] +; SSE-NEXT: [[REORDER:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[REORDER]] +; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]] +; SSE-NEXT: [[REORDER1:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> zeroinitializer +; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x double> [[TMP6]], [[REORDER1]] +; SSE-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 +; SSE-NEXT: [[RES:%.*]] = fadd double [[TMP8]], [[TMP9]] ; SSE-NEXT: ret double [[RES]] ; ; AVX-LABEL: @splat_loads_with_internal_uses( Index: llvm/test/Transforms/SLPVectorizer/X86/phi.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/phi.ll +++ llvm/test/Transforms/SLPVectorizer/X86/phi.ll @@ -244,27 +244,20 @@ ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x float> [ , [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP5]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP7]], i32 3 -; CHECK-NEXT: [[TMP9]] = fmul <4 x float> [[TMP8]], +; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x float> [ , [[ENTRY]] ], [ [[TMP1:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[REORDER:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP1]] = fmul <4 x float> [[REORDER]], ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], 128 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP9]], i32 1 -; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP9]], i32 2 -; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP12]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP9]], i32 3 -; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP13]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP5]] ; CHECK-NEXT: ret float [[ADD31]] ; entry: Index: llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll +++ llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll @@ -12,22 +12,15 @@ ; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> zeroinitializer, [[SHUFFLE]] ; CHECK-NEXT: [[TMP7:%.*]] = shl nsw <4 x i32> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP11]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP13]], i32 2 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP15]], i32 3 -; CHECK-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[TMP8]], [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = sub nsw <4 x i32> [[TMP8]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> [[TMP18]], <4 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> zeroinitializer, [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP19]] -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i32> [[TMP20]], <4 x i32> [[TMP21]], <4 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP22]], <4 x i32>* [[TMP23]], align 16 +; CHECK-NEXT: [[REORDER:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[TMP8]], [[REORDER]] +; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <4 x i32> [[TMP8]], [[REORDER]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> zeroinitializer, [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 16 ; CHECK-NEXT: ret void ; %1 = getelementptr inbounds i8, i8* undef, i64 4 Index: llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll +++ llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll @@ -7,21 +7,18 @@ ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[ISEC:%.*]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX10]] to <2 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP4]], i32 1 +; CHECK-NEXT: [[REORDER:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> ; CHECK-NEXT: br i1 false, label [[BLOCK1:%.*]], label [[BLOCK3:%.*]] ; CHECK: block1: ; CHECK-NEXT: br i1 false, label [[BLOCK2:%.*]], label [[BLOCK3]] ; CHECK: block2: ; CHECK-NEXT: br label [[BLOCK3]] ; CHECK: block3: -; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ [[TMP1]], [[BLOCK1]] ], [ [[TMP1]], [[BLOCK2]] ], [ [[TMP5]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = mul i32 [[TMP8]], [[TMP7]] -; CHECK-NEXT: ret i32 [[TMP9]] +; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x i32> [ [[TMP1]], [[BLOCK1]] ], [ [[TMP1]], [[BLOCK2]] ], [ [[REORDER]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], [[TMP3]] +; CHECK-NEXT: ret i32 [[TMP5]] ; entry: %arrayidx10 = getelementptr inbounds i32, i32* %isec, i32 0