Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6796,8 +6796,19 @@ TTI::CastContextHint::None, CostKind); } if (E->ReuseShuffleIndices.empty()) { - VecCost += - TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy); + TargetTransformInfo::ShuffleKind ShuffleKind = + TargetTransformInfo::SK_Select; + const TreeEntry *TE = nullptr; + for (unsigned I = 0, N = E->getNumOperands(); I < N; ++I) { + const TreeEntry *Op = getVectorizedOperand(E, I); + if (Op && !TE) + TE = Op; + if (Op && TE && (TE != Op)) { + ShuffleKind = TargetTransformInfo::SK_PermuteTwoSrc; + break; + } + } + VecCost += TTI->getShuffleCost(ShuffleKind, FinalVecTy); } else { SmallVector Mask; buildShuffleEntryMask( Index: llvm/test/Transforms/SLPVectorizer/AArch64/depend-node-shuffle.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/SLPVectorizer/AArch64/depend-node-shuffle.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -slp-vectorizer -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-linux-gnu" + +define void @foo([3 x { float, float }]* %a, [3 x { float, float }]* %c, float* %arrayidx16.imagp, float %arrayidx12.real, float %arrayidx12.imag) { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX12_REALP:%.*]] = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* [[A:%.*]], i64 0, i64 0, i32 0 +; CHECK-NEXT: [[ARRAYIDX12_IMAGP:%.*]] = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* [[A]], i64 0, i64 0, i32 1 +; CHECK-NEXT: [[ARRAYIDX5_REALP:%.*]] = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* [[C:%.*]], i64 0, i64 0, i32 0 +; CHECK-NEXT: [[ARRAYIDX5_IMAGP:%.*]] = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* [[C]], i64 0, i64 0, i32 1 +; CHECK-NEXT: [[ARRAYIDX12_REAL1:%.*]] = load float, float* [[ARRAYIDX12_REALP]], align 4 +; CHECK-NEXT: [[ARRAYIDX16_IMAG:%.*]] = load float, float* [[ARRAYIDX16_IMAGP:%.*]], align 4 +; CHECK-NEXT: [[MUL_AD:%.*]] = fmul fast float [[ARRAYIDX12_IMAG:%.*]], [[ARRAYIDX12_REAL:%.*]] +; CHECK-NEXT: [[ARRAYIDX12_IMAG3:%.*]] = load float, float* [[ARRAYIDX12_IMAGP]], align 4 +; CHECK-NEXT: [[MUL_BC:%.*]] = fmul fast float [[ARRAYIDX16_IMAG]], [[ARRAYIDX12_IMAG3]] +; CHECK-NEXT: [[MUL_I:%.*]] = fadd fast float [[MUL_BC]], [[MUL_AD]] +; CHECK-NEXT: [[MUL_AC:%.*]] = fmul fast float [[ARRAYIDX16_IMAG]], [[ARRAYIDX12_REAL1]] +; CHECK-NEXT: [[TMP0:%.*]] = fmul fast float [[ARRAYIDX12_REAL]], [[ARRAYIDX12_REAL]] +; CHECK-NEXT: [[MUL_R:%.*]] = fsub fast float [[MUL_AC]], [[TMP0]] +; CHECK-NEXT: store float [[MUL_R]], float* [[ARRAYIDX5_REALP]], align 4 +; CHECK-NEXT: store float [[MUL_I]], float* [[ARRAYIDX5_IMAGP]], align 4 +; CHECK-NEXT: ret void +; +entry: + %arrayidx12.realp = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 0, i64 0, i32 0 + %arrayidx12.imagp = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 0, i64 0, i32 1 + %arrayidx5.realp = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %c, i64 0, i64 0, i32 0 + %arrayidx5.imagp = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %c, i64 0, i64 0, i32 1 + %arrayidx12.real1 = load float, float* %arrayidx12.realp, align 4 + %arrayidx16.imag = load float, float* %arrayidx16.imagp, align 4 + %mul_ad = fmul fast float %arrayidx12.imag, %arrayidx12.real + %arrayidx12.imag3 = load float, float* %arrayidx12.imagp, align 4 + %mul_bc = fmul fast float %arrayidx16.imag, %arrayidx12.imag3 + %mul_i = fadd fast float %mul_bc, %mul_ad + %mul_ac = fmul fast float %arrayidx16.imag, %arrayidx12.real1 + %0 = fmul fast float %arrayidx12.real, %arrayidx12.real + %mul_r = fsub fast float %mul_ac, %0 + store float %mul_r, float* %arrayidx5.realp, align 4 + store float %mul_i, float* %arrayidx5.imagp, align 4 + ret void +} Index: llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll @@ -7,21 +7,20 @@ ; CHECK-LABEL: @slp_not_profitable_with_fast_fmf( ; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 ; CHECK-NEXT: [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[B_1]], [[A_0]] ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[B_0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x float> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[A_0]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x float> [[TMP7]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x float> [[TMP7]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP10]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; CHECK-NEXT: store float [[TMP11]], ptr [[B]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 +; CHECK-NEXT: [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[B_2]], [[B_0]] +; CHECK-NEXT: [[SUB:%.*]] = fsub fast float [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[B_0]], [[B_1]] +; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[B_2]], [[A_0]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL_3]], [[MUL_2]] +; CHECK-NEXT: store float [[SUB]], ptr [[A]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; CHECK-NEXT: store float [[ADD]], ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: store float [[B_2]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1 @@ -47,21 +46,20 @@ ; CHECK-LABEL: @slp_not_profitable_with_reassoc_fmf( ; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 ; CHECK-NEXT: [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul reassoc float [[B_1]], [[A_0]] ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[B_0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[A_0]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fmul reassoc <2 x float> [[TMP1]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fsub reassoc <2 x float> [[TMP7]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd reassoc <2 x float> [[TMP7]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP10]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; CHECK-NEXT: store float [[TMP11]], ptr [[B]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 +; CHECK-NEXT: [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul float [[B_2]], [[B_0]] +; CHECK-NEXT: [[SUB:%.*]] = fsub reassoc float [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul float [[B_0]], [[B_1]] +; CHECK-NEXT: [[MUL_3:%.*]] = fmul reassoc float [[B_2]], [[A_0]] +; CHECK-NEXT: [[ADD:%.*]] = fadd reassoc float [[MUL_3]], [[MUL_2]] +; CHECK-NEXT: store float [[SUB]], ptr [[A]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; CHECK-NEXT: store float [[ADD]], ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: store float [[B_2]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1 @@ -88,21 +86,20 @@ ; CHECK-LABEL: @slp_profitable_missing_fmf_on_fadd_fsub( ; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 ; CHECK-NEXT: [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[B_1]], [[A_0]] ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[B_0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x float> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[A_0]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fsub <2 x float> [[TMP7]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x float> [[TMP7]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP10]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; CHECK-NEXT: store float [[TMP11]], ptr [[B]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 +; CHECK-NEXT: [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[B_2]], [[B_0]] +; CHECK-NEXT: [[SUB:%.*]] = fsub float [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[B_0]], [[B_1]] +; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[B_2]], [[A_0]] +; CHECK-NEXT: [[ADD:%.*]] = fadd float [[MUL_3]], [[MUL_2]] +; CHECK-NEXT: store float [[SUB]], ptr [[A]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; CHECK-NEXT: store float [[ADD]], ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: store float [[B_2]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1 @@ -129,21 +126,20 @@ ; CHECK-LABEL: @slp_profitable_missing_fmf_on_fmul_fadd_fsub( ; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 ; CHECK-NEXT: [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul float [[B_1]], [[A_0]] ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[B_0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[A_0]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP1]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fsub <2 x float> [[TMP7]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x float> [[TMP7]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP10]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; CHECK-NEXT: store float [[TMP11]], ptr [[B]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 +; CHECK-NEXT: [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul float [[B_2]], [[B_0]] +; CHECK-NEXT: [[SUB:%.*]] = fsub float [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul float [[B_0]], [[B_1]] +; CHECK-NEXT: [[MUL_3:%.*]] = fmul float [[B_2]], [[A_0]] +; CHECK-NEXT: [[ADD:%.*]] = fadd float [[MUL_3]], [[MUL_2]] +; CHECK-NEXT: store float [[SUB]], ptr [[A]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; CHECK-NEXT: store float [[ADD]], ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: store float [[B_2]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1 @@ -170,21 +166,20 @@ ; CHECK-LABEL: @slp_profitable_missing_fmf_nnans_only( ; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 ; CHECK-NEXT: [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul nnan float [[B_1]], [[A_0]] ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[B_0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul nnan <2 x float> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[A_0]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fmul nnan <2 x float> [[TMP1]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fsub nnan <2 x float> [[TMP7]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd nnan <2 x float> [[TMP7]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP10]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; CHECK-NEXT: store float [[TMP11]], ptr [[B]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 +; CHECK-NEXT: [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul nnan float [[B_2]], [[B_0]] +; CHECK-NEXT: [[SUB:%.*]] = fsub nnan float [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul nnan float [[B_0]], [[B_1]] +; CHECK-NEXT: [[MUL_3:%.*]] = fmul nnan float [[B_2]], [[A_0]] +; CHECK-NEXT: [[ADD:%.*]] = fadd nnan float [[MUL_3]], [[MUL_2]] +; CHECK-NEXT: store float [[SUB]], ptr [[A]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; CHECK-NEXT: store float [[ADD]], ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: store float [[B_2]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1 @@ -264,19 +259,19 @@ define void @slp_profitable(ptr %A, ptr %B, float %0) { ; CHECK-LABEL: @slp_profitable( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1 ; CHECK-NEXT: [[SUB_I1096:%.*]] = fsub fast float 1.000000e+00, [[TMP0:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[SUB_I1096]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[SUB_I1096]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <2 x float> [[SHUFFLE]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x float> [[SHUFFLE]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP10]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[A]], align 4 +; CHECK-NEXT: [[MUL_I1100:%.*]] = fmul fast float [[TMP1]], [[SUB_I1096]] +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: [[MUL7_I1101:%.*]] = fmul fast float [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[ADD_I1102:%.*]] = fadd fast float [[MUL7_I1101]], [[MUL_I1100]] +; CHECK-NEXT: [[MUL14_I:%.*]] = fmul fast float [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP2]], [[SUB_I1096]] +; CHECK-NEXT: [[ADD15_I:%.*]] = fsub fast float [[MUL14_I]], [[TMP3]] +; CHECK-NEXT: store float [[ADD_I1102]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 1 +; CHECK-NEXT: store float [[ADD15_I]], ptr [[GEP_B_1]], align 4 ; CHECK-NEXT: ret void ; entry: Index: llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll @@ -6,12 +6,19 @@ define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) { ; CHECK-LABEL: @build_vec_v2i64( -; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP3]] -; CHECK-NEXT: ret <2 x i64> [[TMP5]] +; CHECK-NEXT: [[V0_0:%.*]] = extractelement <2 x i64> [[V0:%.*]], i64 0 +; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i64> [[V0]], i64 1 +; CHECK-NEXT: [[V1_0:%.*]] = extractelement <2 x i64> [[V1:%.*]], i64 0 +; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i64> [[V1]], i64 1 +; CHECK-NEXT: [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]] +; CHECK-NEXT: [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]] +; CHECK-NEXT: [[TMP3_0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2_0]], i64 0 +; CHECK-NEXT: [[TMP3_1:%.*]] = insertelement <2 x i64> [[TMP3_0]], i64 [[TMP2_1]], i64 1 +; CHECK-NEXT: ret <2 x i64> [[TMP3_1]] ; %v0.0 = extractelement <2 x i64> %v0, i32 0 %v0.1 = extractelement <2 x i64> %v0, i32 1 @@ -30,17 +37,21 @@ define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) { ; CHECK-LABEL: @store_chain_v2i64( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[B:%.*]] to <2 x i64>* -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* [[TMP10]], align 8 +; CHECK-NEXT: [[A_1:%.*]] = getelementptr i64, i64* [[A:%.*]], i64 1 +; CHECK-NEXT: [[B_1:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 1 +; CHECK-NEXT: [[C_1:%.*]] = getelementptr i64, i64* [[C:%.*]], i64 1 +; CHECK-NEXT: [[V0_0:%.*]] = load i64, i64* [[A]], align 8 +; CHECK-NEXT: [[V0_1:%.*]] = load i64, i64* [[A_1]], align 8 +; CHECK-NEXT: [[V1_0:%.*]] = load i64, i64* [[B]], align 8 +; CHECK-NEXT: [[V1_1:%.*]] = load i64, i64* [[B_1]], align 8 +; CHECK-NEXT: [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]] +; CHECK-NEXT: [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]] +; CHECK-NEXT: store i64 [[TMP2_0]], i64* [[C]], align 8 +; CHECK-NEXT: store i64 [[TMP2_1]], i64* [[C_1]], align 8 ; CHECK-NEXT: ret void ; %a.0 = getelementptr i64, i64* %a, i64 0 @@ -66,12 +77,30 @@ define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]] -; CHECK-NEXT: ret <4 x i32> [[TMP5]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = sub <2 x i32> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i32> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = sub <2 x i32> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i32> [[TMP17]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = sub <2 x i32> [[TMP17]], [[TMP18]] +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x i32> [[TMP19]], <2 x i32> [[TMP20]], <2 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = add <2 x i32> [[TMP21]], [[TMP16]] +; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> [[TMP22]], <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP3_31]] ; %v0.0 = extractelement <4 x i32> %v0, i32 0 %v0.1 = extractelement <4 x i32> %v0, i32 1 Index: llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -6,12 +6,19 @@ define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) { ; CHECK-LABEL: @build_vec_v2i64( -; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP3]] -; CHECK-NEXT: ret <2 x i64> [[TMP5]] +; CHECK-NEXT: [[V0_0:%.*]] = extractelement <2 x i64> [[V0:%.*]], i64 0 +; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i64> [[V0]], i64 1 +; CHECK-NEXT: [[V1_0:%.*]] = extractelement <2 x i64> [[V1:%.*]], i64 0 +; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i64> [[V1]], i64 1 +; CHECK-NEXT: [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]] +; CHECK-NEXT: [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]] +; CHECK-NEXT: [[TMP3_0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2_0]], i64 0 +; CHECK-NEXT: [[TMP3_1:%.*]] = insertelement <2 x i64> [[TMP3_0]], i64 [[TMP2_1]], i64 1 +; CHECK-NEXT: ret <2 x i64> [[TMP3_1]] ; %v0.0 = extractelement <2 x i64> %v0, i32 0 %v0.1 = extractelement <2 x i64> %v0, i32 1 @@ -30,17 +37,21 @@ define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) { ; CHECK-LABEL: @store_chain_v2i64( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[B:%.*]] to <2 x i64>* -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* [[TMP10]], align 8 +; CHECK-NEXT: [[A_1:%.*]] = getelementptr i64, i64* [[A:%.*]], i64 1 +; CHECK-NEXT: [[B_1:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 1 +; CHECK-NEXT: [[C_1:%.*]] = getelementptr i64, i64* [[C:%.*]], i64 1 +; CHECK-NEXT: [[V0_0:%.*]] = load i64, i64* [[A]], align 8 +; CHECK-NEXT: [[V0_1:%.*]] = load i64, i64* [[A_1]], align 8 +; CHECK-NEXT: [[V1_0:%.*]] = load i64, i64* [[B]], align 8 +; CHECK-NEXT: [[V1_1:%.*]] = load i64, i64* [[B_1]], align 8 +; CHECK-NEXT: [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]] +; CHECK-NEXT: [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]] +; CHECK-NEXT: store i64 [[TMP2_0]], i64* [[C]], align 8 +; CHECK-NEXT: store i64 [[TMP2_1]], i64* [[C_1]], align 8 ; CHECK-NEXT: ret void ; %a.0 = getelementptr i64, i64* %a, i64 0 @@ -66,12 +77,30 @@ define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]] -; CHECK-NEXT: ret <4 x i32> [[TMP5]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = sub <2 x i32> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i32> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = sub <2 x i32> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i32> [[TMP17]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = sub <2 x i32> [[TMP17]], [[TMP18]] +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x i32> [[TMP19]], <2 x i32> [[TMP20]], <2 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = add <2 x i32> [[TMP21]], [[TMP16]] +; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> [[TMP22]], <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP3_31]] ; %v0.0 = extractelement <4 x i32> %v0, i32 0 %v0.1 = extractelement <4 x i32> %v0, i32 1 Index: llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll +++ llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll @@ -49,11 +49,37 @@ } define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: @fmul_fdiv_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[TMP3]] +; SSE-LABEL: @fmul_fdiv_v8f32( +; SSE-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] +; SSE-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP3]] +; +; SLM-LABEL: @fmul_fdiv_v8f32( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> undef, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] +; SLM-NEXT: [[TMP4:%.*]] = fdiv <4 x float> [[TMP1]], [[TMP2]] +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[B]], <8 x float> undef, <4 x i32> +; SLM-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[TMP6]], [[TMP7]] +; SLM-NEXT: [[TMP9:%.*]] = fdiv <4 x float> [[TMP6]], [[TMP7]] +; SLM-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> +; SLM-NEXT: [[R71:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP10]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[R71]] +; +; AVX-LABEL: @fmul_fdiv_v8f32( +; AVX-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[TMP3]] +; +; AVX512-LABEL: @fmul_fdiv_v8f32( +; AVX512-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 Index: llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll +++ llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll @@ -2,9 +2,9 @@ ; RUN: opt < %s -mtriple=x86_64-unknown -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512SKX define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: @fadd_fsub_v8f32( @@ -49,11 +49,49 @@ } define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: @fmul_fdiv_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[TMP3]] +; SSE-LABEL: @fmul_fdiv_v8f32( +; SSE-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] +; SSE-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP3]] +; +; SLM-LABEL: @fmul_fdiv_v8f32( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> undef, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] +; SLM-NEXT: [[TMP4:%.*]] = fdiv <4 x float> [[TMP1]], [[TMP2]] +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[B]], <8 x float> undef, <4 x i32> +; SLM-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[TMP6]], [[TMP7]] +; SLM-NEXT: [[TMP9:%.*]] = fdiv <4 x float> [[TMP6]], [[TMP7]] +; SLM-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> +; SLM-NEXT: [[R71:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP10]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[R71]] +; +; AVX-LABEL: @fmul_fdiv_v8f32( +; AVX-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[TMP3]] +; +; AVX2-LABEL: @fmul_fdiv_v8f32( +; AVX2-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] +; AVX2-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x float> [[TMP3]] +; +; AVX512-LABEL: @fmul_fdiv_v8f32( +; AVX512-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] +; AVX512-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x float> [[TMP3]] +; +; AVX512SKX-LABEL: @fmul_fdiv_v8f32( +; AVX512SKX-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] +; AVX512SKX-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] +; AVX512SKX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; AVX512SKX-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -110,9 +148,17 @@ ; AVX-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], ; AVX-NEXT: ret <4 x float> [[TMP1]] ; +; AVX2-LABEL: @fmul_fdiv_v4f32_const( +; AVX2-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], +; AVX2-NEXT: ret <4 x float> [[TMP1]] +; ; AVX512-LABEL: @fmul_fdiv_v4f32_const( ; AVX512-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], ; AVX512-NEXT: ret <4 x float> [[TMP1]] +; +; AVX512SKX-LABEL: @fmul_fdiv_v4f32_const( +; AVX512SKX-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], +; AVX512SKX-NEXT: ret <4 x float> [[TMP1]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 Index: llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll +++ llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll @@ -112,8 +112,10 @@ ; SLM-LABEL: @ashr_shl_v8i32( ; SLM-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] ; SLM-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; SLM-NEXT: ret <8 x i32> [[TMP3]] +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[R71]] ; ; AVX1-LABEL: @ashr_shl_v8i32( ; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] Index: llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -112,8 +112,10 @@ ; SLM-LABEL: @ashr_shl_v8i32( ; SLM-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] ; SLM-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; SLM-NEXT: ret <8 x i32> [[TMP3]] +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[R71]] ; ; AVX1-LABEL: @ashr_shl_v8i32( ; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]