Index: lib/Transforms/InstCombine/InstCombineAddSub.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1347,6 +1347,9 @@ I.setHasNoUnsignedWrap(true); } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return Changed ? &I : nullptr; } @@ -1452,6 +1455,9 @@ return replaceInstUsesWith(I, V); } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return Changed ? &I : nullptr; } @@ -1723,6 +1729,9 @@ I.setHasNoUnsignedWrap(true); } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return Changed ? &I : nullptr; } @@ -1777,5 +1786,8 @@ return replaceInstUsesWith(I, V); } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return nullptr; } Index: lib/Transforms/InstCombine/InstCombineAndOrXor.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1427,6 +1427,9 @@ if (Instruction *Select = foldBoolSextMaskToSelect(I)) return Select; + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return Changed ? &I : nullptr; } @@ -2342,6 +2345,9 @@ } } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return Changed ? &I : nullptr; } @@ -2637,5 +2643,8 @@ if (Instruction *CastedXor = foldCastedBitwiseLogic(I)) return CastedXor; + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return Changed ? 
&I : nullptr; } Index: lib/Transforms/InstCombine/InstCombineCompares.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineCompares.cpp +++ lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -4528,6 +4528,10 @@ if (match(Op1, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op0 == X) return foldICmpAddOpConst(I, X, Cst, I.getSwappedPredicate()); } + + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return Changed ? &I : nullptr; } @@ -4948,5 +4952,8 @@ return new FCmpInst(I.getPredicate(), LHSExt->getOperand(0), RHSExt->getOperand(0)); + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return Changed ? &I : nullptr; } Index: lib/Transforms/InstCombine/InstCombineInternal.h =================================================================== --- lib/Transforms/InstCombine/InstCombineInternal.h +++ lib/Transforms/InstCombine/InstCombineInternal.h @@ -565,6 +565,11 @@ Value *SimplifyVectorOp(BinaryOperator &Inst); Value *SimplifyBSwap(BinaryOperator &Inst); + /// Try to combine instructions with all ExtractElement operands only + /// that extract from vector operands of the same type, the same vector + /// size at the same index, into a vector form with single resulting + /// ExtractElement instruction only. + Value *WidenScalarOp(Instruction &Inst); /// Given a binary operator, cast instruction, or select which has a PHI node /// as operand #0, see if we can fold the instruction into the PHI (which is Index: lib/Transforms/InstCombine/InstCombineMulDivRem.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -463,6 +463,9 @@ I.setHasNoUnsignedWrap(true); } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return Changed ? 
&I : nullptr; } @@ -778,6 +781,9 @@ break; } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return Changed ? &I : nullptr; } @@ -1176,6 +1182,9 @@ return Inst; } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return nullptr; } @@ -1260,6 +1269,9 @@ } } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return nullptr; } @@ -1430,6 +1442,9 @@ return &I; } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return nullptr; } @@ -1518,6 +1533,9 @@ return SelectInst::Create(Cmp, Op0, Sub); } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return nullptr; } @@ -1593,6 +1611,9 @@ } } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return nullptr; } @@ -1610,5 +1631,8 @@ if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I)) return &I; + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return nullptr; } Index: lib/Transforms/InstCombine/InstCombinePHI.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombinePHI.cpp +++ lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -1014,5 +1014,8 @@ if (Instruction *Res = SliceUpIllegalIntegerPHI(PN)) return Res; + if (Value *V = WidenScalarOp(PN)) + return replaceInstUsesWith(PN, V); + return nullptr; } Index: lib/Transforms/InstCombine/InstCombineShifts.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineShifts.cpp +++ lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -610,6 +610,9 @@ return BinaryOperator::CreateMul(X, ConstantExpr::getShl(C2, C1)); } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return nullptr; } @@ -694,6 +697,10 @@ return &I; } } + + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return nullptr; } @@ -763,5 +770,8 @@ if (MaskedValueIsZero(Op0, APInt::getSignBit(BitWidth), 0, &I)) return 
BinaryOperator::CreateLShr(Op0, Op1); + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return nullptr; } Index: lib/Transforms/InstCombine/InstructionCombining.cpp =================================================================== --- lib/Transforms/InstCombine/InstructionCombining.cpp +++ lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1397,6 +1397,66 @@ return nullptr; } +Value *InstCombiner::WidenScalarOp(Instruction &Inst) { + if (Inst.getType()->isVectorTy() || + !VectorType::isValidElementType(Inst.getType()) || + Inst.mayHaveSideEffects()) + return nullptr; + auto *EI = dyn_cast<ExtractElementInst>(*Inst.op_begin()); + if (!EI) + return nullptr; + unsigned NE = EI->getVectorOperandType()->getNumElements(); + Value *EIdx = EI->getIndexOperand(); + // Check that all operands of the user instruction are extractelement + // from the vectors of the same size and from the same lanes and the vector + // operand is not an insertelement instruction (this sequence is handled + // differently). + if (!std::all_of(Inst.op_begin(), Inst.op_end(), [NE, EIdx](const Value *V) { + auto *EEI = dyn_cast<ExtractElementInst>(V); + return EEI && !isa<InsertElementInst>(EEI->getVectorOperand()) && + EEI->getVectorOperandType()->getNumElements() == NE && + EEI->getIndexOperand() == EIdx; + })) + return nullptr; + int NumVectorizedExtracts = 0; + SmallSet<const ExtractElementInst *, 8> CountedOperands; + for (auto *Op : Inst.operand_values()) { + auto *EEOp = cast<ExtractElementInst>(Op); + const Instruction *UserLast = EEOp->user_back(); + // If the only user of the extractelement instruction is the + // to-be-vectorized user instruction, count this instruction as the + // one to be removed. + if (EEOp->hasOneUse() || + (std::all_of(EEOp->user_begin(), EEOp->user_end(), + [UserLast](User *U) { return U == UserLast; }) && + CountedOperands.insert(EEOp).second)) + ++NumVectorizedExtracts; + } + // If the number of extractelement instructions to be removed does not exceed + // 1, do not widen this instruction sequence. 
+ if (NumVectorizedExtracts <= 1) + return nullptr; + // Generate vector code instead of the scalar one. + Instruction *NewI = Inst.clone(); + NewI->setName("widen.vect"); + NewI->mutateType(VectorType::get(Inst.getType(), NE)); + for (unsigned Idx = 0, EIdx = NewI->getNumOperands(); Idx < EIdx; ++Idx) { + auto *EE = cast<ExtractElementInst>(NewI->getOperand(Idx)); + NewI->setOperand(Idx, EE->getVectorOperand()); + } + for (auto *V : Inst.operand_values()) { + // Remove extractelement instructions. + if (auto *I = dyn_cast<ExtractElementInst>(V)) + replaceInstUsesWith(*I, UndefValue::get(V->getType())); + } + InsertNewInstWith(NewI, Inst); + // %widen.extract = extractelement %widen.vect, i32 Idx + // Replace uses of the scalar instruction by the %widen.extract + // instruction. + return InsertNewInstWith( + ExtractElementInst::Create(NewI, EIdx, "widen.extract"), Inst); +} + Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end()); Index: test/Transforms/InstCombine/bitcast-bigendian.ll =================================================================== --- test/Transforms/InstCombine/bitcast-bigendian.ll +++ test/Transforms/InstCombine/bitcast-bigendian.ll @@ -9,11 +9,10 @@ define float @test2(<2 x float> %A, <2 x i32> %B) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[A:%.*]], i32 1 ; CHECK-NEXT: [[BC:%.*]] = bitcast <2 x i32> [[B:%.*]] to <2 x float> -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[BC]], i32 1 -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP24]], [[TMP4]] -; CHECK-NEXT: ret float [[ADD]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fadd <2 x float> [[BC]], [[A:%.*]] +; CHECK-NEXT: [[WIDEN_EXTRACT:%.*]] = extractelement <2 x float> [[WIDEN_VECT]], i32 1 +; CHECK-NEXT: ret float [[WIDEN_EXTRACT]] ; %tmp28 = bitcast <2 x float> %A to i64 %tmp23 = trunc i64 %tmp28 to i32 Index: test/Transforms/InstCombine/bitcast.ll =================================================================== --- 
test/Transforms/InstCombine/bitcast.ll +++ test/Transforms/InstCombine/bitcast.ll @@ -244,11 +244,10 @@ ; rdar://7892780 define float @test2(<2 x float> %A, <2 x i32> %B) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[A:%.*]], i32 0 ; CHECK-NEXT: [[BC:%.*]] = bitcast <2 x i32> [[B:%.*]] to <2 x float> -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[BC]], i32 0 -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP24]], [[TMP4]] -; CHECK-NEXT: ret float [[ADD]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fadd <2 x float> [[BC]], [[A:%.*]] +; CHECK-NEXT: [[WIDEN_EXTRACT:%.*]] = extractelement <2 x float> [[WIDEN_VECT]], i32 0 +; CHECK-NEXT: ret float [[WIDEN_EXTRACT]] ; %tmp28 = bitcast <2 x float> %A to i64 ; [#uses=2] %tmp23 = trunc i64 %tmp28 to i32 ; [#uses=1] Index: test/Transforms/InstCombine/type_pun.ll =================================================================== --- test/Transforms/InstCombine/type_pun.ll +++ test/Transforms/InstCombine/type_pun.ll @@ -118,14 +118,12 @@ ; CHECK-NEXT: [[SROA_BC:%.*]] = bitcast <16 x i8> [[IN:%.*]] to <4 x i32> ; CHECK-NEXT: br i1 undef, label [[LEFT:%.*]], label [[RIGHT:%.*]] ; CHECK: left: -; CHECK-NEXT: [[SROA_EXTRACT1:%.*]] = extractelement <4 x i32> [[SROA_BC]], i32 0 ; CHECK-NEXT: br label [[TAIL:%.*]] ; CHECK: right: -; CHECK-NEXT: [[SROA_EXTRACT:%.*]] = extractelement <4 x i32> [[SROA_BC]], i32 0 ; CHECK-NEXT: br label [[TAIL]] ; CHECK: tail: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[SROA_EXTRACT1]], [[LEFT]] ], [ [[SROA_EXTRACT]], [[RIGHT]] ] -; CHECK-NEXT: ret i32 [[I]] +; CHECK-NEXT: [[WIDEN_EXTRACT:%.*]] = extractelement <4 x i32> [[SROA_BC]], i32 0 +; CHECK-NEXT: ret i32 [[WIDEN_EXTRACT]] ; entry: %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> Index: test/Transforms/InstCombine/x86-avx512.ll =================================================================== --- test/Transforms/InstCombine/x86-avx512.ll +++ test/Transforms/InstCombine/x86-avx512.ll @@ -6,11 +6,10 
@@ define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @test_add_ss( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[A]], float [[TMP3]], i64 0 -; CHECK-NEXT: ret <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> , <4 x i32> +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fadd <4 x float> [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[WIDEN_VECT]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP2]] ; %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 @@ -33,15 +32,14 @@ define <4 x float> @test_add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { ; CHECK-LABEL: @test_add_ss_mask( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[A]], float [[TMP7]], i64 0 -; CHECK-NEXT: ret <4 x float> [[TMP8]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[WIDEN_EXTRACT:%.*]] = extractelement <4 x float> [[WIDEN_VECT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> 
[[C:%.*]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], float [[WIDEN_EXTRACT]], float [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[A]], float [[TMP4]], i64 0 +; CHECK-NEXT: ret <4 x float> [[TMP5]] ; %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 @@ -83,11 +81,9 @@ define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: @test_add_sd( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = fadd double [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[A]], double [[TMP3]], i64 0 -; CHECK-NEXT: ret <2 x double> [[TMP4]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fadd <2 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[WIDEN_VECT]], <2 x i32> +; CHECK-NEXT: ret <2 x double> [[TMP1]] ; %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1 %2 = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 4) @@ -106,15 +102,14 @@ define <2 x double> @test_add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_add_sd_mask( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = fadd double [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[A]], double [[TMP7]], i64 0 -; CHECK-NEXT: ret <2 x 
double> [[TMP8]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fadd <2 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[WIDEN_EXTRACT:%.*]] = extractelement <2 x double> [[WIDEN_VECT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[WIDEN_EXTRACT]], double [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[A]], double [[TMP4]], i64 0 +; CHECK-NEXT: ret <2 x double> [[TMP5]] ; %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1 %2 = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4) @@ -148,11 +143,10 @@ define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @test_sub_ss( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = fsub float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[A]], float [[TMP3]], i64 0 -; CHECK-NEXT: ret <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> , <4 x i32> +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fsub <4 x float> [[A:%.*]], [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[WIDEN_VECT]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP2]] ; %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 @@ -175,15 +169,14 @@ define <4 x float> @test_sub_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { ; CHECK-LABEL: @test_sub_ss_mask( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 -; 
CHECK-NEXT: [[TMP3:%.*]] = fsub float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[A]], float [[TMP7]], i64 0 -; CHECK-NEXT: ret <4 x float> [[TMP8]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fsub <4 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[WIDEN_EXTRACT:%.*]] = extractelement <4 x float> [[WIDEN_VECT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], float [[WIDEN_EXTRACT]], float [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[A]], float [[TMP4]], i64 0 +; CHECK-NEXT: ret <4 x float> [[TMP5]] ; %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 @@ -225,11 +218,9 @@ define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: @test_sub_sd( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = fsub double [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[A]], double [[TMP3]], i64 0 -; CHECK-NEXT: ret <2 x double> [[TMP4]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fsub <2 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[WIDEN_VECT]], <2 x i32> +; CHECK-NEXT: ret <2 x double> [[TMP1]] ; %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1 %2 = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x 
double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 4) @@ -248,15 +239,14 @@ define <2 x double> @test_sub_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_sub_sd_mask( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = fsub double [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[A]], double [[TMP7]], i64 0 -; CHECK-NEXT: ret <2 x double> [[TMP8]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fsub <2 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[WIDEN_EXTRACT:%.*]] = extractelement <2 x double> [[WIDEN_VECT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[WIDEN_EXTRACT]], double [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[A]], double [[TMP4]], i64 0 +; CHECK-NEXT: ret <2 x double> [[TMP5]] ; %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1 %2 = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4) @@ -290,11 +280,10 @@ define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @test_mul_ss( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = fmul float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = 
insertelement <4 x float> [[A]], float [[TMP3]], i64 0 -; CHECK-NEXT: ret <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> , <4 x i32> +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fmul <4 x float> [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[WIDEN_VECT]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP2]] ; %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 @@ -317,15 +306,14 @@ define <4 x float> @test_mul_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { ; CHECK-LABEL: @test_mul_ss_mask( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = fmul float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[A]], float [[TMP7]], i64 0 -; CHECK-NEXT: ret <4 x float> [[TMP8]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fmul <4 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[WIDEN_EXTRACT:%.*]] = extractelement <4 x float> [[WIDEN_VECT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], float [[WIDEN_EXTRACT]], float [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[A]], float [[TMP4]], i64 0 +; CHECK-NEXT: ret <4 x float> [[TMP5]] ; %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 
2.000000e+00, i32 2 @@ -367,11 +355,9 @@ define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: @test_mul_sd( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = fmul double [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[A]], double [[TMP3]], i64 0 -; CHECK-NEXT: ret <2 x double> [[TMP4]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fmul <2 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[WIDEN_VECT]], <2 x i32> +; CHECK-NEXT: ret <2 x double> [[TMP1]] ; %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1 %2 = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 4) @@ -390,15 +376,14 @@ define <2 x double> @test_mul_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_mul_sd_mask( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = fmul double [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[A]], double [[TMP7]], i64 0 -; CHECK-NEXT: ret <2 x double> [[TMP8]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fmul <2 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[WIDEN_EXTRACT:%.*]] = extractelement <2 x double> [[WIDEN_VECT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i64 0 +; CHECK-NEXT: 
[[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[WIDEN_EXTRACT]], double [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[A]], double [[TMP4]], i64 0 +; CHECK-NEXT: ret <2 x double> [[TMP5]] ; %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1 %2 = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4) @@ -432,11 +417,10 @@ define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @test_div_ss( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = fdiv float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[A]], float [[TMP3]], i64 0 -; CHECK-NEXT: ret <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> , <4 x i32> +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fdiv <4 x float> [[A:%.*]], [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[WIDEN_VECT]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP2]] ; %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 @@ -459,15 +443,14 @@ define <4 x float> @test_div_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { ; CHECK-LABEL: @test_div_ss_mask( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = fdiv float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], float [[TMP3]], float 
[[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[A]], float [[TMP7]], i64 0 -; CHECK-NEXT: ret <4 x float> [[TMP8]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fdiv <4 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[WIDEN_EXTRACT:%.*]] = extractelement <4 x float> [[WIDEN_VECT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], float [[WIDEN_EXTRACT]], float [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[A]], float [[TMP4]], i64 0 +; CHECK-NEXT: ret <4 x float> [[TMP5]] ; %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 @@ -509,11 +492,9 @@ define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: @test_div_sd( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = fdiv double [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[A]], double [[TMP3]], i64 0 -; CHECK-NEXT: ret <2 x double> [[TMP4]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[WIDEN_VECT]], <2 x i32> +; CHECK-NEXT: ret <2 x double> [[TMP1]] ; %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1 %2 = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 4) @@ -532,15 +513,14 @@ define <2 x double> @test_div_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_div_sd_mask( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x 
double> [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = fdiv double [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[A]], double [[TMP7]], i64 0 -; CHECK-NEXT: ret <2 x double> [[TMP8]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[WIDEN_EXTRACT:%.*]] = extractelement <2 x double> [[WIDEN_VECT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[WIDEN_EXTRACT]], double [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[A]], double [[TMP4]], i64 0 +; CHECK-NEXT: ret <2 x double> [[TMP5]] ; %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1 %2 = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4)