Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -332,7 +332,7 @@
   case Instruction::Sub:
     return Instruction::Add;
   default:
-    return 0;
+    return Op;
   }
 }

@@ -345,6 +345,20 @@
   return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode;
 }

+/// Checks if the \p Opcode can be considered as an operand of a (possibly)
+/// binary operation \p I.
+/// \returns The opcode of the binary operation \p I if the instruction with
+/// \p Opcode can be considered as an operand of \p I with the default value.
+static unsigned tryToRepresentAsInstArg(unsigned Opcode, Instruction *I) {
+  assert(!sameOpcodeOrAlt(Opcode, getAltOpcode(Opcode), I->getOpcode())
+         && "Invalid Opcode");
+  if (Opcode != Instruction::PHI && isa<BinaryOperator>(I) &&
+      (I->getType()->isIntegerTy() || I->hasUnsafeAlgebra()))
+    return I->getOpcode();
+  return 0;
+}
+
 /// Chooses the correct key for scheduling data. If \p Op has the same (or
 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
 /// OpValue.
@@ -365,6 +379,12 @@
 struct RawInstructionsData {
   /// Main Opcode of the instructions going to be vectorized.
   unsigned Opcode = 0;
+  /// Position of the first instruction with the \a Opcode.
+  unsigned OpcodePos = 0;
+  /// Need an additional analysis (if at least one of the instructions is not
+  /// of the same instruction kind as the instruction at position OpcodePos in
+  /// the list).
+  bool NeedAnalysis = false;
   /// The list of instructions has some instructions with alternate opcodes.
   bool HasAltOpcodes = false;
 };
@@ -378,16 +398,38 @@
     return {};
   RawInstructionsData Res;
   unsigned Opcode = I0->getOpcode();
+  unsigned AltOpcode = getAltOpcode(Opcode);
+  unsigned NewOpcodePos = 0;
   // Walk through the list of the vectorized instructions
   // in order to check its structure described by RawInstructionsData.
   for (unsigned Cnt = 0, E = VL.size(); Cnt != E; ++Cnt) {
     auto *I = dyn_cast<Instruction>(VL[Cnt]);
     if (!I)
       return {};
-    if (Opcode != I->getOpcode())
-      Res.HasAltOpcodes = true;
+    if (sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode())) {
+      if (Opcode != I->getOpcode()) {
+        Res.HasAltOpcodes = true;
+        if (Res.NeedAnalysis && isOdd(NewOpcodePos))
+          std::swap(Opcode, AltOpcode);
+      }
+      continue;
+    }
+    if (unsigned NewOpcode = tryToRepresentAsInstArg(Opcode, I)) {
+      if (!Instruction::isBinaryOp(Opcode) ||
+          !Instruction::isCommutative(Opcode)) {
+        NewOpcodePos = Cnt;
+        Opcode = NewOpcode;
+        AltOpcode = getAltOpcode(Opcode);
+        Res.NeedAnalysis = true;
+      }
+    } else if (tryToRepresentAsInstArg(I->getOpcode(),
+                                       cast<Instruction>(VL[NewOpcodePos])))
+      Res.NeedAnalysis = true;
+    else
+      return {};
   }
   Res.Opcode = Opcode;
+  Res.OpcodePos = NewOpcodePos;
   return Res;
 }

@@ -412,16 +454,20 @@
 static InstructionsState getSameOpcode(ArrayRef<Value *> VL) {
   auto Res = getMainOpcode(VL);
   unsigned Opcode = Res.Opcode;
-  if (!Res.HasAltOpcodes)
-    return InstructionsState(VL[0], Opcode, false);
-  auto *OpInst = cast<Instruction>(VL[0]);
+  if (!Res.NeedAnalysis && !Res.HasAltOpcodes)
+    return InstructionsState(VL[Res.OpcodePos], Opcode, false);
+  auto *OpInst = cast<Instruction>(VL[Res.OpcodePos]);
   unsigned AltOpcode = getAltOpcode(Opcode);
   // Examine each element in the list instructions VL to determine
   // if some operations there could be considered as an alternative
-  // (for example as subtraction relates to addition operation).
+  // (for example as subtraction relates to addition operation) or
+  // whether an operation could be an operand of a (possibly) binary operation.
   for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
     auto *I = cast<Instruction>(VL[Cnt]);
     unsigned InstOpcode = I->getOpcode();
+    if (Res.NeedAnalysis && !sameOpcodeOrAlt(Opcode, AltOpcode, InstOpcode))
+      if (tryToRepresentAsInstArg(InstOpcode, OpInst))
+        InstOpcode = (Res.HasAltOpcodes && isOdd(Cnt)) ? AltOpcode : Opcode;
     if ((Res.HasAltOpcodes &&
          InstOpcode != (isOdd(Cnt) ? AltOpcode : Opcode)) ||
         (!Res.HasAltOpcodes && InstOpcode != Opcode)) {
@@ -574,6 +620,7 @@
   void deleteTree() {
     VectorizableTree.clear();
     ScalarToTreeEntry.clear();
+    ExtraScalarToTreeEntry.clear();
     MustGather.clear();
     ExternalUses.clear();
     NumLoadsWantToKeepOrder = 0;
@@ -713,22 +760,40 @@
     /// The TreeEntry index containing the user of this entry. We can actually
     /// have multiple users so the data structure is not truly a tree.
     SmallVector<int, 1> UserTreeIndices;
+
+    /// Info about the instructions in this tree entry.
+    InstructionsState State;
   };

   /// Create a new VectorizableTree entry.
   TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
-                          int &UserTreeIdx) {
+                          int &UserTreeIdx, const InstructionsState &S) {
+    assert((!Vectorized || S.Opcode != 0) &&
+           "Vectorized TreeEntry without opcode");
     VectorizableTree.emplace_back(VectorizableTree);
     int idx = VectorizableTree.size() - 1;
     TreeEntry *Last = &VectorizableTree[idx];
     Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
     Last->NeedToGather = !Vectorized;
     if (Vectorized) {
+      Last->State = S;
+      unsigned AltOpcode = getAltOpcode(S.Opcode);
       for (int i = 0, e = VL.size(); i != e; ++i) {
-        assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
-        ScalarToTreeEntry[VL[i]] = idx;
+        unsigned RealOpcode =
+            (S.IsAltShuffle && isOdd(i)) ? AltOpcode : S.Opcode;
+        Value *Key = (cast<Instruction>(VL[i])->getOpcode() == RealOpcode)
+                         ? VL[i]
+                         : S.OpValue;
+        assert(!getTreeEntry(VL[i], Key) && "Scalar already in tree!");
+        if (VL[i] == Key)
+          ScalarToTreeEntry[Key] = idx;
+        else
+          ExtraScalarToTreeEntry[VL[i]][Key] = idx;
       }
     } else {
+      Last->State.Opcode = 0;
+      Last->State.OpValue = VL[0];
+      Last->State.IsAltShuffle = false;
      MustGather.insert(VL.begin(), VL.end());
    }

@@ -756,9 +821,25 @@
     return nullptr;
   }

+  TreeEntry *getTreeEntry(Value *V, Value *OpValue) {
+    if (V == OpValue)
+      return getTreeEntry(V);
+    auto I = ExtraScalarToTreeEntry.find(V);
+    if (I != ExtraScalarToTreeEntry.end()) {
+      auto &STT = I->second;
+      auto STTI = STT.find(OpValue);
+      if (STTI != STT.end())
+        return &VectorizableTree[STTI->second];
+    }
+    return nullptr;
+  }
+
   /// Maps a specific scalar to its tree entry.
   SmallDenseMap<Value *, int> ScalarToTreeEntry;

+  /// Maps a specific scalar to its tree entries, keyed by the leading scalar
+  /// of each entry.
+  SmallDenseMap<Value *, SmallDenseMap<Value *, int>> ExtraScalarToTreeEntry;
+
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;

@@ -1327,9 +1408,15 @@
       continue;

     // For each lane:
+    const unsigned Opcode = Entry->State.Opcode;
+    const unsigned AltOpcode = getAltOpcode(Opcode);
     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
       Value *Scalar = Entry->Scalars[Lane];

+      if (!sameOpcodeOrAlt(Opcode, AltOpcode,
+                           cast<Instruction>(Scalar)->getOpcode()))
+        continue;
+
       // Check if the scalar is externally used as an extra arg.
auto ExtI = ExternallyUsedValues.find(Scalar); if (ExtI != ExternallyUsedValues.end()) { @@ -1372,6 +1459,38 @@ } } +static Value *getDefaultConstantForOpcode(unsigned Opcode, Type *Ty) { + switch(Opcode) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::Or: + case Instruction::Xor: + return ConstantInt::getNullValue(Ty); + case Instruction::Mul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + return ConstantInt::get(Ty, /*V=*/1); + case Instruction::FAdd: + case Instruction::FSub: + return ConstantFP::get(Ty, /*V=*/0.0); + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + return ConstantFP::get(Ty, /*V=*/1.0); + case Instruction::And: + return ConstantInt::getAllOnesValue(Ty); + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + return ConstantInt::getNullValue(Type::getInt32Ty(Ty->getContext())); + default: + break; + } + llvm_unreachable("unknown binop for default constant value"); +} + void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, int UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); @@ -1379,28 +1498,28 @@ InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } if (StoreInst *SI = dyn_cast(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) { DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } @@ -1412,7 +1531,7 @@ if (EphValues.count(VL[i])) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is ephemeral.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } @@ -1423,7 +1542,7 @@ DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); if (E->Scalars[i] != VL[i]) { DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } @@ -1442,7 +1561,7 @@ if (getTreeEntry(I)) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is already in tree.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } @@ -1452,7 +1571,7 @@ for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (MustGather.count(VL[i])) { DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } @@ -1466,7 +1585,7 @@ // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. 
DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } @@ -1475,7 +1594,7 @@ for (unsigned j = i + 1; j < e; ++j) if (VL[i] == VL[j]) { DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } @@ -1490,11 +1609,12 @@ assert((!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); + unsigned AltOpcode = getAltOpcode(S.Opcode); unsigned ShuffleOrOp = S.IsAltShuffle ? (unsigned) Instruction::ShuffleVector : S.Opcode; switch (ShuffleOrOp) { @@ -1509,12 +1629,12 @@ if (Term) { DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, S); DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1536,7 +1656,7 @@ } else { BS.cancelScheduling(VL, VL0); } - newTreeEntry(VL, Reuse, UserTreeIdx); + newTreeEntry(VL, Reuse, UserTreeIdx, S); return; } case Instruction::Load: { @@ -1551,7 +1671,7 @@ if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -1562,7 +1682,7 @@ LoadInst *L = cast(VL[i]); if (!L->isSimple()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -1584,7 +1704,7 @@ if (Consecutive) { ++NumLoadsWantToKeepOrder; - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, S); DEBUG(dbgs() << "SLP: added a vector of loads.\n"); return; } @@ -1599,7 +1719,7 @@ } BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); if (ReverseConsecutive) { ++NumLoadsWantToChangeOrder; @@ -1626,12 +1746,12 @@ Type *Ty = cast(VL[i])->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, S); DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1654,13 +1774,13 @@ if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, S); DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1692,7 +1812,7 @@ case Instruction::And: case Instruction::Or: case Instruction::Xor: - newTreeEntry(VL, true, UserTreeIdx); + 
newTreeEntry(VL, true, UserTreeIdx, S); DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -1708,10 +1828,21 @@ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); - - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + for (Value *VecOp : VL) { + auto *I = cast(VecOp); + if (I->getOpcode() == S.Opcode) { + Operands.push_back(I->getOperand(i)); + continue; + } + assert(Instruction::isBinaryOp(S.Opcode) && + "Expected a binary operation."); + Value *Operand = isOdd(i) + ? getDefaultConstantForOpcode(S.Opcode, I->getType()) + : VecOp; + Operands.push_back(Operand); + } + if (allSameType(Operands)) + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; @@ -1721,7 +1852,7 @@ if (cast(VL[j])->getNumOperands() != 2) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } @@ -1734,7 +1865,7 @@ if (Ty0 != CurTy) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } @@ -1746,12 +1877,12 @@ DEBUG( dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, S); DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; @@ -1768,12 +1899,12 @@ for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, S); DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; @@ -1791,7 +1922,7 @@ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1805,7 +1936,7 @@ getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; @@ -1816,7 +1947,7 @@ Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument "<< A1I<<"!=" << A1J << "\n"); @@ -1829,14 +1960,14 @@ CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *VL[i] << '\n'); return; } } - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, 
UserTreeIdx, S); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -1853,11 +1984,11 @@ // then do not vectorize this instruction. if (!S.IsAltShuffle) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, S); DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. @@ -1872,8 +2003,19 @@ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (Value *VecOp : VL) { + auto *I = cast(VecOp); + if (sameOpcodeOrAlt(S.Opcode, AltOpcode, I->getOpcode())) { + Operands.push_back(I->getOperand(i)); + continue; + } + assert(Instruction::isBinaryOp(S.Opcode) && + "Expected a binary operation."); + Value *Operand = isOdd(i) + ? getDefaultConstantForOpcode(S.Opcode, I->getType()) + : VecOp; + Operands.push_back(Operand); + } buildTree_rec(Operands, Depth + 1, UserTreeIdx); } @@ -1881,7 +2023,7 @@ default: BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -2002,18 +2144,17 @@ } return getGatherCost(E->Scalars); } - InstructionsState S = getSameOpcode(VL); - assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); - Instruction *VL0 = cast(S.OpValue); - unsigned ShuffleOrOp = S.IsAltShuffle ? - (unsigned) Instruction::ShuffleVector : S.Opcode; + assert(E->State.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); + auto *VL0 = cast(E->State.OpValue); + unsigned ShuffleOrOp = E->State.IsAltShuffle ? + (unsigned) Instruction::ShuffleVector : E->State.Opcode; switch (ShuffleOrOp) { case Instruction::PHI: return 0; case Instruction::ExtractValue: case Instruction::ExtractElement: - if (canReuseExtract(VL, S.OpValue)) { + if (canReuseExtract(VL, E->State.OpValue)) { int DeadCost = 0; for (unsigned i = 0, e = VL.size(); i < e; ++i) { Instruction *E = cast(VL[i]); @@ -2057,8 +2198,8 @@ // Calculate the cost of this instruction. VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); int ScalarCost = VecTy->getNumElements() * - TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0); - int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0); + TTI->getCmpSelInstrCost(ShuffleOrOp, ScalarTy, Builder.getInt1Ty(), VL0); + int VecCost = TTI->getCmpSelInstrCost(ShuffleOrOp, VecTy, MaskTy, VL0); return VecCost - ScalarCost; } case Instruction::Add: @@ -2084,7 +2225,7 @@ TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = - TargetTransformInfo::OK_UniformConstantValue; + TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueProperties Op1VP = TargetTransformInfo::OP_None; TargetTransformInfo::OperandValueProperties Op2VP = @@ -2095,34 +2236,33 @@ // If instead not all operands are constants, then set the operand kind // to OK_AnyValue. If all operands are constants but not the same, // then set the operand kind to OK_NonUniformConstantValue. 
-    ConstantInt *CInt = nullptr;
-    for (unsigned i = 0; i < VL.size(); ++i) {
-      const Instruction *I = cast<Instruction>(VL[i]);
-      if (!isa<ConstantInt>(I->getOperand(1))) {
-        Op2VK = TargetTransformInfo::OK_AnyValue;
-        break;
-      }
-      if (i == 0) {
-        CInt = cast<ConstantInt>(I->getOperand(1));
-        continue;
+    if (auto *CInt = dyn_cast<ConstantInt>(VL0->getOperand(1))) {
+      Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+      const unsigned Opcode = E->State.Opcode;
+      for (auto *V : VL) {
+        auto *I = cast<Instruction>(V);
+        if (I == VL0 || Opcode != I->getOpcode())
+          continue;
+        if (!isa<ConstantInt>(I->getOperand(1))) {
+          Op2VK = TargetTransformInfo::OK_AnyValue;
+          break;
+        }
+        if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
+            CInt != cast<ConstantInt>(I->getOperand(1)))
+          Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
       }
+      // FIXME: Currently cost of model modification for division by power of
+      // 2 is handled for X86 and AArch64. Add support for other targets.
       if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
-          CInt != cast<ConstantInt>(I->getOperand(1)))
-        Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+          CInt->getValue().isPowerOf2())
+        Op2VP = TargetTransformInfo::OP_PowerOf2;
     }
-    // FIXME: Currently cost of model modification for division by power of
-    // 2 is handled for X86 and AArch64. Add support for other targets.
-    if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
-        CInt->getValue().isPowerOf2())
-      Op2VP = TargetTransformInfo::OP_PowerOf2;
-    SmallVector<Value *, 4> Operands(VL0->operand_values());
-    int ScalarCost =
-        VecTy->getNumElements() *
-        TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
-                                    Op2VP, Operands);
-    int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK,
-                                              Op1VP, Op2VP, Operands);
+    int ScalarCost = VecTy->getNumElements() *
+                     TTI->getArithmeticInstrCost(E->State.Opcode, ScalarTy,
+                                                 Op1VK, Op2VK, Op1VP, Op2VP);
+    int VecCost = TTI->getArithmeticInstrCost(E->State.Opcode, VecTy, Op1VK,
+                                              Op2VK, Op1VP, Op2VP);
     return VecCost - ScalarCost;
   }
   case Instruction::GetElementPtr: {
@@ -2188,23 +2328,18 @@
       TargetTransformInfo::OK_AnyValue;
     TargetTransformInfo::OperandValueKind Op2VK =
       TargetTransformInfo::OK_AnyValue;
-    int ScalarCost = 0;
-    int VecCost = 0;
-    for (Value *i : VL) {
-      Instruction *I = cast<Instruction>(i);
-      if (!I)
-        break;
-      ScalarCost +=
-          TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
-    }
+    unsigned AltOpcode = getAltOpcode(E->State.Opcode);
+    int ScalarCost =
+        TTI->getArithmeticInstrCost(E->State.Opcode, ScalarTy, Op1VK, Op2VK) *
+        VL.size() / 2;
+    ScalarCost +=
+        TTI->getArithmeticInstrCost(AltOpcode, ScalarTy, Op1VK, Op2VK) *
+        VL.size() / 2;
     // VecCost is equal to sum of the cost of creating 2 vectors
     // and the cost of creating shuffle.
- Instruction *I0 = cast(VL[0]); - VecCost = - TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK); - Instruction *I1 = cast(VL[1]); - VecCost += - TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK); + int VecCost = + TTI->getArithmeticInstrCost(E->State.Opcode, VecTy, Op1VK, Op2VK); + VecCost += TTI->getArithmeticInstrCost(AltOpcode, VecTy, Op1VK, Op2VK); VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0); return VecCost - ScalarCost; @@ -2270,7 +2405,7 @@ Instruction *PrevInst = nullptr; for (const auto &N : VectorizableTree) { - Instruction *Inst = dyn_cast(N.Scalars[0]); + Instruction *Inst = dyn_cast(N.State.OpValue); if (!Inst) continue; @@ -2330,7 +2465,7 @@ for (TreeEntry &TE : VectorizableTree) { int C = getEntryCost(&TE); DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " - << *TE.Scalars[0] << ".\n"); + << *TE.State.OpValue << ".\n"); Cost += C; } @@ -2351,7 +2486,7 @@ // extend the extracted value back to the original type. Here, we account // for the extract and the added cost of the sign extend if needed. auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + auto *ScalarRoot = VectorizableTree[0].State.OpValue; if (MinBWs.count(ScalarRoot)) { auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); auto Extend = @@ -2414,13 +2549,15 @@ SmallVectorImpl &Right) { // Push left and right operands of binary operation into Left and Right unsigned AltOpcode = getAltOpcode(Opcode); - (void)AltOpcode; for (Value *V : VL) { auto *I = cast(V); - assert(sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode()) && - "Incorrect instruction in vector"); - Left.push_back(I->getOperand(0)); - Right.push_back(I->getOperand(1)); + if (sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode())) { + Left.push_back(I->getOperand(0)); + Right.push_back(I->getOperand(1)); + } else { + Left.push_back(I); + Right.push_back(getDefaultConstantForOpcode(Opcode, I->getType())); + } } // Reorder if we have a commutative operation and consecutive access @@ -2469,8 +2606,13 @@ int i, unsigned Opcode, Instruction &I, ArrayRef Left, ArrayRef Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight, bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) { - VLeft = I.getOperand(0); - VRight = I.getOperand(1); + if (I.getOpcode() == Opcode) { + VLeft = I.getOperand(0); + VRight = I.getOperand(1); + } else { + VLeft = &I; + VRight = getDefaultConstantForOpcode(Opcode, I.getType()); + } // If we have "SplatRight", try to see if commuting is needed to preserve it. if (SplatRight) { if (VRight == Right[i - 1]) @@ -2534,8 +2676,15 @@ // Peel the first iteration out of the loop since there's nothing // interesting to do anyway and it simplifies the checks in the loop. auto *I = cast(VL[0]); - Value *VLeft = I->getOperand(0); - Value *VRight = I->getOperand(1); + Value *VLeft; + Value *VRight; + if (I->getOpcode() == Opcode) { + VLeft = I->getOperand(0); + VRight = I->getOperand(1); + } else { + VLeft = I; + VRight = getDefaultConstantForOpcode(Opcode, I->getType()); + } if (!isa(VRight) && isa(VLeft)) // Favor having instruction to the right. FIXME: why? 
std::swap(VLeft, VRight); @@ -2740,12 +2889,11 @@ IRBuilder<>::InsertPointGuard Guard(Builder); if (E->VectorizedValue) { - DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); + DEBUG(dbgs() << "SLP: Diamond merged for " << *E->State.OpValue << ".\n"); return E->VectorizedValue; } - InstructionsState S = getSameOpcode(E->Scalars); - Instruction *VL0 = cast(E->Scalars[0]); + Instruction *VL0 = cast(E->State.OpValue); Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast(VL0)) ScalarTy = SI->getValueOperand()->getType(); @@ -2758,8 +2906,8 @@ return V; } - unsigned ShuffleOrOp = S.IsAltShuffle ? - (unsigned) Instruction::ShuffleVector : S.Opcode; + unsigned ShuffleOrOp = E->State.IsAltShuffle ? + (unsigned) Instruction::ShuffleVector : E->State.Opcode; switch (ShuffleOrOp) { case Instruction::PHI: { PHINode *PH = dyn_cast(VL0); @@ -2869,7 +3017,7 @@ CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Value *V; - if (S.Opcode == Instruction::FCmp) + if (E->State.Opcode == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); else V = Builder.CreateICmp(P0, L, R); @@ -2921,13 +3069,19 @@ case Instruction::Xor: { ValueList LHSVL, RHSVL; if (isa(VL0) && VL0->isCommutative()) - reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL, + reorderInputsAccordingToOpcode(E->State.Opcode, E->Scalars, LHSVL, RHSVL); else for (Value *V : E->Scalars) { auto *I = cast(V); - LHSVL.push_back(I->getOperand(0)); - RHSVL.push_back(I->getOperand(1)); + if (I->getOpcode() == E->State.Opcode) { + LHSVL.push_back(I->getOperand(0)); + RHSVL.push_back(I->getOperand(1)); + } else { + LHSVL.push_back(V); + RHSVL.push_back( + getDefaultConstantForOpcode(E->State.Opcode, I->getType())); + } } setInsertPointAfterBundle(E->Scalars, VL0); @@ -2939,7 +3093,7 @@ return V; Value *V = Builder.CreateBinOp( - static_cast(S.Opcode), LHS, RHS); + static_cast(E->State.Opcode), LHS, RHS); E->VectorizedValue = V; propagateIRFlags(E->VectorizedValue, E->Scalars, VL0); ++NumVectorInstructions; @@ -3089,9 +3243,9 @@ } case Instruction::ShuffleVector: { ValueList LHSVL, RHSVL; - assert(Instruction::isBinaryOp(S.Opcode) && + assert(Instruction::isBinaryOp(E->State.Opcode) && "Invalid Shuffle Vector Operand"); - reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL); + reorderAltShuffleOperands(E->State.Opcode, E->Scalars, LHSVL, RHSVL); setInsertPointAfterBundle(E->Scalars, VL0); Value *LHS = vectorizeTree(LHSVL); @@ -3102,9 +3256,9 @@ // Create a vector of LHS op1 RHS Value *V0 = Builder.CreateBinOp( - static_cast(S.Opcode), LHS, RHS); + static_cast(E->State.Opcode), LHS, RHS); - unsigned AltOpcode = getAltOpcode(S.Opcode); + unsigned AltOpcode = getAltOpcode(E->State.Opcode); // Create a vector of LHS op2 RHS Value *V1 = Builder.CreateBinOp( static_cast(AltOpcode), LHS, RHS); @@ -3126,8 +3280,13 @@ } Value *ShuffleMask = ConstantVector::get(Mask); - propagateIRFlags(V0, EvenScalars); - propagateIRFlags(V1, OddScalars); + InstructionsState S = getSameOpcode(EvenScalars); + assert(!S.IsAltShuffle && "Unexpected alternate opcode"); + propagateIRFlags(V0, EvenScalars, S.OpValue); + + S = getSameOpcode(OddScalars); + assert(!S.IsAltShuffle && "Unexpected alternate opcode"); + propagateIRFlags(V1, OddScalars, S.OpValue); Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); E->VectorizedValue = V; @@ -3161,7 +3320,7 @@ // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. 
We // sign extend the extracted values below. - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + auto *ScalarRoot = VectorizableTree[0].State.OpValue; if (MinBWs.count(ScalarRoot)) { if (auto *I = dyn_cast(VectorRoot)) Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); @@ -3272,9 +3431,15 @@ assert(Entry->VectorizedValue && "Can't find vectorizable value"); // For each lane: + const unsigned Opcode = Entry->State.Opcode; + const unsigned AltOpcode = getAltOpcode(Opcode); for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; + if (!sameOpcodeOrAlt(Opcode, AltOpcode, + cast(Scalar)->getOpcode())) + continue; + Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { #ifndef NDEBUG @@ -3406,7 +3571,7 @@ } for (Value *V : VL) { - ScheduleData *BundleMember = getScheduleData(V); + ScheduleData *BundleMember = getScheduleData(V, isOneOf(OpValue, V)); assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); if (BundleMember->IsScheduled) { @@ -3467,6 +3632,25 @@ schedule(pickedSD, ReadyInsts); } } + + // Avoid any vectors here that are wider than 2 elements and + // with just one real operation and others members are operands + // for such operations. + if (VL.size() > 2) { + unsigned SameOrAlt = 0; + Instruction *OpInstr = cast(OpValue); + const unsigned AltOpcode = getAltOpcode(OpInstr->getOpcode()); + for (Value *V : VL) { + Instruction *Instr = cast(V); + if (sameOpcodeOrAlt(OpInstr->getOpcode(), AltOpcode, Instr->getOpcode())) + SameOrAlt++; + } + if (SameOrAlt == 1) { + cancelScheduling(VL, OpValue); + return false; + } + } + if (!Bundle->isReady()) { cancelScheduling(VL, OpValue); return false; @@ -3479,7 +3663,7 @@ if (isa(OpValue)) return; - ScheduleData *Bundle = getScheduleData(OpValue); + ScheduleData *Bundle = getScheduleData(OpValue)->FirstInBundle; DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); assert(!Bundle->IsScheduled && "Can't cancel bundle which is already scheduled"); @@ -3782,7 +3966,7 @@ I = I->getNextNode()) { BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { assert(SD->isPartOfBundle() == - (getTreeEntry(SD->Inst) != nullptr) && + (getTreeEntry(SD->Inst, SD->OpValue) != nullptr) && "scheduler and vectorizer bundle mismatch"); SD->FirstInBundle->SchedulingPriority = Idx++; if (SD->isSchedulingEntity()) { @@ -3805,15 +3989,15 @@ ScheduleData *BundleMember = picked; while (BundleMember) { Instruction *pickedInst = BundleMember->Inst; - if (LastScheduledInst->getNextNode() != pickedInst) { - BS->BB->getInstList().remove(pickedInst); - BS->BB->getInstList().insert(LastScheduledInst->getIterator(), - pickedInst); + if (pickedInst == BundleMember->OpValue) { + if (LastScheduledInst->getNextNode() != pickedInst) { + BS->BB->getInstList().remove(pickedInst); + BS->BB->getInstList().insert(LastScheduledInst->getIterator(), pickedInst); + } + LastScheduledInst = pickedInst; } - LastScheduledInst = pickedInst; BundleMember = BundleMember->NextInBundle; } - BS->schedule(picked, ReadyInsts); NumToSchedule--; } Index: test/Transforms/SLPVectorizer/SystemZ/pr34619.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/SystemZ/pr34619.ll @@ -0,0 +1,40 @@ +; RUN: opt < %s -mtriple=systemz-unknown -mcpu=z13 -slp-vectorizer 
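+; PR34619: %add277 is stored and is also used directly as an operand of
+; %sub355.1, so one lane of the vectorizable operand bundle below is an 'add'
+; while the other lanes are loads.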
+%struct.ImageParameters.11.131.155.323.491.899.923.947.971.995.1043.1067.1091.1115.1139.1187.1235.1307.1331.1355.1379.1595.1691.1883.1907.2027.2099.2387.2411.2507.2531.2771.3179.3203.3227.3251.3275.3443.3467.3683.4115.4379.4859.6058.10.21.32.43.54.549.560 = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8**, i8**, i32, i32***, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [9 x [16 x [16 x i16]]], [5 x [16 x [16 x i16]]], [9 x [8 x [8 x i16]]], [2 x [4 x [16 x [16 x i16]]]], [16 x [16 x i16]], [16 x [16 x i32]], i32****, i32***, i32***, i32***, i32****, i32****, %struct.Picture.8.128.152.320.488.896.920.944.968.992.1040.1064.1088.1112.1136.1184.1232.1304.1328.1352.1376.1592.1688.1880.1904.2024.2096.2384.2408.2504.2528.2768.3176.3200.3224.3248.3272.3440.3464.3680.4112.4376.4856.6055.7.18.29.40.51.546.557*, %struct.Slice.7.127.151.319.487.895.919.943.967.991.1039.1063.1087.1111.1135.1183.1231.1303.1327.1351.1375.1591.1687.1879.1903.2023.2095.2383.2407.2503.2527.2767.3175.3199.3223.3247.3271.3439.3463.3679.4111.4375.4855.6054.6.17.28.39.50.545.556*, %struct.macroblock.9.129.153.321.489.897.921.945.969.993.1041.1065.1089.1113.1137.1185.1233.1305.1329.1353.1377.1593.1689.1881.1905.2025.2097.2385.2409.2505.2529.2769.3177.3201.3225.3249.3273.3441.3465.3681.4113.4377.4857.6056.8.19.30.41.52.547.558*, i32*, i32*, i32, i32, i32, i32, [4 x [4 x i32]], i32, i32, i32, i32, i32, double, i32, i32, i32, i32, i16******, i16******, i16******, i16******, [15 x i16], i32, i32, i32, i32, i32, i32, i32, i32, [6 x [32 x i32]], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [1 x i32], i32, i32, [2 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, %struct.DecRefPicMarking_s.10.130.154.322.490.898.922.946.970.994.1042.1066.1090.1114.1138.1186.1234.1306.1330.1354.1378.1594.1690.1882.1906.2026.2098.2386.2410.2506.2530.2770.3178.3202.3226.3250.3274.3442.3466.3682.4114.4378.4858.6057.9.20.31.42.53.548.559*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, double**, double***, i32***, double**, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [3 x [2 x i32]], [2 x i32], i32, i32, i16, i32, i32, i32, i32, i32 } +%struct.Picture.8.128.152.320.488.896.920.944.968.992.1040.1064.1088.1112.1136.1184.1232.1304.1328.1352.1376.1592.1688.1880.1904.2024.2096.2384.2408.2504.2528.2768.3176.3200.3224.3248.3272.3440.3464.3680.4112.4376.4856.6055.7.18.29.40.51.546.557 = type { i32, i32, [100 x %struct.Slice.7.127.151.319.487.895.919.943.967.991.1039.1063.1087.1111.1135.1183.1231.1303.1327.1351.1375.1591.1687.1879.1903.2023.2095.2383.2407.2503.2527.2767.3175.3199.3223.3247.3271.3439.3463.3679.4111.4375.4855.6054.6.17.28.39.50.545.556*], i32, float, float, float } +%struct.Slice.7.127.151.319.487.895.919.943.967.991.1039.1063.1087.1111.1135.1183.1231.1303.1327.1351.1375.1591.1687.1879.1903.2023.2095.2383.2407.2503.2527.2767.3175.3199.3223.3247.3271.3439.3463.3679.4111.4375.4855.6054.6.17.28.39.50.545.556 = type { i32, i32, i32, i32, i32, i32, %struct.datapartition.3.123.147.315.483.891.915.939.963.987.1035.1059.1083.1107.1131.1179.1227.1299.1323.1347.1371.1587.1683.1875.1899.2019.2091.2379.2403.2499.2523.2763.3171.3195.3219.3243.3267.3435.3459.3675.4107.4371.4851.6050.2.13.24.35.46.541.552*, 
%struct.MotionInfoContexts.5.125.149.317.485.893.917.941.965.989.1037.1061.1085.1109.1133.1181.1229.1301.1325.1349.1373.1589.1685.1877.1901.2021.2093.2381.2405.2501.2525.2765.3173.3197.3221.3245.3269.3437.3461.3677.4109.4373.4853.6052.4.15.26.37.48.543.554*, %struct.TextureInfoContexts.6.126.150.318.486.894.918.942.966.990.1038.1062.1086.1110.1134.1182.1230.1302.1326.1350.1374.1590.1686.1878.1902.2022.2094.2382.2406.2502.2526.2766.3174.3198.3222.3246.3270.3438.3462.3678.4110.4374.4854.6053.5.16.27.38.49.544.555*, i32, i32*, i32*, i32*, i32, i32*, i32*, i32*, i32 (i32)*, [3 x [2 x i32]] } +%struct.datapartition.3.123.147.315.483.891.915.939.963.987.1035.1059.1083.1107.1131.1179.1227.1299.1323.1347.1371.1587.1683.1875.1899.2019.2091.2379.2403.2499.2523.2763.3171.3195.3219.3243.3267.3435.3459.3675.4107.4371.4851.6050.2.13.24.35.46.541.552 = type { %struct.Bitstream.1.121.145.313.481.889.913.937.961.985.1033.1057.1081.1105.1129.1177.1225.1297.1321.1345.1369.1585.1681.1873.1897.2017.2089.2377.2401.2497.2521.2761.3169.3193.3217.3241.3265.3433.3457.3673.4105.4369.4849.6048.0.11.22.33.44.539.550*, %struct.EncodingEnvironment.2.122.146.314.482.890.914.938.962.986.1034.1058.1082.1106.1130.1178.1226.1298.1322.1346.1370.1586.1682.1874.1898.2018.2090.2378.2402.2498.2522.2762.3170.3194.3218.3242.3266.3434.3458.3674.4106.4370.4850.6049.1.12.23.34.45.540.551, %struct.EncodingEnvironment.2.122.146.314.482.890.914.938.962.986.1034.1058.1082.1106.1130.1178.1226.1298.1322.1346.1370.1586.1682.1874.1898.2018.2090.2378.2402.2498.2522.2762.3170.3194.3218.3242.3266.3434.3458.3674.4106.4370.4850.6049.1.12.23.34.45.540.551 } +%struct.Bitstream.1.121.145.313.481.889.913.937.961.985.1033.1057.1081.1105.1129.1177.1225.1297.1321.1345.1369.1585.1681.1873.1897.2017.2089.2377.2401.2497.2521.2761.3169.3193.3217.3241.3265.3433.3457.3673.4105.4369.4849.6048.0.11.22.33.44.539.550 = type { i32, i32, i8, i32, i32, i8, i8, i32, i32, i8*, i32 } +%struct.EncodingEnvironment.2.122.146.314.482.890.914.938.962.986.1034.1058.1082.1106.1130.1178.1226.1298.1322.1346.1370.1586.1682.1874.1898.2018.2090.2378.2402.2498.2522.2762.3170.3194.3218.3242.3266.3434.3458.3674.4106.4370.4850.6049.1.12.23.34.45.540.551 = type { i32, i32, i32, i32, i32, i8*, i32*, i32, i32 } +%struct.MotionInfoContexts.5.125.149.317.485.893.917.941.965.989.1037.1061.1085.1109.1133.1181.1229.1301.1325.1349.1373.1589.1685.1877.1901.2021.2093.2381.2405.2501.2525.2765.3173.3197.3221.3245.3269.3437.3461.3677.4109.4373.4853.6052.4.15.26.37.48.543.554 = type { [3 x [11 x %struct.BiContextType.4.124.148.316.484.892.916.940.964.988.1036.1060.1084.1108.1132.1180.1228.1300.1324.1348.1372.1588.1684.1876.1900.2020.2092.2380.2404.2500.2524.2764.3172.3196.3220.3244.3268.3436.3460.3676.4108.4372.4852.6051.3.14.25.36.47.542.553]], [2 x [9 x %struct.BiContextType.4.124.148.316.484.892.916.940.964.988.1036.1060.1084.1108.1132.1180.1228.1300.1324.1348.1372.1588.1684.1876.1900.2020.2092.2380.2404.2500.2524.2764.3172.3196.3220.3244.3268.3436.3460.3676.4108.4372.4852.6051.3.14.25.36.47.542.553]], [2 x [10 x %struct.BiContextType.4.124.148.316.484.892.916.940.964.988.1036.1060.1084.1108.1132.1180.1228.1300.1324.1348.1372.1588.1684.1876.1900.2020.2092.2380.2404.2500.2524.2764.3172.3196.3220.3244.3268.3436.3460.3676.4108.4372.4852.6051.3.14.25.36.47.542.553]], [2 x [6 x 
%struct.BiContextType.4.124.148.316.484.892.916.940.964.988.1036.1060.1084.1108.1132.1180.1228.1300.1324.1348.1372.1588.1684.1876.1900.2020.2092.2380.2404.2500.2524.2764.3172.3196.3220.3244.3268.3436.3460.3676.4108.4372.4852.6051.3.14.25.36.47.542.553]], [4 x %struct.BiContextType.4.124.148.316.484.892.916.940.964.988.1036.1060.1084.1108.1132.1180.1228.1300.1324.1348.1372.1588.1684.1876.1900.2020.2092.2380.2404.2500.2524.2764.3172.3196.3220.3244.3268.3436.3460.3676.4108.4372.4852.6051.3.14.25.36.47.542.553], [4 x %struct.BiContextType.4.124.148.316.484.892.916.940.964.988.1036.1060.1084.1108.1132.1180.1228.1300.1324.1348.1372.1588.1684.1876.1900.2020.2092.2380.2404.2500.2524.2764.3172.3196.3220.3244.3268.3436.3460.3676.4108.4372.4852.6051.3.14.25.36.47.542.553], [3 x %struct.BiContextType.4.124.148.316.484.892.916.940.964.988.1036.1060.1084.1108.1132.1180.1228.1300.1324.1348.1372.1588.1684.1876.1900.2020.2092.2380.2404.2500.2524.2764.3172.3196.3220.3244.3268.3436.3460.3676.4108.4372.4852.6051.3.14.25.36.47.542.553] } +%struct.BiContextType.4.124.148.316.484.892.916.940.964.988.1036.1060.1084.1108.1132.1180.1228.1300.1324.1348.1372.1588.1684.1876.1900.2020.2092.2380.2404.2500.2524.2764.3172.3196.3220.3244.3268.3436.3460.3676.4108.4372.4852.6051.3.14.25.36.47.542.553 = type { i16, i8, i64 } +%struct.TextureInfoContexts.6.126.150.318.486.894.918.942.966.990.1038.1062.1086.1110.1134.1182.1230.1302.1326.1350.1374.1590.1686.1878.1902.2022.2094.2382.2406.2502.2526.2766.3174.3198.3222.3246.3270.3438.3462.3678.4110.4374.4854.6053.5.16.27.38.49.544.555 = type { [2 x %struct.BiContextType.4.124.148.316.484.892.916.940.964.988.1036.1060.1084.1108.1132.1180.1228.1300.1324.1348.1372.1588.1684.1876.1900.2020.2092.2380.2404.2500.2524.2764.3172.3196.3220.3244.3268.3436.3460.3676.4108.4372.4852.6051.3.14.25.36.47.542.553], [4 x %struct.BiContextType.4.124.148.316.484.892.916.940.964.988.1036.1060.1084.1108.1132.1180.1228.1300.1324.1348.1372.1588.1684.1876.1900.2020.2092.2380.2404.2500.2524.2764.3172.3196.3220.3244.3268.3436.3460.3676.4108.4372.4852.6051.3.14.25.36.47.542.553], [3 x [4 x %struct.BiContextType.4.124.148.316.484.892.916.940.964.988.1036.1060.1084.1108.1132.1180.1228.1300.1324.1348.1372.1588.1684.1876.1900.2020.2092.2380.2404.2500.2524.2764.3172.3196.3220.3244.3268.3436.3460.3676.4108.4372.4852.6051.3.14.25.36.47.542.553]], [10 x [4 x %struct.BiContextType.4.124.148.316.484.892.916.940.964.988.1036.1060.1084.1108.1132.1180.1228.1300.1324.1348.1372.1588.1684.1876.1900.2020.2092.2380.2404.2500.2524.2764.3172.3196.3220.3244.3268.3436.3460.3676.4108.4372.4852.6051.3.14.25.36.47.542.553]], [10 x [15 x %struct.BiContextType.4.124.148.316.484.892.916.940.964.988.1036.1060.1084.1108.1132.1180.1228.1300.1324.1348.1372.1588.1684.1876.1900.2020.2092.2380.2404.2500.2524.2764.3172.3196.3220.3244.3268.3436.3460.3676.4108.4372.4852.6051.3.14.25.36.47.542.553]], [10 x [15 x %struct.BiContextType.4.124.148.316.484.892.916.940.964.988.1036.1060.1084.1108.1132.1180.1228.1300.1324.1348.1372.1588.1684.1876.1900.2020.2092.2380.2404.2500.2524.2764.3172.3196.3220.3244.3268.3436.3460.3676.4108.4372.4852.6051.3.14.25.36.47.542.553]], [10 x [5 x %struct.BiContextType.4.124.148.316.484.892.916.940.964.988.1036.1060.1084.1108.1132.1180.1228.1300.1324.1348.1372.1588.1684.1876.1900.2020.2092.2380.2404.2500.2524.2764.3172.3196.3220.3244.3268.3436.3460.3676.4108.4372.4852.6051.3.14.25.36.47.542.553]], [10 x [5 x 
%struct.BiContextType.4.124.148.316.484.892.916.940.964.988.1036.1060.1084.1108.1132.1180.1228.1300.1324.1348.1372.1588.1684.1876.1900.2020.2092.2380.2404.2500.2524.2764.3172.3196.3220.3244.3268.3436.3460.3676.4108.4372.4852.6051.3.14.25.36.47.542.553]], [10 x [15 x %struct.BiContextType.4.124.148.316.484.892.916.940.964.988.1036.1060.1084.1108.1132.1180.1228.1300.1324.1348.1372.1588.1684.1876.1900.2020.2092.2380.2404.2500.2524.2764.3172.3196.3220.3244.3268.3436.3460.3676.4108.4372.4852.6051.3.14.25.36.47.542.553]], [10 x [15 x %struct.BiContextType.4.124.148.316.484.892.916.940.964.988.1036.1060.1084.1108.1132.1180.1228.1300.1324.1348.1372.1588.1684.1876.1900.2020.2092.2380.2404.2500.2524.2764.3172.3196.3220.3244.3268.3436.3460.3676.4108.4372.4852.6051.3.14.25.36.47.542.553]] } +%struct.macroblock.9.129.153.321.489.897.921.945.969.993.1041.1065.1089.1113.1137.1185.1233.1305.1329.1353.1377.1593.1689.1881.1905.2025.2097.2385.2409.2505.2529.2769.3177.3201.3225.3249.3273.3441.3465.3681.4113.4377.4857.6056.8.19.30.41.52.547.558 = type { i32, i32, i32, [2 x i32], i32, [8 x i32], %struct.macroblock.9.129.153.321.489.897.921.945.969.993.1041.1065.1089.1113.1137.1185.1233.1305.1329.1353.1377.1593.1689.1881.1905.2025.2097.2385.2409.2505.2529.2769.3177.3201.3225.3249.3273.3441.3465.3681.4113.4377.4857.6056.8.19.30.41.52.547.558*, %struct.macroblock.9.129.153.321.489.897.921.945.969.993.1041.1065.1089.1113.1137.1185.1233.1305.1329.1353.1377.1593.1689.1881.1905.2025.2097.2385.2409.2505.2529.2769.3177.3201.3225.3249.3273.3441.3465.3681.4113.4377.4857.6056.8.19.30.41.52.547.558*, i32, [2 x [4 x [4 x [2 x i32]]]], [16 x i8], [16 x i8], i32, i64, [4 x i32], [4 x i32], i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i16, double, i32, i32, i32, i32, i32, i32, i32, i32, i32 } +%struct.DecRefPicMarking_s.10.130.154.322.490.898.922.946.970.994.1042.1066.1090.1114.1138.1186.1234.1306.1330.1354.1378.1594.1690.1882.1906.2026.2098.2386.2410.2506.2530.2770.3178.3202.3226.3250.3274.3442.3466.3682.4114.4378.4858.6057.9.20.31.42.53.548.559 = type { i32, i32, i32, i32, i32, %struct.DecRefPicMarking_s.10.130.154.322.490.898.922.946.970.994.1042.1066.1090.1114.1138.1186.1234.1306.1330.1354.1378.1594.1690.1882.1906.2026.2098.2386.2410.2506.2530.2770.3178.3202.3226.3250.3274.3442.3466.3682.4114.4378.4858.6057.9.20.31.42.53.548.559* } + +@dct_luma.m4 = external global [4 x [4 x i32]], align 4 + +define void @dct_luma() local_unnamed_addr { +entry: + %add277 = add nsw i32 undef, undef + store i32 %add277, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma.m4, i64 0, i64 3, i64 1), align 4 + %0 = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma.m4, i64 0, i64 3, i64 0), align 4 + %sub355 = add nsw i32 undef, %0 + %shr.i = ashr i32 %sub355, 6 + %arrayidx372 = getelementptr inbounds %struct.ImageParameters.11.131.155.323.491.899.923.947.971.995.1043.1067.1091.1115.1139.1187.1235.1307.1331.1355.1379.1595.1691.1883.1907.2027.2099.2387.2411.2507.2531.2771.3179.3203.3227.3251.3275.3443.3467.3683.4115.4379.4859.6058.10.21.32.43.54.549.560, %struct.ImageParameters.11.131.155.323.491.899.923.947.971.995.1043.1067.1091.1115.1139.1187.1235.1307.1331.1355.1379.1595.1691.1883.1907.2027.2099.2387.2411.2507.2531.2771.3179.3203.3227.3251.3275.3443.3467.3683.4115.4379.4859.6058.10.21.32.43.54.549.560* undef, i64 0, i32 52, i64 2, i64 0 + store i32 %shr.i, i32* %arrayidx372, align 4 + %sub355.1 = add nsw i32 undef, %add277 + %shr.i.1 = ashr i32 
%sub355.1, 6 + %arrayidx372.1 = getelementptr inbounds %struct.ImageParameters.11.131.155.323.491.899.923.947.971.995.1043.1067.1091.1115.1139.1187.1235.1307.1331.1355.1379.1595.1691.1883.1907.2027.2099.2387.2411.2507.2531.2771.3179.3203.3227.3251.3275.3443.3467.3683.4115.4379.4859.6058.10.21.32.43.54.549.560, %struct.ImageParameters.11.131.155.323.491.899.923.947.971.995.1043.1067.1091.1115.1139.1187.1235.1307.1331.1355.1379.1595.1691.1883.1907.2027.2099.2387.2411.2507.2531.2771.3179.3203.3227.3251.3275.3443.3467.3683.4115.4379.4859.6058.10.21.32.43.54.549.560* undef, i64 0, i32 52, i64 2, i64 1 + store i32 %shr.i.1, i32* %arrayidx372.1, align 4 + %1 = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma.m4, i64 0, i64 3, i64 2), align 4 + %sub355.2 = add nsw i32 undef, %1 + %shr.i.2 = ashr i32 %sub355.2, 6 + %arrayidx372.2 = getelementptr inbounds %struct.ImageParameters.11.131.155.323.491.899.923.947.971.995.1043.1067.1091.1115.1139.1187.1235.1307.1331.1355.1379.1595.1691.1883.1907.2027.2099.2387.2411.2507.2531.2771.3179.3203.3227.3251.3275.3443.3467.3683.4115.4379.4859.6058.10.21.32.43.54.549.560, %struct.ImageParameters.11.131.155.323.491.899.923.947.971.995.1043.1067.1091.1115.1139.1187.1235.1307.1331.1355.1379.1595.1691.1883.1907.2027.2099.2387.2411.2507.2531.2771.3179.3203.3227.3251.3275.3443.3467.3683.4115.4379.4859.6058.10.21.32.43.54.549.560* undef, i64 0, i32 52, i64 2, i64 2 + store i32 %shr.i.2, i32* %arrayidx372.2, align 4 + %2 = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma.m4, i64 0, i64 3, i64 3), align 4 + %sub355.3 = add nsw i32 undef, %2 + %shr.i.3 = ashr i32 %sub355.3, 6 + %arrayidx372.3 = getelementptr inbounds %struct.ImageParameters.11.131.155.323.491.899.923.947.971.995.1043.1067.1091.1115.1139.1187.1235.1307.1331.1355.1379.1595.1691.1883.1907.2027.2099.2387.2411.2507.2531.2771.3179.3203.3227.3251.3275.3443.3467.3683.4115.4379.4859.6058.10.21.32.43.54.549.560, %struct.ImageParameters.11.131.155.323.491.899.923.947.971.995.1043.1067.1091.1115.1139.1187.1235.1307.1331.1355.1379.1595.1691.1883.1907.2027.2099.2387.2411.2507.2531.2771.3179.3203.3227.3251.3275.3443.3467.3683.4115.4379.4859.6058.10.21.32.43.54.549.560* undef, i64 0, i32 52, i64 2, i64 3 + store i32 %shr.i.3, i32* %arrayidx372.3, align 4 + unreachable +} Index: test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll +++ test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll @@ -43,22 +43,16 @@ ; CHECK-LABEL: @add1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP1]], 1 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[ADD3]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP2]], 2 ; CHECK-NEXT: 
[[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT: store i32 [[ADD6]], i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3
-; CHECK-NEXT: store i32 [[ADD9]], i32* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> , [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -86,22 +80,16 @@
 ; CHECK-LABEL: @sub0(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
-; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
-; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2
 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[SUB8:%.*]] = add nsw i32 [[TMP3]], -3
-; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> , [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -205,22 +193,18 @@
 ; CHECK-LABEL: @addsub0(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
-; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
-; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2
 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3
-; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]],
+; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]],
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -248,22 +232,18 @@
 ; CHECK-LABEL: @addsub1(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
-; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1
 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
-; CHECK-NEXT: store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3
-; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]],
+; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]],
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -291,22 +271,16 @@
 ; CHECK-LABEL: @mul(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 257
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
-; CHECK-NEXT: store i32 [[MUL]], i32* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP1]], -3
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
-; CHECK-NEXT: store i32 [[MUL3]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
-; CHECK-NEXT: store i32 [[MUL9]], i32* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> , [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -334,22 +308,16 @@
 ; CHECK-LABEL: @shl0(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
-; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP1]], 1
 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
-; CHECK-NEXT: store i32 [[SHL]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[SHL5:%.*]] = shl i32 [[TMP2]], 2
 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT: store i32 [[SHL5]], i32* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3
-; CHECK-NEXT: store i32 [[SHL8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[TMP1]],
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -453,22 +421,16 @@
 ; CHECK-LABEL: @add1f(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT: store float [[TMP0]], float* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[TMP1]], 1.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT: store float [[ADD3]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], 2.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00
-; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> , [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -496,22 +458,16 @@
 ; CHECK-LABEL: @sub0f(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], -3.000000e+00
-; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> , [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -615,22 +571,18 @@
 ; CHECK-LABEL: @addsub0f(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: store float [[SUB5]], float* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
-; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]],
+; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]],
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -658,22 +610,18 @@
 ; CHECK-LABEL: @addsub1f(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT: store float [[SUB1]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
-; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]],
+; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]],
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -701,22 +649,16 @@
 ; CHECK-LABEL: @mulf(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = fmul fast float [[TMP0]], 2.570000e+02
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[SUB3:%.*]] = fmul fast float [[TMP1]], -3.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT: store float [[SUB3]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
-; CHECK-NEXT: store float [[SUB9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> , [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -825,22 +767,16 @@
 ; CHECK-LABEL: @sub0fn(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[TMP2]], -2.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], -3.000000e+00
-; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> , [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry: