Index: include/llvm/IR/Instructions.h
===================================================================
--- include/llvm/IR/Instructions.h
+++ include/llvm/IR/Instructions.h
@@ -2237,6 +2237,12 @@
     return Mask;
   }
 
+  /// Determine if the shuffle mask is a splat, possibly with undefined mask
+  /// indices as well. Returns true if the same shuffle index is found in all
+  /// defined elements and optionally returns the splat index. Returns false
+  /// if the mask is not a splat or all mask indices are undefined.
+  bool isSplat(int *SplatIndex = nullptr) const;
+
   // Methods for support type inquiry through isa, cast, and dyn_cast:
   static inline bool classof(const Instruction *I) {
     return I->getOpcode() == Instruction::ShuffleVector;
Index: lib/Analysis/CostModel.cpp
===================================================================
--- lib/Analysis/CostModel.cpp
+++ lib/Analysis/CostModel.cpp
@@ -495,6 +495,10 @@
       SmallVector<int, 16> Mask = Shuffle->getShuffleMask();
 
       if (NumVecElems == Mask.size()) {
+        int BroadcastIndex = -1;
+        if (Shuffle->isSplat(&BroadcastIndex) && BroadcastIndex == 0)
+          return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast,
+                                     VecTypOp0, 0, nullptr);
         if (isReverseVectorMask(Mask))
           return TTI->getShuffleCost(TargetTransformInfo::SK_Reverse,
                                      VecTypOp0, 0, nullptr);
Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -4389,7 +4389,7 @@
   I->insertAfter(LI);
   // CGP does not check if the zext would be speculatively executed when moved
   // to the same basic block as the load. Preserving its original location would
-  // pessimize the debugging experience, as well as negatively impact the 
+  // pessimize the debugging experience, as well as negatively impact the
   // quality of sample pgo. We don't want to use "line 0" as that has a
   // size cost in the line-table section and logically the zext can be seen as
   // part of the load. Therefore we conservatively reuse the same debug location
@@ -4881,18 +4881,6 @@
   return true;
 }
 
-static bool isBroadcastShuffle(ShuffleVectorInst *SVI) {
-  SmallVector<int, 16> Mask(SVI->getShuffleMask());
-  int SplatElem = -1;
-  for (unsigned i = 0; i < Mask.size(); ++i) {
-    if (SplatElem != -1 && Mask[i] != -1 && Mask[i] != SplatElem)
-      return false;
-    SplatElem = Mask[i];
-  }
-
-  return true;
-}
-
 /// Some targets have expensive vector shifts if the lanes aren't all the same
 /// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases
 /// it's often worth sinking a shufflevector splat down to its use so that
@@ -4906,7 +4894,7 @@
 
   // We only expect better codegen by sinking a shuffle if we can recognise a
   // constant splat.
-  if (!isBroadcastShuffle(SVI))
+  if (!SVI->isSplat())
     return false;
 
   // InsertedShuffles - Only insert a shuffle in each block once.
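The new accessor is meant to subsume ad-hoc splat checks like the isBroadcastShuffle() helper removed above. As a sketch of the intended call pattern, mirroring the CostModel.cpp hunk (the wrapper function below is hypothetical and not part of the patch):

#include "llvm/IR/Instructions.h"

using namespace llvm;

// Hypothetical helper: true only for a splat of element 0, the one shuffle
// kind the CostModel.cpp change maps onto TTI::SK_Broadcast. Note that
// isSplat() skips undef mask indices, so <0, undef, 0, 0> still counts as a
// splat of element 0, while an all-undef mask is rejected.
static bool isZeroEltBroadcast(const ShuffleVectorInst *SVI) {
  int SplatIndex = -1;
  return SVI->isSplat(&SplatIndex) && SplatIndex == 0;
}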
Index: lib/IR/Instructions.cpp
===================================================================
--- lib/IR/Instructions.cpp
+++ lib/IR/Instructions.cpp
@@ -258,7 +258,7 @@
          "Calling a function with bad signature!");
 
   for (unsigned i = 0; i != Args.size(); ++i)
-    assert((i >= FTy->getNumParams() || 
+    assert((i >= FTy->getNumParams() ||
             FTy->getParamType(i) == Args[i]->getType()) &&
            "Calling a function with a bad signature!");
 #endif
@@ -340,7 +340,7 @@
   if (F->getAttributes().hasAttrSomewhere(Attribute::Returned, &Index) &&
       Index)
     return getArgOperand(Index-1);
-      
+
   return nullptr;
 }
@@ -424,7 +424,7 @@
   assert(((!InsertBefore && InsertAtEnd) || (InsertBefore && !InsertAtEnd)) &&
          "createMalloc needs either InsertBefore or InsertAtEnd");
 
-  // malloc(type) becomes: 
+  // malloc(type) becomes:
   //       bitcast (i8* malloc(typeSize)) to type*
   // malloc(type, arraySize) becomes:
   //       bitcast (i8* malloc(typeSize*arraySize)) to type*
@@ -531,7 +531,7 @@
 /// responsibility of the caller.
 Instruction *CallInst::CreateMalloc(BasicBlock *InsertAtEnd,
                                     Type *IntPtrTy, Type *AllocTy,
-                                    Value *AllocSize, Value *ArraySize, 
+                                    Value *AllocSize, Value *ArraySize,
                                     Function *MallocF, const Twine &Name) {
   return createMalloc(nullptr, InsertAtEnd, IntPtrTy, AllocTy, AllocSize,
                       ArraySize, None, MallocF, Name);
@@ -627,7 +627,7 @@
          "Invoking a function with bad signature");
 
   for (unsigned i = 0, e = Args.size(); i != e; i++)
-    assert((i >= FTy->getNumParams() || 
+    assert((i >= FTy->getNumParams() ||
             FTy->getParamType(i) == Args[i]->getType()) &&
            "Invoking a function with a bad signature!");
 #endif
@@ -687,7 +687,7 @@
   if (F->getAttributes().hasAttrSomewhere(Attribute::Returned, &Index) &&
       Index)
     return getArgOperand(Index-1);
-      
+
   return nullptr;
 }
@@ -1075,7 +1075,7 @@
 //                      UnreachableInst Implementation
 //===----------------------------------------------------------------------===//
 
-UnreachableInst::UnreachableInst(LLVMContext &Context, 
+UnreachableInst::UnreachableInst(LLVMContext &Context,
                                  Instruction *InsertBefore)
   : TerminatorInst(Type::getVoidTy(Context), Instruction::Unreachable,
                    nullptr, 0, InsertBefore) {
@@ -1258,7 +1258,7 @@
 bool AllocaInst::isStaticAlloca() const {
   // Must be constant size.
   if (!isa<ConstantInt>(getArraySize())) return false;
-  
+
   // Must be in the entry block.
   const BasicBlock *Parent = getParent();
   return Parent == &Parent->getParent()->front() && !isUsedWithInAlloca();
@@ -1311,7 +1311,7 @@
   setName(Name);
 }
 
-LoadInst::LoadInst(Value *Ptr, const Twine &Name, bool isVolatile, 
+LoadInst::LoadInst(Value *Ptr, const Twine &Name, bool isVolatile,
                    unsigned Align, AtomicOrdering Order,
                    SynchronizationScope SynchScope,
                    BasicBlock *InsertAE)
@@ -1568,7 +1568,7 @@
 //                       FenceInst Implementation
 //===----------------------------------------------------------------------===//
 
-FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering, 
+FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering,
                      SynchronizationScope SynchScope,
                      Instruction *InsertBefore)
   : Instruction(Type::getVoidTy(C), Fence, nullptr, 0, InsertBefore) {
@@ -1576,7 +1576,7 @@
   setSynchScope(SynchScope);
 }
 
-FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering, 
+FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering,
                      SynchronizationScope SynchScope,
                      BasicBlock *InsertAtEnd)
   : Instruction(Type::getVoidTy(C), Fence, nullptr, 0, InsertAtEnd) {
@@ -1767,14 +1767,14 @@
   setName(Name);
 }
 
-bool InsertElementInst::isValidOperands(const Value *Vec, const Value *Elt, 
+bool InsertElementInst::isValidOperands(const Value *Vec, const Value *Elt,
                                         const Value *Index) {
   if (!Vec->getType()->isVectorTy())
     return false;   // First operand of insertelement must be vector type.
-  
+
   if (Elt->getType() != cast<VectorType>(Vec->getType())->getElementType())
     return false;// Second operand of insertelement must be vector element type.
-    
+
   if (!Index->getType()->isIntegerTy())
     return false;  // Third operand of insertelement must be i32.
   return true;
@@ -1825,7 +1825,7 @@
   // V1 and V2 must be vectors of the same type.
   if (!V1->getType()->isVectorTy() || V1->getType() != V2->getType())
     return false;
-  
+
   // Mask must be vector of i32.
   VectorType *MaskTy = dyn_cast<VectorType>(Mask->getType());
   if (!MaskTy || !MaskTy->getElementType()->isIntegerTy(32))
@@ -1847,7 +1847,7 @@
     }
     return true;
   }
-  
+
   if (const ConstantDataSequential *CDS =
         dyn_cast<ConstantDataSequential>(Mask)) {
     unsigned V1Size = cast<VectorType>(V1->getType())->getNumElements();
@@ -1856,7 +1856,7 @@
         return false;
     return true;
   }
-  
+
   // The bitcode reader can create a place holder for a forward reference
   // used as the shuffle mask. When this occurs, the shuffle mask will
   // fall into this case and fail. To avoid this error, do this bit of
@@ -1881,12 +1881,12 @@
 void ShuffleVectorInst::getShuffleMask(Constant *Mask,
                                        SmallVectorImpl<int> &Result) {
   unsigned NumElts = Mask->getType()->getVectorNumElements();
-  
+
   if (ConstantDataSequential *CDS=dyn_cast<ConstantDataSequential>(Mask)) {
     for (unsigned i = 0; i != NumElts; ++i)
       Result.push_back(CDS->getElementAsInteger(i));
     return;
-  } 
+  }
   for (unsigned i = 0; i != NumElts; ++i) {
     Constant *C = Mask->getAggregateElement(i);
     Result.push_back(isa<UndefValue>(C) ? -1 :
@@ -1894,12 +1894,29 @@
   }
 }
 
+bool ShuffleVectorInst::isSplat(int *SplatIndex /* = nullptr */) const {
+  SmallVector<int, 16> Mask;
+  getShuffleMask(Mask);
+
+  int SplatElem = -1;
+  for (int M : Mask) {
+    if (M < 0)
+      continue;
+    if (0 <= SplatElem && SplatElem != M)
+      return false;
+    SplatElem = M;
+  }
+
+  if (0 <= SplatElem && SplatIndex)
+    *SplatIndex = SplatElem;
+  return (0 <= SplatElem);
+}
+
 //===----------------------------------------------------------------------===//
 //                             InsertValueInst Class
 //===----------------------------------------------------------------------===//
 
-void InsertValueInst::init(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs, 
+void InsertValueInst::init(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs,
                            const Twine &Name) {
   assert(getNumOperands() == 2 && "NumOperands not initialized?");
@@ -1996,7 +2013,7 @@
   setName(Name);
 }
 
-BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2, 
+BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2,
                                Type *Ty, const Twine &Name,
                                BasicBlock *InsertAtEnd)
   : Instruction(Ty, iType,
@@ -2032,11 +2049,11 @@
            "Tried to create a floating-point operation on a "
            "non-floating-point type!");
     break;
-  case UDiv: 
-  case SDiv: 
+  case UDiv:
+  case SDiv:
    assert(getType() == LHS->getType() &&
           "Arithmetic operation should return same type as operands!");
-    assert((getType()->isIntegerTy() || (getType()->isVectorTy() && 
+    assert((getType()->isIntegerTy() || (getType()->isVectorTy() &&
            cast<VectorType>(getType())->getElementType()->isIntegerTy())) &&
           "Incorrect operand type (not integer) for S/UDIV");
     break;
@@ -2046,11 +2063,11 @@
     assert(getType()->isFPOrFPVectorTy() &&
            "Incorrect operand type (not floating point) for FDIV");
     break;
-  case URem: 
-  case SRem: 
+  case URem:
+  case SRem:
    assert(getType() == LHS->getType() &&
           "Arithmetic operation should return same type as operands!");
-    assert((getType()->isIntegerTy() || (getType()->isVectorTy() && 
+    assert((getType()->isIntegerTy() || (getType()->isVectorTy() &&
            cast<VectorType>(getType())->getElementType()->isIntegerTy())) &&
           "Incorrect operand type (not integer) for S/UREM");
     break;
@@ -2066,7 +2083,7 @@
     assert(getType() == LHS->getType() &&
            "Shift operation should return same type as operands!");
     assert((getType()->isIntegerTy() ||
-            (getType()->isVectorTy() && 
+            (getType()->isVectorTy() &&
              cast<VectorType>(getType())->getElementType()->isIntegerTy())) &&
            "Tried to create a shift operation on a non-integral type!");
     break;
@@ -2075,7 +2092,7 @@
     assert(getType() == LHS->getType() &&
            "Logical operation should return same type as operands!");
     assert((getType()->isIntegerTy() ||
-            (getType()->isVectorTy() && 
+            (getType()->isVectorTy() &&
              cast<VectorType>(getType())->getElementType()->isIntegerTy())) &&
            "Tried to create a logical operation on a non-integral type!");
     break;
@@ -2292,7 +2309,7 @@
   Type *DstTy = getType();
   if (SrcTy == DstTy)
     return true;
-  
+
   // Pointer to pointer is always lossless.
   if (SrcTy->isPointerTy())
     return DstTy->isPointerTy();
@@ -2301,10 +2318,10 @@
 
 /// This function determines if the CastInst does not require any bits to be
 /// changed in order to effect the cast. Essentially, it identifies cases where
-/// no code gen is necessary for the cast, hence the name no-op cast. For 
+/// no code gen is necessary for the cast, hence the name no-op cast. For
 /// example, the following are all no-op casts:
 /// # bitcast i32* %x to i8*
-/// # bitcast <2 x i32> %x to <4 x i16> 
+/// # bitcast <2 x i32> %x to <4 x i16>
 /// # ptrtoint i32* %x to i32     ; on 32-bit plaforms only
 /// @brief Determine if the described cast is a no-op.
 bool CastInst::isNoopCast(Instruction::CastOps Opcode,
@@ -2315,7 +2332,7 @@
     default: llvm_unreachable("Invalid CastOp");
     case Instruction::Trunc:
     case Instruction::ZExt:
-    case Instruction::SExt: 
+    case Instruction::SExt:
     case Instruction::FPTrunc:
     case Instruction::FPExt:
     case Instruction::UIToFP:
@@ -2368,7 +2385,7 @@
                                         Type *DstIntPtrTy) {
   // Define the 144 possibilities for these two cast instructions. The values
   // in this matrix determine what to do in a given situation and select the
-  // case in the switch below.  The rows correspond to firstOp, the columns 
+  // case in the switch below.  The rows correspond to firstOp, the columns
   // correspond to secondOp.  In looking at the table below, keep in mind
   // the following cast properties:
   //
@@ -2436,16 +2453,16 @@
   int ElimCase = CastResults[firstOp-Instruction::CastOpsBegin]
                             [secondOp-Instruction::CastOpsBegin];
   switch (ElimCase) {
-    case 0: 
+    case 0:
       // Categorically disallowed.
       return 0;
-    case 1: 
+    case 1:
       // Allowed, use first cast's opcode.
       return firstOp;
-    case 2: 
+    case 2:
       // Allowed, use second cast's opcode.
       return secondOp;
-    case 3: 
+    case 3:
       // No-op cast in second op implies firstOp as long as the DestTy
       // is integer and we are not converting between a vector and a
      // non-vector type.
@@ -2458,7 +2475,7 @@
      if (DstTy->isFloatingPointTy())
        return firstOp;
      return 0;
-    case 5: 
+    case 5:
      // No-op cast in first op implies secondOp as long as the SrcTy
      // is an integer.
      if (SrcTy->isIntegerTy())
@@ -2578,7 +2595,7 @@
     case 17:
       // (sitofp (zext x)) -> (uitofp x)
       return Instruction::UIToFP;
-    case 99: 
+    case 99:
       // Cast combination can't happen (error in input). This is for all cases
       // where the MidTy is not the same for the two cast instructions.
       llvm_unreachable("Invalid Cast Combination");
@@ -2587,7 +2604,7 @@
   }
 }
 
-CastInst *CastInst::Create(Instruction::CastOps op, Value *S, Type *Ty, 
+CastInst *CastInst::Create(Instruction::CastOps op, Value *S, Type *Ty,
                            const Twine &Name, Instruction *InsertBefore) {
   assert(castIsValid(op, S, Ty) && "Invalid cast!");
   // Construct and return the appropriate CastInst subclass
@@ -2631,7 +2648,7 @@
   }
 }
 
-CastInst *CastInst::CreateZExtOrBitCast(Value *S, Type *Ty, 
+CastInst *CastInst::CreateZExtOrBitCast(Value *S, Type *Ty,
                                         const Twine &Name,
                                         Instruction *InsertBefore) {
   if (S->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
@@ -2639,7 +2656,7 @@
   return Create(Instruction::ZExt, S, Ty, Name, InsertBefore);
 }
 
-CastInst *CastInst::CreateZExtOrBitCast(Value *S, Type *Ty, 
+CastInst *CastInst::CreateZExtOrBitCast(Value *S, Type *Ty,
                                         const Twine &Name,
                                         BasicBlock *InsertAtEnd) {
   if (S->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
@@ -2647,7 +2664,7 @@
   return Create(Instruction::ZExt, S, Ty, Name, InsertAtEnd);
 }
 
-CastInst *CastInst::CreateSExtOrBitCast(Value *S, Type *Ty, 
+CastInst *CastInst::CreateSExtOrBitCast(Value *S, Type *Ty,
                                         const Twine &Name,
                                         Instruction *InsertBefore) {
   if (S->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
@@ -2655,7 +2672,7 @@
   return Create(Instruction::SExt, S, Ty, Name, InsertBefore);
 }
 
-CastInst *CastInst::CreateSExtOrBitCast(Value *S, Type *Ty, 
+CastInst *CastInst::CreateSExtOrBitCast(Value *S, Type *Ty,
                                         const Twine &Name,
                                         BasicBlock *InsertAtEnd) {
   if (S->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
@@ -2672,7 +2689,7 @@
 }
 
 CastInst *CastInst::CreateTruncOrBitCast(Value *S, Type *Ty,
-                                         const Twine &Name, 
+                                         const Twine &Name,
                                          BasicBlock *InsertAtEnd) {
   if (S->getType()->getScalarSizeInBits() ==
       Ty->getScalarSizeInBits())
     return Create(Instruction::BitCast, S, Ty, Name, InsertAtEnd);
@@ -2765,7 +2782,7 @@
   return Create(opcode, C, Ty, Name, InsertBefore);
 }
 
-CastInst *CastInst::CreateIntegerCast(Value *C, Type *Ty, 
+CastInst *CastInst::CreateIntegerCast(Value *C, Type *Ty,
                                       bool isSigned, const Twine &Name,
                                       BasicBlock *InsertAtEnd) {
   assert(C->getType()->isIntOrIntVectorTy() && Ty->isIntOrIntVectorTy() &&
@@ -2779,8 +2796,8 @@
   return Create(opcode, C, Ty, Name, InsertAtEnd);
 }
 
-CastInst *CastInst::CreateFPCast(Value *C, Type *Ty, 
-                                 const Twine &Name, 
+CastInst *CastInst::CreateFPCast(Value *C, Type *Ty,
+                                 const Twine &Name,
                                  Instruction *InsertBefore) {
   assert(C->getType()->isFPOrFPVectorTy() && Ty->isFPOrFPVectorTy() &&
          "Invalid cast");
@@ -2792,8 +2809,8 @@
   return Create(opcode, C, Ty, Name, InsertBefore);
 }
 
-CastInst *CastInst::CreateFPCast(Value *C, Type *Ty, 
-                                 const Twine &Name, 
+CastInst *CastInst::CreateFPCast(Value *C, Type *Ty,
+                                 const Twine &Name,
                                  BasicBlock *InsertAtEnd) {
   assert(C->getType()->isFPOrFPVectorTy() && Ty->isFPOrFPVectorTy() &&
          "Invalid cast");
@@ -2836,7 +2853,7 @@
       return DestBits == SrcBits;
       // Casting from something else
     return SrcTy->isPointerTy();
-  } 
+  }
   if (DestTy->isFloatingPointTy()) {  // Casting to floating pt
     if (SrcTy->isIntegerTy())         // Casting from integral
       return true;
@@ -2853,7 +2870,7 @@
     if (SrcTy->isPointerTy())         // Casting from pointer
       return true;
     return SrcTy->isIntegerTy();      // Casting from integral
-  } 
+  }
   if (DestTy->isX86_MMXTy()) {
     if (SrcTy->isVectorTy())
       return DestBits == SrcBits;     // 64-bit vector to MMX
@@ -2960,10 +2977,10 @@
       return BitCast;                 // Same size, No-op cast
     }
   } else if (SrcTy->isFloatingPointTy()) { // Casting from floating pt
-    if (DestIsSigned) 
+    if (DestIsSigned)
       return FPToSI;                  // FP -> sint
     else
-      return FPToUI;                  // FP -> uint 
+      return FPToUI;                  // FP -> uint
   } else if (SrcTy->isVectorTy()) {
     assert(DestBits == SrcBits &&
            "Casting vector to integer of different width");
@@ -3024,7 +3041,7 @@
 /// could be broken out into the separate constructors but it is useful to have
 /// it in one place and to eliminate the redundant code for getting the sizes
 /// of the types involved.
-bool 
+bool
 CastInst::castIsValid(Instruction::CastOps op, Value *S, Type *DstTy) {
 
   // Check for type sanity on the arguments
@@ -3055,7 +3072,7 @@
   case Instruction::ZExt:
     return SrcTy->isIntOrIntVectorTy() && DstTy->isIntOrIntVectorTy() &&
       SrcLength == DstLength && SrcBitSize < DstBitSize;
-  case Instruction::SExt: 
+  case Instruction::SExt:
     return SrcTy->isIntOrIntVectorTy() && DstTy->isIntOrIntVectorTy() &&
       SrcLength == DstLength && SrcBitSize < DstBitSize;
   case Instruction::FPTrunc:
@@ -3148,138 +3165,138 @@
 TruncInst::TruncInst(
   Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
-) : CastInst(Ty, Trunc, S, Name, InsertAtEnd) { 
+) : CastInst(Ty, Trunc, S, Name, InsertAtEnd) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal Trunc");
 }
 
 ZExtInst::ZExtInst(
   Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
-) : CastInst(Ty, ZExt, S, Name, InsertBefore) { 
+) : CastInst(Ty, ZExt, S, Name, InsertBefore) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal ZExt");
 }
 
 ZExtInst::ZExtInst(
   Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
-) : CastInst(Ty, ZExt, S, Name, InsertAtEnd) { 
+) : CastInst(Ty, ZExt, S, Name, InsertAtEnd) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal ZExt");
 }
 
 SExtInst::SExtInst(
   Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
-) : CastInst(Ty, SExt, S, Name, InsertBefore) { 
+) : CastInst(Ty, SExt, S, Name, InsertBefore) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal SExt");
 }
 
 SExtInst::SExtInst(
   Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
-) : CastInst(Ty, SExt, S, Name, InsertAtEnd) { 
+) : CastInst(Ty, SExt, S, Name, InsertAtEnd) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal SExt");
 }
 
 FPTruncInst::FPTruncInst(
   Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
-) : CastInst(Ty, FPTrunc, S, Name, InsertBefore) { 
+) : CastInst(Ty, FPTrunc, S, Name, InsertBefore) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPTrunc");
 }
 
 FPTruncInst::FPTruncInst(
   Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
-) : CastInst(Ty, FPTrunc, S, Name, InsertAtEnd) { 
+) : CastInst(Ty, FPTrunc, S, Name, InsertAtEnd) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPTrunc");
 }
 
 FPExtInst::FPExtInst(
   Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
-) : CastInst(Ty, FPExt, S, Name, InsertBefore) { 
+) : CastInst(Ty, FPExt, S, Name, InsertBefore) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPExt");
 }
 
 FPExtInst::FPExtInst(
   Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
-) : CastInst(Ty, FPExt, S, Name, InsertAtEnd) { 
+) : CastInst(Ty, FPExt, S, Name, InsertAtEnd) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPExt");
 }
 
 UIToFPInst::UIToFPInst(
   Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
-) : CastInst(Ty, UIToFP, S, Name, InsertBefore) { 
+) : CastInst(Ty, UIToFP, S, Name, InsertBefore) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal UIToFP");
 }
 
 UIToFPInst::UIToFPInst(
   Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
-) : CastInst(Ty, UIToFP, S, Name, InsertAtEnd) { 
+) : CastInst(Ty, UIToFP, S, Name, InsertAtEnd) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal UIToFP");
 }
 
 SIToFPInst::SIToFPInst(
   Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
-) : CastInst(Ty, SIToFP, S, Name, InsertBefore) { 
+) : CastInst(Ty, SIToFP, S, Name, InsertBefore) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal SIToFP");
 }
 
 SIToFPInst::SIToFPInst(
   Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
-) : CastInst(Ty, SIToFP, S, Name, InsertAtEnd) { 
+) : CastInst(Ty, SIToFP, S, Name, InsertAtEnd) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal SIToFP");
 }
 
 FPToUIInst::FPToUIInst(
   Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
-) : CastInst(Ty, FPToUI, S, Name, InsertBefore) { 
+) : CastInst(Ty, FPToUI, S, Name, InsertBefore) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPToUI");
 }
 
 FPToUIInst::FPToUIInst(
   Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
-) : CastInst(Ty, FPToUI, S, Name, InsertAtEnd) { 
+) : CastInst(Ty, FPToUI, S, Name, InsertAtEnd) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPToUI");
 }
 
 FPToSIInst::FPToSIInst(
   Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
-) : CastInst(Ty, FPToSI, S, Name, InsertBefore) { 
+) : CastInst(Ty, FPToSI, S, Name, InsertBefore) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPToSI");
 }
 
 FPToSIInst::FPToSIInst(
   Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
-) : CastInst(Ty, FPToSI, S, Name, InsertAtEnd) { 
+) : CastInst(Ty, FPToSI, S, Name, InsertAtEnd) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPToSI");
 }
 
 PtrToIntInst::PtrToIntInst(
   Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
-) : CastInst(Ty, PtrToInt, S, Name, InsertBefore) { 
+) : CastInst(Ty, PtrToInt, S, Name, InsertBefore) {
  assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToInt");
 }
 
 PtrToIntInst::PtrToIntInst(
   Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
-) : CastInst(Ty, PtrToInt, S, Name, InsertAtEnd) { 
+) : CastInst(Ty, PtrToInt, S, Name, InsertAtEnd) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToInt");
 }
 
 IntToPtrInst::IntToPtrInst(
   Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
-) : CastInst(Ty, IntToPtr, S, Name, InsertBefore) { 
+) : CastInst(Ty, IntToPtr, S, Name, InsertBefore) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal IntToPtr");
 }
 
 IntToPtrInst::IntToPtrInst(
   Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
-) : CastInst(Ty, IntToPtr, S, Name, InsertAtEnd) { 
+) : CastInst(Ty, IntToPtr, S, Name, InsertAtEnd) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal IntToPtr");
 }
 
 BitCastInst::BitCastInst(
   Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
-) : CastInst(Ty, BitCast, S, Name, InsertBefore) { 
+) : CastInst(Ty, BitCast, S, Name, InsertBefore) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal BitCast");
 }
 
 BitCastInst::BitCastInst(
   Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
-) : CastInst(Ty, BitCast, S, Name, InsertAtEnd) { 
+) : CastInst(Ty, BitCast, S, Name, InsertAtEnd) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal BitCast");
 }
 
@@ -3336,7 +3353,7 @@
     return new ICmpInst(CmpInst::Predicate(predicate), S1, S2, Name);
   }
-  
+
   if (InsertBefore)
     return new FCmpInst(InsertBefore, CmpInst::Predicate(predicate),
                         S1, S2, Name);
@@ -3446,8 +3463,8 @@
 ICmpInst::Predicate ICmpInst::getSignedPredicate(Predicate pred) {
   switch (pred) {
     default: llvm_unreachable("Unknown icmp predicate!");
-    case ICMP_EQ: case ICMP_NE: 
-    case ICMP_SGT: case ICMP_SLT: case ICMP_SGE: case ICMP_SLE: 
+    case ICMP_EQ: case ICMP_NE:
+    case ICMP_SGT: case ICMP_SLT: case ICMP_SGE: case ICMP_SLE:
       return pred;
     case ICMP_UGT: return ICMP_SGT;
     case ICMP_ULT: return ICMP_SLT;
@@ -3459,8 +3476,8 @@
 ICmpInst::Predicate ICmpInst::getUnsignedPredicate(Predicate pred) {
   switch (pred) {
     default: llvm_unreachable("Unknown icmp predicate!");
-    case ICMP_EQ: case ICMP_NE: 
-    case ICMP_UGT: case ICMP_ULT: case ICMP_UGE: case ICMP_ULE: 
+    case ICMP_EQ: case ICMP_NE:
+    case ICMP_UGT: case ICMP_ULT: case ICMP_UGE: case ICMP_ULE:
       return pred;
     case ICMP_SGT: return ICMP_UGT;
     case ICMP_SLT: return ICMP_ULT;
@@ -3482,7 +3499,7 @@
     case ICMP_ULT: return ICMP_UGT;
     case ICMP_UGE: return ICMP_ULE;
     case ICMP_ULE: return ICMP_UGE;
-  
+
     case FCMP_FALSE: case FCMP_TRUE:
     case FCMP_OEQ: case FCMP_ONE: case FCMP_UEQ: case FCMP_UNE:
@@ -3519,7 +3536,7 @@
 bool CmpInst::isUnsigned(Predicate predicate) {
   switch (predicate) {
     default: return false;
-    case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: case ICmpInst::ICMP_UGT: 
+    case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: case ICmpInst::ICMP_UGT:
     case ICmpInst::ICMP_UGE: return true;
   }
 }
@@ -3527,7 +3544,7 @@
 bool CmpInst::isSigned(Predicate predicate) {
   switch (predicate) {
     default: return false;
-    case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: case ICmpInst::ICMP_SGT: 
+    case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: case ICmpInst::ICMP_SGT:
    case ICmpInst::ICMP_SGE: return true;
   }
 }
@@ -3535,17 +3552,17 @@
 bool CmpInst::isOrdered(Predicate predicate) {
   switch (predicate) {
     default: return false;
-    case FCmpInst::FCMP_OEQ: case FCmpInst::FCMP_ONE: case FCmpInst::FCMP_OGT: 
-    case FCmpInst::FCMP_OLT: case FCmpInst::FCMP_OGE: case FCmpInst::FCMP_OLE: 
+    case FCmpInst::FCMP_OEQ: case FCmpInst::FCMP_ONE: case FCmpInst::FCMP_OGT:
+    case FCmpInst::FCMP_OLT: case FCmpInst::FCMP_OGE: case FCmpInst::FCMP_OLE:
     case FCmpInst::FCMP_ORD: return true;
   }
 }
-      
+
 bool CmpInst::isUnordered(Predicate predicate) {
   switch (predicate) {
     default: return false;
-    case FCmpInst::FCMP_UEQ: case FCmpInst::FCMP_UNE: case FCmpInst::FCMP_UGT: 
-    case FCmpInst::FCMP_ULT: case FCmpInst::FCMP_UGE: case FCmpInst::FCMP_ULE: 
+    case FCmpInst::FCMP_UEQ: case FCmpInst::FCMP_UNE: case FCmpInst::FCMP_UGT:
+    case FCmpInst::FCMP_ULT: case FCmpInst::FCMP_UGE: case FCmpInst::FCMP_ULE:
     case FCmpInst::FCMP_UNO: return true;
   }
 }
@@ -3664,7 +3681,7 @@
 /// from the switch instruction.
 void SwitchInst::removeCase(CaseIt i) {
   unsigned idx = i.getCaseIndex();
-  
+
   assert(2 + idx*2 < getNumOperands() && "Case index out of range!!!");
 
   unsigned NumOps = getNumOperands();
@@ -3725,7 +3742,7 @@
 void IndirectBrInst::growOperands() {
   unsigned e = getNumOperands();
   unsigned NumOps = e*2;
-  
+
   ReservedSpace = NumOps;
   growHungoffUses(ReservedSpace);
 }
@@ -3771,13 +3788,13 @@
 /// indirectbr instruction.
 void IndirectBrInst::removeDestination(unsigned idx) {
   assert(idx < getNumOperands()-1 && "Successor index out of range!");
-  
+
   unsigned NumOps = getNumOperands();
   Use *OL = getOperandList();
 
   // Replace this value with the last one.
   OL[idx+1] = OL[NumOps-1];
-  
+
   // Nuke the last value.
   OL[NumOps-1].set(nullptr);
   setNumHungOffUseOperands(NumOps-1);
@@ -3840,7 +3857,7 @@
 StoreInst *StoreInst::cloneImpl() const {
   return new StoreInst(getOperand(0), getOperand(1), isVolatile(),
                        getAlignment(), getOrdering(), getSynchScope());
-  
 }
 
 AtomicCmpXchgInst *AtomicCmpXchgInst::cloneImpl() const {
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -598,9 +598,93 @@
 
 int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                Type *SubTp) {
-  // We only estimate the cost of reverse and alternate shuffles.
-  if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate)
-    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+  if (Kind == TTI::SK_Broadcast) {
+    // Broadcast is a special case: once we've legalized to multiple
+    // registers, we can just repeatedly reuse the first register.
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+    static const CostTblEntry AVX512BWShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v32i16, 1 }, // vpbroadcastw
+      { ISD::VECTOR_SHUFFLE, MVT::v64i8,  1 }  // vpbroadcastb
+    };
+
+    if (ST->hasBWI())
+      if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl,
+                                              ISD::VECTOR_SHUFFLE, LT.second))
+        return Entry->Cost;
+
+    static const CostTblEntry AVX512ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v8f64,  1 }, // vbroadcastpd
+      { ISD::VECTOR_SHUFFLE, MVT::v16f32, 1 }, // vbroadcastps
+      { ISD::VECTOR_SHUFFLE, MVT::v8i64,  1 }, // vpbroadcastq
+      { ISD::VECTOR_SHUFFLE, MVT::v16i32, 1 }  // vpbroadcastd
+    };
+
+    if (ST->hasAVX512())
+      if (const auto *Entry =
+              CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return Entry->Cost;
+
+    static const CostTblEntry AVX2ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v4f64,  1 }, // vbroadcastpd
+      { ISD::VECTOR_SHUFFLE, MVT::v8f32,  1 }, // vbroadcastps
+      { ISD::VECTOR_SHUFFLE, MVT::v4i64,  1 }, // vpbroadcastq
+      { ISD::VECTOR_SHUFFLE, MVT::v8i32,  1 }, // vpbroadcastd
+      { ISD::VECTOR_SHUFFLE, MVT::v16i16, 1 }, // vpbroadcastw
+      { ISD::VECTOR_SHUFFLE, MVT::v32i8,  1 }  // vpbroadcastb
+    };
+
+    if (ST->hasAVX2())
+      if (const auto *Entry =
+              CostTableLookup(AVX2ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return Entry->Cost;
+
+    static const CostTblEntry AVX1ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v4f64,  2 }, // vperm2f128 + vpermilpd
+      { ISD::VECTOR_SHUFFLE, MVT::v8f32,  2 }, // vperm2f128 + vpermilps
+      { ISD::VECTOR_SHUFFLE, MVT::v4i64,  2 }, // vperm2f128 + vpermilpd
+      { ISD::VECTOR_SHUFFLE, MVT::v8i32,  2 }, // vperm2f128 + vpermilps
+      { ISD::VECTOR_SHUFFLE, MVT::v16i16, 3 }, // pshuflw + pshufd + vinsertf128
+      { ISD::VECTOR_SHUFFLE, MVT::v32i8,  2 }  // pshufb + vinsertf128
+    };
+
+    if (ST->hasAVX())
+      if (const auto *Entry =
+              CostTableLookup(AVX1ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return Entry->Cost;
+
+    static const CostTblEntry SSSE3ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v8i16, 1 }, // pshufb
+      { ISD::VECTOR_SHUFFLE, MVT::v16i8, 1 }  // pshufb
+    };
+
+    if (ST->hasSSSE3())
+      if (const auto *Entry =
+              CostTableLookup(SSSE3ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return Entry->Cost;
+
+    static const CostTblEntry SSE2ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v2f64, 1 }, // shufpd
+      { ISD::VECTOR_SHUFFLE, MVT::v2i64, 1 }, // pshufd
+      { ISD::VECTOR_SHUFFLE, MVT::v4i32, 1 }, // pshufd
+      { ISD::VECTOR_SHUFFLE, MVT::v8i16, 2 }, // pshuflw + pshufd
+      { ISD::VECTOR_SHUFFLE, MVT::v16i8, 3 }  // unpck + pshuflw + pshufd
+    };
+
+    if (ST->hasSSE2())
+      if (const auto *Entry =
+              CostTableLookup(SSE2ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return Entry->Cost;
+
+    static const CostTblEntry SSE1ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v4f32, 1 }, // shufps
+    };
+
+    if (ST->hasSSE1())
+      if (const auto *Entry =
+              CostTableLookup(SSE1ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return Entry->Cost;
+  }
 
   if (Kind == TTI::SK_Reverse) {
     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
@@ -792,7 +876,6 @@
     if (const auto *Entry = CostTableLookup(SSEAltShuffleTbl,
                                             ISD::VECTOR_SHUFFLE, LT.second))
       return LT.first * Entry->Cost;
-    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
   }
 
   return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
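Note that the broadcast path above returns Entry->Cost directly, without the LT.first multiplier used by the reverse/alternate paths: once a wide vector has been legalized into several registers, every further copy can reuse the first broadcasted register. A caller-side sketch of querying this cost (the wrapper function is illustrative only; the getShuffleCost signature matches the CostModel.cpp call above):

#include "llvm/Analysis/TargetTransformInfo.h"

using namespace llvm;

// Illustrative only: on AVX2 an <8 x double> broadcast legalizes to two
// 256-bit registers, yet the reported cost stays 1 (see the tests below)
// because the first broadcast result is simply reused.
static int broadcastShuffleCost(const TargetTransformInfo &TTI, Type *VecTy) {
  return TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy,
                            /*Index=*/0, /*SubTp=*/nullptr);
}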
Index: test/Analysis/CostModel/X86/shuffle-broadcast.ll
===================================================================
--- test/Analysis/CostModel/X86/shuffle-broadcast.ll
+++ test/Analysis/CostModel/X86/shuffle-broadcast.ll
@@ -9,23 +9,161 @@
 ;
 ; Verify the cost model for broadcast shuffles.
 ;
+; Broadcast is special in that after legalization we can reuse the first broadcasted register.
+;
 ; CHECK-LABEL: 'test_vXf64'
 define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512) {
-  ; SSE: Unknown cost {{.*}} %V128 = shufflevector
-  ; AVX: Unknown cost {{.*}} %V128 = shufflevector
-  ; AVX512: Unknown cost {{.*}} %V128 = shufflevector
+  ; SSE: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
   %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> zeroinitializer
 
-  ; SSE: Unknown cost {{.*}} %V256 = shufflevector
-  ; AVX: Unknown cost {{.*}} %V256 = shufflevector
-  ; AVX512: Unknown cost {{.*}} %V256 = shufflevector
+  ; SSE: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
   %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> zeroinitializer
 
-  ; SSE: Unknown cost {{.*}} %V512 = shufflevector
-  ; AVX: Unknown cost {{.*}} %V512 = shufflevector
-  ; AVX512: Unknown cost {{.*}} %V512 = shufflevector
+  ; SSE: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
   %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> zeroinitializer
 
   ret void
 }
+
+; CHECK-LABEL: 'test_vXi64'
+define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) {
+  ; SSE: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> zeroinitializer
+
+  ret void
+}
+
+; CHECK-LABEL: 'test_vXf32'
+define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
+  ; SSE: cost of 1 {{.*}} %V64 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V64 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V64 = shufflevector
+  %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> zeroinitializer
+
+  ret void
+}
+
+; CHECK-LABEL: 'test_vXi32'
+define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512) {
+  ; SSE: cost of 1 {{.*}} %V64 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V64 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V64 = shufflevector
+  %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> zeroinitializer
+
+  ret void
+}
+
+; CHECK-LABEL: 'test_vXi16'
+define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512) {
+  ; SSE2: cost of 2 {{.*}} %V128 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> zeroinitializer
+
+  ; SSE2: cost of 2 {{.*}} %V256 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V256 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 3 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> zeroinitializer
+
+  ; SSE2: cost of 2 {{.*}} %V512 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V512 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 3 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512F: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> zeroinitializer
+
+  ret void
+}
+
+; CHECK-LABEL: 'test_vXi8'
+define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) {
+  ; SSE2: cost of 3 {{.*}} %V128 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> zeroinitializer
+
+  ; SSE2: cost of 3 {{.*}} %V256 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V256 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> zeroinitializer
+
+  ; SSE2: cost of 3 {{.*}} %V512 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V512 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512F: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> zeroinitializer
+
+  ret void
+}
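As an end-to-end illustration of the new predicate's semantics, here is a small standalone program that builds a partially-undef splat shuffle with IRBuilder and queries it. This is a sketch assuming the patch is applied; the module and function names are made up:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("splat-demo", Ctx);

  // Build: void @demo(<4 x float> %v)
  auto *VecTy = VectorType::get(Type::getFloatTy(Ctx), 4);
  auto *FnTy = FunctionType::get(Type::getVoidTy(Ctx), {VecTy}, false);
  auto *Fn = Function::Create(FnTy, Function::ExternalLinkage, "demo", &M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", Fn));

  // %splat = shufflevector <4 x float> %v, <4 x float> undef,
  //                        <4 x i32> <i32 0, i32 undef, i32 0, i32 0>
  Value *V = &*Fn->arg_begin();
  Constant *Mask = ConstantVector::get({B.getInt32(0),
                                        UndefValue::get(B.getInt32Ty()),
                                        B.getInt32(0), B.getInt32(0)});
  auto *Splat = cast<ShuffleVectorInst>(
      B.CreateShuffleVector(V, UndefValue::get(VecTy), Mask, "splat"));
  B.CreateRetVoid();

  // The undef lane is ignored; an all-undef mask would return false.
  int SplatIndex = -1;
  bool IsSplat = Splat->isSplat(&SplatIndex); // expect true, SplatIndex == 0
  return (IsSplat && SplatIndex == 0) ? 0 : 1;
}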