diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -6699,6 +6699,7 @@ /// in MVE takes a GPR (integer) register, and the instruction that incorporate /// a VDUP (such as a VADD qd, qm, rm) also require a gpr register. bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) { + // Accept shuf(insertelem(undef/poison, val, 0), undef/poison, <0,0,..>) only if (!match(SVI, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()), m_Undef(), m_ZeroMask()))) return false; @@ -6718,9 +6719,7 @@ Builder.SetInsertPoint(SVI); Value *BC1 = Builder.CreateBitCast( cast(SVI->getOperand(0))->getOperand(1), NewType); - Value *Insert = Builder.CreateInsertElement(UndefValue::get(NewVecType), BC1, - (uint64_t)0); - Value *Shuffle = Builder.CreateShuffleVector(Insert, SVI->getShuffleMask()); + Value *Shuffle = Builder.CreateVectorSplat(NewVecType->getNumElements(), BC1); Value *BC2 = Builder.CreateBitCast(Shuffle, SVIVecType); SVI->replaceAllUsesWith(BC2); diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -22,8 +22,8 @@ // // E.g. An interleaved load (Factor = 2): // %wide.vec = load <8 x i32>, <8 x i32>* %ptr -// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6> -// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7> +// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6> +// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7> // // It could be transformed into a ld2 intrinsic in AArch64 backend or a vld2 // intrinsic in ARM backend. @@ -351,6 +351,7 @@ Index)) return false; + assert(Shuffle->getShuffleMask().size() <= NumLoadElements); Indices.push_back(Index); } for (auto *Shuffle : BinOpShuffles) { @@ -360,6 +361,8 @@ Index)) return false; + assert(Shuffle->getShuffleMask().size() <= NumLoadElements); + if (cast(Shuffle->getOperand(0))->getOperand(0) == LI) Indices.push_back(Index); if (cast(Shuffle->getOperand(0))->getOperand(1) == LI) @@ -394,13 +397,17 @@ SmallVectorImpl &Shuffles, LoadInst *LI) { for (auto *SVI : BinOpShuffles) { BinaryOperator *BI = cast(SVI->getOperand(0)); + Type *BIOp0Ty = BI->getOperand(0)->getType(); ArrayRef Mask = SVI->getShuffleMask(); + assert(all_of(Mask, [&](int Idx) { + return Idx < (int)cast(BIOp0Ty)->getNumElements(); + })); - auto *NewSVI1 = new ShuffleVectorInst( - BI->getOperand(0), UndefValue::get(BI->getOperand(0)->getType()), Mask, - SVI->getName(), SVI); + auto *NewSVI1 = + new ShuffleVectorInst(BI->getOperand(0), PoisonValue::get(BIOp0Ty), + Mask, SVI->getName(), SVI); auto *NewSVI2 = new ShuffleVectorInst( - BI->getOperand(1), UndefValue::get(BI->getOperand(1)->getType()), Mask, + BI->getOperand(1), PoisonValue::get(BI->getOperand(1)->getType()), Mask, SVI->getName(), SVI); Value *NewBI = BinaryOperator::Create(BI->getOpcode(), NewSVI1, NewSVI2, BI->getName(), SVI); diff --git a/llvm/test/Transforms/InterleavedAccess/X86/interleave-load-extract-shuffle-changes.ll b/llvm/test/Transforms/InterleavedAccess/X86/interleave-load-extract-shuffle-changes.ll --- a/llvm/test/Transforms/InterleavedAccess/X86/interleave-load-extract-shuffle-changes.ll +++ b/llvm/test/Transforms/InterleavedAccess/X86/interleave-load-extract-shuffle-changes.ll @@ -10,8 +10,8 @@ ; CHECK-LABEL: @shuffle_binop_fol( ; CHECK-NEXT: vector.body.preheader: ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[PTR:%.*]], align 8 -; CHECK-NEXT: [[EXTRACTED1:%.*]] = shufflevector <4 x double> [[WIDE_LOAD]], <4 x double> undef, <2 x i32> -; CHECK-NEXT: [[EXTRACTED2:%.*]] = shufflevector <4 x double> , <4 x double> undef, <2 x i32> +; CHECK-NEXT: [[EXTRACTED1:%.*]] = shufflevector <4 x double> [[WIDE_LOAD]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[EXTRACTED2:%.*]] = shufflevector <4 x double> , <4 x double> poison, <2 x i32> ; CHECK-NEXT: [[FADD3:%.*]] = fadd <2 x double> [[EXTRACTED1]], [[EXTRACTED2]] ; CHECK-NEXT: ret <2 x double> [[FADD3]] ; @@ -22,6 +22,21 @@ ret <2 x double> %extracted } +define <2 x double> @shuffle_binop_fol_oob(<4 x double>* %ptr) { +; CHECK-LABEL: @shuffle_binop_fol_oob( +; CHECK-NEXT: vector.body.preheader: +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[PTR:%.*]], align 8 +; CHECK-NEXT: [[FADD:%.*]] = fadd <4 x double> [[WIDE_LOAD]], +; CHECK-NEXT: [[EXTRACTED:%.*]] = shufflevector <4 x double> [[FADD]], <4 x double> undef, <2 x i32> +; CHECK-NEXT: ret <2 x double> [[EXTRACTED]] +; +vector.body.preheader: + %wide.load = load <4 x double>, <4 x double>* %ptr, align 8 + %fadd = fadd <4 x double> %wide.load, + %extracted = shufflevector <4 x double> %fadd, <4 x double> undef, <2 x i32> + ret <2 x double> %extracted +} + ; No interleaved load instruction is generated, but the extractelement ; instructions are updated to use the shuffle instead of the load. define void @shuffle_extract(<4 x double>* %ptr, i1 %c) {