diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -6700,6 +6700,7 @@
 /// in MVE takes a GPR (integer) register, and the instruction that incorporate
 /// a VDUP (such as a VADD qd, qm, rm) also require a gpr register.
 bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
+  // Accept shuf(insertelem(undef/poison, val, 0), undef/poison, <0,0,..>) only
   if (!match(SVI, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
                             m_Undef(), m_ZeroMask())))
     return false;
@@ -6719,7 +6720,7 @@
   Builder.SetInsertPoint(SVI);
   Value *BC1 = Builder.CreateBitCast(
       cast<Instruction>(SVI->getOperand(0))->getOperand(1), NewType);
-  Value *Insert = Builder.CreateInsertElement(UndefValue::get(NewVecType), BC1,
+  Value *Insert = Builder.CreateInsertElement(PoisonValue::get(NewVecType), BC1,
                                               (uint64_t)0);
   Value *Shuffle = Builder.CreateShuffleVector(Insert, SVI->getShuffleMask());
   Value *BC2 = Builder.CreateBitCast(Shuffle, SVIVecType);
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -22,8 +22,8 @@
 //
 // E.g. An interleaved load (Factor = 2):
 //   %wide.vec = load <8 x i32>, <8 x i32>* %ptr
-//   %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
-//   %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
+//   %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
+//   %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7>
 //
 // It could be transformed into a ld2 intrinsic in AArch64 backend or a vld2
 // intrinsic in ARM backend.
@@ -395,12 +395,16 @@
   for (auto *SVI : BinOpShuffles) {
     BinaryOperator *BI = cast<BinaryOperator>(SVI->getOperand(0));
     ArrayRef<int> Mask = SVI->getShuffleMask();
+    assert(Mask.back() < 0 ||
+           (unsigned)Mask.back() <
+               cast<FixedVectorType>(BI->getOperand(0)->getType())
+                   ->getNumElements());
     auto *NewSVI1 = new ShuffleVectorInst(
-        BI->getOperand(0), UndefValue::get(BI->getOperand(0)->getType()), Mask,
+        BI->getOperand(0), PoisonValue::get(BI->getOperand(0)->getType()), Mask,
         SVI->getName(), SVI);
     auto *NewSVI2 = new ShuffleVectorInst(
-        BI->getOperand(1), UndefValue::get(BI->getOperand(1)->getType()), Mask,
+        BI->getOperand(1), PoisonValue::get(BI->getOperand(1)->getType()), Mask,
         SVI->getName(), SVI);
     Value *NewBI = BinaryOperator::Create(BI->getOpcode(), NewSVI1, NewSVI2,
                                           BI->getName(), SVI);
diff --git a/llvm/test/Transforms/InterleavedAccess/X86/interleave-load-extract-shuffle-changes.ll b/llvm/test/Transforms/InterleavedAccess/X86/interleave-load-extract-shuffle-changes.ll
--- a/llvm/test/Transforms/InterleavedAccess/X86/interleave-load-extract-shuffle-changes.ll
+++ b/llvm/test/Transforms/InterleavedAccess/X86/interleave-load-extract-shuffle-changes.ll
@@ -10,8 +10,8 @@
 ; CHECK-LABEL: @shuffle_binop_fol(
 ; CHECK-NEXT:  vector.body.preheader:
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[PTR:%.*]], align 8
-; CHECK-NEXT:    [[EXTRACTED1:%.*]] = shufflevector <4 x double> [[WIDE_LOAD]], <4 x double> undef, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[EXTRACTED2:%.*]] = shufflevector <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, <4 x double> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[EXTRACTED1:%.*]] = shufflevector <4 x double> [[WIDE_LOAD]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[EXTRACTED2:%.*]] = shufflevector <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, <4 x double> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[FADD3:%.*]] = fadd <2 x double> [[EXTRACTED1]], [[EXTRACTED2]]
 ; CHECK-NEXT:    ret <2 x double> [[FADD3]]
 ;
@@ -22,6 +22,21 @@
   ret <2 x double> %extracted
 }
 
+define <2 x double> @shuffle_binop_fol_oob(<4 x double>* %ptr) {
+; CHECK-LABEL: @shuffle_binop_fol_oob(
+; CHECK-NEXT:  vector.body.preheader:
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[PTR:%.*]], align 8
+; CHECK-NEXT:    [[FADD:%.*]] = fadd <4 x double> [[WIDE_LOAD]], <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
+; CHECK-NEXT:    [[EXTRACTED:%.*]] = shufflevector <4 x double> [[FADD]], <4 x double> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-NEXT:    ret <2 x double> [[EXTRACTED]]
+;
+vector.body.preheader:
+  %wide.load = load <4 x double>, <4 x double>* %ptr, align 8
+  %fadd = fadd <4 x double> %wide.load, <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
+  %extracted = shufflevector <4 x double> %fadd, <4 x double> undef, <2 x i32> <i32 0, i32 4>
+  ret <2 x double> %extracted
+}
+
 ; No interleaved load instruction is generated, but the extractelement
 ; instructions are updated to use the shuffle instead of the load.
 define void @shuffle_extract(<4 x double>* %ptr, i1 %c) {