Index: lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -837,6 +837,50 @@
   return nullptr;
 }
 
+/// \brief Look for an extractelement/insertvalue sequence that acts like a bitcast.
+///
+/// \returns the underlying value that was "cast", or nullptr otherwise.
+static Value *likeBitCastFromVector(InstCombiner &IC, Value *V) {
+  Value *U = nullptr;
+  while (auto *IV = dyn_cast<InsertValueInst>(V)) {
+    auto *EI = dyn_cast<ExtractElementInst>(IV->getInsertedValueOperand());
+    if (!EI)
+      return nullptr;
+    auto *W = EI->getVectorOperand();
+    if (!U) {
+      U = W;
+    } else if (U != W)
+      return nullptr;
+    auto *CI = dyn_cast<ConstantInt>(EI->getIndexOperand());
+    if (!CI || IV->getNumIndices() != 1 || CI->getZExtValue() != *IV->idx_begin())
+      return nullptr;
+    V = IV->getAggregateOperand();
+  }
+  if (!isa<UndefValue>(V) || !U)
+    return nullptr;
+
+  VectorType *UT = cast<VectorType>(U->getType());
+  Type *VT = V->getType();
+  // Check that types UT and VT are bitwise isomorphic.
+  const DataLayout &DL = IC.getDataLayout();
+  if (DL.getTypeSizeInBits(UT) != DL.getTypeSizeInBits(VT)) {
+    return nullptr;
+  }
+  if (ArrayType *AT = dyn_cast<ArrayType>(VT)) {
+    if (AT->getNumElements() != UT->getNumElements())
+      return nullptr;
+  } else {
+    StructType *ST = cast<StructType>(VT);
+    if (ST->getNumElements() != UT->getNumElements())
+      return nullptr;
+    for (const Type *EltT : ST->elements()) {
+      if (EltT != UT->getElementType())
+        return nullptr;
+    }
+  }
+  return U;
+}
+
 /// \brief Combine stores to match the type of value being stored.
 ///
 /// The core idea here is that the memory does not have any intrinsic type and
@@ -872,6 +916,11 @@
     return true;
   }
 
+  if (Value *U = likeBitCastFromVector(IC, V)) {
+    combineStoreToNewValue(IC, SI, U);
+    return true;
+  }
+
   // FIXME: We should also canonicalize loads of vectors when their elements are
   // cast to other types.
   return false;
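A minimal before/after sketch of the pattern this InstCombine change targets (value names are illustrative only; the julia_2xdouble test added below exercises the same shape):

  ; before: vector lanes are extracted, re-packed into an isomorphic aggregate, and stored
  %x0 = extractelement <2 x double> %x, i32 0
  %i0 = insertvalue [2 x double] undef, double %x0, 0
  %x1 = extractelement <2 x double> %x, i32 1
  %i1 = insertvalue [2 x double] %i0, double %x1, 1
  store [2 x double] %i1, [2 x double]* %p, align 4

  ; after: likeBitCastFromVector recovers %x, and combineStoreToNewValue stores it
  ; through a re-typed pointer, roughly:
  %pv = bitcast [2 x double]* %p to <2 x double>*
  store <2 x double> %x, <2 x double>* %pv, align 4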
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -273,32 +273,93 @@
   return Ty;
 }
 
+/// \brief Check if an ArrayType or StructType is isomorphic to some VectorType.
+///
+/// \returns number of elements in vector if isomorphism exists, 0 otherwise.
+static unsigned canMapToVector(Type *T, const DataLayout &DL) {
+  unsigned N;
+  auto *ST = dyn_cast<StructType>(T);
+  if (ST) {
+    N = ST->getNumElements();
+  } else {
+    N = cast<ArrayType>(T)->getNumElements();
+  }
+  if (N != 2 && N != 4 && N != 8 && N != 16)
+    // Reject other vector lengths as generally unprofitable.
+    return 0;
+  Type *EltT;
+  if (ST) {
+    auto I = ST->element_begin();
+    EltT = *I;
+    // Check that the struct is homogeneous.
+    auto E = ST->element_end();
+    while (++I != E)
+      if (*I != EltT)
+        return 0;
+  } else {
+    EltT = cast<ArrayType>(T)->getElementType();
+  }
+  if (!VectorType::isValidElementType(EltT))
+    return 0;
+  VectorType *VT = VectorType::get(EltT, N);
+  if (DL.getTypeSizeInBits(VT) != DL.getTypeSizeInBits(T)) {
+    return 0;
+  }
+  return N;
+}
+
+/// \returns True if the Extract{Value,Element} instruction extracts element i.
+static bool matchExtractIndex(Instruction *E, unsigned i, unsigned Opcode) {
+  if (Opcode == Instruction::ExtractElement) {
+    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
+    return CI && CI->getZExtValue() == i;
+  } else {
+    assert(Opcode == Instruction::ExtractValue);
+    ExtractValueInst *EI = cast<ExtractValueInst>(E);
+    assert(EI->getNumIndices() == 1);
+    return *EI->idx_begin() == i;
+  }
+}
+
 /// \returns True if the ExtractElement instructions in VL can be vectorized
 /// to use the original vector.
-static bool CanReuseExtract(ArrayRef<Value *> VL) {
-  assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
+static bool CanReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) {
+  assert(Opcode == Instruction::ExtractElement ||
+         Opcode == Instruction::ExtractValue);
+  assert(Opcode == getSameOpcode(VL) && "Invalid opcode");
   // Check if all of the extracts come from the same vector and from the
   // correct offset.
   Value *VL0 = VL[0];
-  ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
+  Instruction *E0 = cast<Instruction>(VL0);
   Value *Vec = E0->getOperand(0);
 
-  // We have to extract from the same vector type.
-  unsigned NElts = Vec->getType()->getVectorNumElements();
+  // We have to extract from a vector/aggregate with the same number of elements.
+  unsigned NElts;
+  if (Opcode == Instruction::ExtractValue) {
+    const DataLayout &DL = E0->getModule()->getDataLayout();
+    NElts = canMapToVector(Vec->getType(), DL);
+    if (!NElts)
+      return false;
+    // Check that the aggregate load can be rewritten as a load of a vector.
+    LoadInst *LI = dyn_cast<LoadInst>(Vec);
+    if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
+      return false;
+  } else {
+    NElts = Vec->getType()->getVectorNumElements();
+  }
 
   if (NElts != VL.size())
     return false;
 
   // Check that all of the indices extract from the correct offset.
-  ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
-  if (!CI || CI->getZExtValue())
+  if (!matchExtractIndex(E0, 0, Opcode))
     return false;
 
   for (unsigned i = 1, e = VL.size(); i < e; ++i) {
-    ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
-    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
-
-    if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
+    Instruction *E = cast<Instruction>(VL[i]);
+    if (!matchExtractIndex(E, i, Opcode))
+      return false;
+    if (E->getOperand(0) != Vec)
       return false;
   }
 
@@ -1145,8 +1206,9 @@
       }
       return;
     }
+    case Instruction::ExtractValue:
     case Instruction::ExtractElement: {
-      bool Reuse = CanReuseExtract(VL);
+      bool Reuse = CanReuseExtract(VL, Opcode);
       if (Reuse) {
        DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
      } else {
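For reference, this is the kind of extractvalue bundle the extended CanReuseExtract is meant to accept (illustrative names; cf. the julia_load_array_of_float test below): every element of an aggregate whose layout canMapToVector approves is extracted, in order, from a single simple load that has no other uses:

  %v  = load [4 x float], [4 x float]* %p, align 4
  %v0 = extractvalue [4 x float] %v, 0
  %v1 = extractvalue [4 x float] %v, 1
  %v2 = extractvalue [4 x float] %v, 2
  %v3 = extractvalue [4 x float] %v, 3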
@@ -1488,11 +1550,12 @@
     case Instruction::PHI: {
       return 0;
     }
+    case Instruction::ExtractValue:
     case Instruction::ExtractElement: {
-      if (CanReuseExtract(VL)) {
+      if (CanReuseExtract(VL, Opcode)) {
         int DeadCost = 0;
         for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-          ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
+          Instruction *E = cast<Instruction>(VL[i]);
           if (E->hasOneUse())
             // Take credit for instruction that will become dead.
             DeadCost +=
@@ -2232,13 +2295,25 @@
     }
     case Instruction::ExtractElement: {
-      if (CanReuseExtract(E->Scalars)) {
+      if (CanReuseExtract(E->Scalars, Instruction::ExtractElement)) {
         Value *V = VL0->getOperand(0);
         E->VectorizedValue = V;
         return V;
       }
       return Gather(E->Scalars, VecTy);
     }
+    case Instruction::ExtractValue: {
+      if (CanReuseExtract(E->Scalars, Instruction::ExtractValue)) {
+        LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
+        Builder.SetInsertPoint(LI);
+        PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
+        Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
+        LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
+        E->VectorizedValue = V;
+        return propagateMetadata(V, E->Scalars);
+      }
+      return Gather(E->Scalars, VecTy);
+    }
     case Instruction::ZExt:
     case Instruction::SExt:
     case Instruction::FPToUI:
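Assuming CanReuseExtract accepted the bundle above, the new Instruction::ExtractValue case in vectorizeTree re-emits the aggregate load as a vector load at the original load's position, reusing its alignment; roughly (names illustrative):

  %vecptr = bitcast [4 x float]* %p to <4 x float>*
  %vec    = load <4 x float>, <4 x float>* %vecptr, align 4

The original scalar extractvalue instructions are then either replaced through the usual external-use extraction path or simply become dead once the whole tree is vectorized.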
@@ -3536,13 +3610,14 @@
       for (auto &V : BuildVectorSlice) {
         IRBuilder<true, NoFolder> Builder(
             InsertAfter->getParent(), ++BasicBlock::iterator(InsertAfter));
-        InsertElementInst *IE = cast<InsertElementInst>(V);
+        Instruction *I = cast<Instruction>(V);
+        assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
         Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
             VectorizedRoot, Builder.getInt32(VecIdx++)));
-        IE->setOperand(1, Extract);
-        IE->removeFromParent();
-        IE->insertAfter(Extract);
-        InsertAfter = IE;
+        I->setOperand(1, Extract);
+        I->removeFromParent();
+        I->insertAfter(Extract);
+        InsertAfter = I;
       }
     }
     // Move to the next bundle.
@@ -3933,6 +4008,25 @@
   return false;
 }
 
+/// \brief Like findBuildVector, but looks backwards for the construction of an aggregate.
+///
+/// \return true if it matches.
+static bool findBuildAggregate(InsertValueInst *IV,
+                               SmallVectorImpl<Value *> &BuildVector,
+                               SmallVectorImpl<Value *> &BuildVectorOpds) {
+  if (!IV->hasOneUse())
+    return false;
+  Value *V = IV->getAggregateOperand();
+  if (!isa<UndefValue>(V)) {
+    InsertValueInst *I = dyn_cast<InsertValueInst>(V);
+    if (!I || !findBuildAggregate(I, BuildVector, BuildVectorOpds))
+      return false;
+  }
+  BuildVector.push_back(IV);
+  BuildVectorOpds.push_back(IV->getInsertedValueOperand());
+  return true;
+}
+
 static bool PhiTypeSorterFunc(Value *V, Value *V2) {
   return V->getType() < V2->getType();
 }
@@ -4172,6 +4266,29 @@
       continue;
     }
 
+    // Try to vectorize trees that start at insertvalue instructions feeding into
+    // a store.
+    if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
+      if (InsertValueInst *LastInsertValue = dyn_cast<InsertValueInst>(SI->getValueOperand())) {
+        const DataLayout &DL = BB->getModule()->getDataLayout();
+        if (canMapToVector(SI->getValueOperand()->getType(), DL)) {
+          SmallVector<Value *, 16> BuildVector;
+          SmallVector<Value *, 16> BuildVectorOpds;
+          if (!findBuildAggregate(LastInsertValue, BuildVector, BuildVectorOpds)) {
+            continue;
+          }
+
+          DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI << "\n");
+          if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) {
+            Changed = true;
+            it = BB->begin();
+            e = BB->end();
+          }
+          continue;
+        }
+      }
+    }
   }
 
   return Changed;
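As an illustration of what findBuildAggregate collects (names illustrative; the julia_2xdouble test in insertvalue.ll below exercises this end to end), given a store of an insertvalue chain such as:

  %i0 = insertvalue [2 x double] undef, double %a0, 0
  %i1 = insertvalue [2 x double] %i0, double %a1, 1
  store [2 x double] %i1, [2 x double]* %p, align 4

it fills BuildVector with {%i0, %i1} and BuildVectorOpds with {%a0, %a1}, and the operands are then handed to tryToVectorizeList just as an insertelement-based build vector would be.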
Index: test/Transforms/InstCombine/insert-val-extract-elem.ll
===================================================================
--- test/Transforms/InstCombine/insert-val-extract-elem.ll
+++ test/Transforms/InstCombine/insert-val-extract-elem.ll
@@ -0,0 +1,33 @@
+; RUN: opt -S -instcombine %s | FileCheck %s
+
+; CHECK-NOT: insertvalue
+; CHECK-NOT: extractelement
+; CHECK: store <2 x double>
+define void @julia_2xdouble([2 x double]* sret, <2 x double>*) {
+top:
+  %x = load <2 x double>, <2 x double>* %1
+  %x0 = extractelement <2 x double> %x, i32 0
+  %i0 = insertvalue [2 x double] undef, double %x0, 0
+  %x1 = extractelement <2 x double> %x, i32 1
+  %i1 = insertvalue [2 x double] %i0, double %x1, 1
+  store [2 x double] %i1, [2 x double]* %0, align 4
+  ret void
+}
+
+; CHECK-NOT: insertvalue
+; CHECK-NOT: extractelement
+; CHECK: store <4 x float>
+define void @julia_4xfloat([4 x float]* sret, <4 x float>*) {
+top:
+  %x = load <4 x float>, <4 x float>* %1
+  %x0 = extractelement <4 x float> %x, i32 0
+  %i0 = insertvalue [4 x float] undef, float %x0, 0
+  %x1 = extractelement <4 x float> %x, i32 1
+  %i1 = insertvalue [4 x float] %i0, float %x1, 1
+  %x2 = extractelement <4 x float> %x, i32 2
+  %i2 = insertvalue [4 x float] %i1, float %x2, 2
+  %x3 = extractelement <4 x float> %x, i32 3
+  %i3 = insertvalue [4 x float] %i2, float %x3, 3
+  store [4 x float] %i3, [4 x float]* %0, align 4
+  ret void
+}
Index: test/Transforms/SLPVectorizer/X86/insertvalue.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/insertvalue.ll
+++ test/Transforms/SLPVectorizer/X86/insertvalue.ll
@@ -0,0 +1,128 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s
+
+;CHECK: load <2 x double>
+;CHECK: load <2 x double>
+;CHECK: fmul <2 x double>
+;CHECK: fadd <2 x double>
+define void @julia_2xdouble([2 x double]* sret, [2 x double]*, [2 x double]*, [2 x double]*) {
+top:
+  %px0 = getelementptr inbounds [2 x double], [2 x double]* %2, i64 0, i64 0
+  %x0 = load double, double* %px0, align 4
+  %py0 = getelementptr inbounds [2 x double], [2 x double]* %3, i64 0, i64 0
+  %y0 = load double, double* %py0, align 4
+  %m0 = fmul double %x0, %y0
+  %px1 = getelementptr inbounds [2 x double], [2 x double]* %2, i64 0, i64 1
+  %x1 = load double, double* %px1, align 4
+  %py1 = getelementptr inbounds [2 x double], [2 x double]* %3, i64 0, i64 1
+  %y1 = load double, double* %py1, align 4
+  %m1 = fmul double %x1, %y1
+  %pz0 = getelementptr inbounds [2 x double], [2 x double]* %1, i64 0, i64 0
+  %z0 = load double, double* %pz0, align 4
+  %a0 = fadd double %m0, %z0
+  %i0 = insertvalue [2 x double] undef, double %a0, 0
+  %pz1 = getelementptr inbounds [2 x double], [2 x double]* %1, i64 0, i64 1
+  %z1 = load double, double* %pz1, align 4
+  %a1 = fadd double %m1, %z1
+  %i1 = insertvalue [2 x double] %i0, double %a1, 1
+  store [2 x double] %i1, [2 x double]* %0, align 4
+  ret void
+}
+
+;CHECK: load <4 x float>
+;CHECK: load <4 x float>
+;CHECK: fmul <4 x float>
+;CHECK: fadd <4 x float>
+define void @julia_4xfloat([4 x float]* sret, [4 x float]*, [4 x float]*, [4 x float]*) {
+top:
+  %px0 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 0
+  %x0 = load float, float* %px0, align 4
+  %py0 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 0
+  %y0 = load float, float* %py0, align 4
+  %m0 = fmul float %x0, %y0
+  %px1 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 1
+  %x1 = load float, float* %px1, align 4
+  %py1 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 1
+  %y1 = load float, float* %py1, align 4
+  %m1 = fmul float %x1, %y1
+  %px2 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 2
+  %x2 = load float, float* %px2, align 4
+  %py2 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 2
+  %y2 = load float, float* %py2, align 4
+  %m2 = fmul float %x2, %y2
+  %px3 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 3
+  %x3 = load float, float* %px3, align 4
+  %py3 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 3
+  %y3 = load float, float* %py3, align 4
+  %m3 = fmul float %x3, %y3
+  %pz0 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 0
+  %z0 = load float, float* %pz0, align 4
+  %a0 = fadd float %m0, %z0
+  %i0 = insertvalue [4 x float] undef, float %a0, 0
+  %pz1 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 1
+  %z1 = load float, float* %pz1, align 4
+  %a1 = fadd float %m1, %z1
+  %i1 = insertvalue [4 x float] %i0, float %a1, 1
+  %pz2 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 2
+  %z2 = load float, float* %pz2, align 4
+  %a2 = fadd float %m2, %z2
+  %i2 = insertvalue [4 x float] %i1, float %a2, 2
+  %pz3 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 3
+  %z3 = load float, float* %pz3, align 4
+  %a3 = fadd float %m3, %z3
+  %i3 = insertvalue [4 x float] %i2, float %a3, 3
+  store [4 x float] %i3, [4 x float]* %0, align 4
+  ret void
+}
+
+;CHECK: fsub <4 x float>
+define void @julia_load_array_of_float([4 x float]* %a, [4 x float]* %b, [4 x float]* %c) {
+top:
+  %a_arr = load [4 x float], [4 x float]* %a, align 4
+  %a0 = extractvalue [4 x float] %a_arr, 0
+  %a2 = extractvalue [4 x float] %a_arr, 2
+  %a1 = extractvalue [4 x float] %a_arr, 1
+  %b_arr = load [4 x float], [4 x float]* %b, align 4
+  %b0 = extractvalue [4 x float] %b_arr, 0
+  %b2 = extractvalue [4 x float] %b_arr, 2
+  %b1 = extractvalue [4 x float] %b_arr, 1
+  %a3 = extractvalue [4 x float] %a_arr, 3
+  %c1 = fsub float %a1, %b1
+  %b3 = extractvalue [4 x float] %b_arr, 3
+  %c0 = fsub float %a0, %b0
+  %c2 = fsub float %a2, %b2
+  %c_arr0 = insertvalue [4 x float] undef, float %c0, 0
+  %c_arr1 = insertvalue [4 x float] %c_arr0, float %c1, 1
+  %c3 = fsub float %a3, %b3
+  %c_arr2 = insertvalue [4 x float] %c_arr1, float %c2, 2
+  %c_arr3 = insertvalue [4 x float] %c_arr2, float %c3, 3
+  store [4 x float] %c_arr3, [4 x float]* %c, align 4
+  ret void
+}
+
+%pseudovec = type { float, float, float, float }
+
+;CHECK: fsub <4 x float>
+define void @julia_load_struct_of_float(%pseudovec* %a, %pseudovec* %b, %pseudovec* %c) {
+top:
+  %a_struct = load %pseudovec, %pseudovec* %a, align 4
+  %a0 = extractvalue %pseudovec %a_struct, 0
+  %a1 = extractvalue %pseudovec %a_struct, 1
+  %b_struct = load %pseudovec, %pseudovec* %b, align 4
+  %a2 = extractvalue %pseudovec %a_struct, 2
+  %b0 = extractvalue %pseudovec %b_struct, 0
+  %a3 = extractvalue %pseudovec %a_struct, 3
+  %c0 = fsub float %a0, %b0
+  %b1 = extractvalue %pseudovec %b_struct, 1
+  %b2 = extractvalue %pseudovec %b_struct, 2
+  %c1 = fsub float %a1, %b1
+  %c_struct0 = insertvalue %pseudovec undef, float %c0, 0
+  %b3 = extractvalue %pseudovec %b_struct, 3
+  %c3 = fsub float %a3, %b3
+  %c_struct1 = insertvalue %pseudovec %c_struct0, float %c1, 1
+  %c2 = fsub float %a2, %b2
+  %c_struct2 = insertvalue %pseudovec %c_struct1, float %c2, 2
+  %c_struct3 = insertvalue %pseudovec %c_struct2, float %c3, 3
+  store %pseudovec %c_struct3, %pseudovec* %c, align 4
+  ret void
+}