Index: llvm/lib/Transforms/Vectorize/VectorCombine.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -16,6 +16,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
@@ -33,6 +34,7 @@
 using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "vector-combine"
+STATISTIC(NumVecLoad, "Number of vector loads formed");
 STATISTIC(NumVecCmp, "Number of vector compares formed");
 STATISTIC(NumVecBO, "Number of vector binops formed");
 STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
@@ -65,6 +67,7 @@
   const TargetTransformInfo &TTI;
   const DominatorTree &DT;
 
+  bool vectorizeLoad(Instruction &I);
   ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
                                         ExtractElementInst *Ext1,
                                         unsigned PreferredExtractIndex) const;
@@ -88,6 +91,58 @@
   New.takeName(&Old);
 }
 
+bool VectorCombine::vectorizeLoad(Instruction &I) {
+  // Match regular scalar loads.
+  auto *Load = dyn_cast<LoadInst>(&I);
+  Type *LoadTy = I.getType();
+  if (!Load || !Load->isSimple() || isa<VectorType>(LoadTy))
+    return false;
+
+  // TODO: Extend this to match GEP with constant offsets.
+  Value *PtrOp = Load->getPointerOperand()->stripPointerCasts();
+  auto *PtrOpTy = dyn_cast<PointerType>(PtrOp->getType());
+  if (!PtrOpTy)
+    return false;
+
+  unsigned VectorSize = TTI.getMinVectorRegisterBitWidth();
+  uint64_t ScalarSize = LoadTy->getPrimitiveSizeInBits();
+  if (!ScalarSize || VectorSize % ScalarSize != 0)
+    return false;
+
+
+  // Check safety of replacing the scalar load with a larger vector load.
+  auto *VectorTy = VectorType::get(LoadTy, VectorSize / ScalarSize, false);
+  Align Alignment = Load->getAlign();
+  const DataLayout &DL = I.getModule()->getDataLayout();
+  if (!isSafeToLoadUnconditionally(PtrOp, VectorTy, Alignment, DL, Load, &DT))
+    return false;
+
+  // Original pattern: load [free casts of] ScalarPtr
+  int OldCost = TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
+                                    Load->getPointerAddressSpace());
+
+  // New pattern: extractelt (load VecPtr), 0
+  int NewCost = TTI.getMemoryOpCost(Instruction::Load, VectorTy, Alignment,
+                                    Load->getPointerAddressSpace());
+  NewCost += TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 0);
+
+  // We can aggressively convert to the vector form because the backend will
+  // invert this transform if it does not result in a performance win.
+  if (OldCost < NewCost)
+    return false;
+
+  // It is safe and potentially profitable to load a vector and extract the
+  // scalar value from that:
+  // load (bitcast VecPtr to ScalarPtr) --> extractelt (load VecPtr), 0
+  IRBuilder<> Builder(Load);
+  Value *CastedPtr = Builder.CreateBitCast(PtrOp, VectorTy->getPointerTo());
+  LoadInst *VecLd = Builder.CreateAlignedLoad(VectorTy, CastedPtr, Alignment);
+  Value *ExtElt = Builder.CreateExtractElement(VecLd, Builder.getInt32(0));
+  replaceValue(I, *ExtElt);
+  ++NumVecLoad;
+  return true;
+}
+
 /// Determine which, if any, of the inputs should be replaced by a shuffle
 /// followed by extract from a different index.
 ExtractElementInst *VectorCombine::getShuffleExtract(
@@ -625,6 +680,7 @@
       if (isa<DbgInfoIntrinsic>(I))
         continue;
       Builder.SetInsertPoint(&I);
+      MadeChange |= vectorizeLoad(I);
       MadeChange |= foldExtractExtract(I);
       MadeChange |= foldBitcastShuf(I);
       MadeChange |= scalarizeBinopOrCmp(I);
Index: llvm/test/Transforms/VectorCombine/X86/load.ll
===================================================================
--- llvm/test/Transforms/VectorCombine/X86/load.ll
+++ llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -6,13 +6,17 @@
 
 define float @matching_fp_scalar(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @matching_fp_scalar(
-; CHECK-NEXT:    [[R:%.*]] = load float, float* [[P:%.*]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %r = load float, float* %p, align 16
   ret float %r
 }
 
+; Negative test - do not alter volatile.
+
 define float @matching_fp_scalar_volatile(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @matching_fp_scalar_volatile(
 ; CHECK-NEXT:    [[R:%.*]] = load volatile float, float* [[P:%.*]], align 16
@@ -24,8 +28,9 @@
 
 define double @larger_fp_scalar(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @larger_fp_scalar(
-; CHECK-NEXT:    [[BC:%.*]] = bitcast float* [[P:%.*]] to double*
-; CHECK-NEXT:    [[R:%.*]] = load double, double* [[BC]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 16
+; CHECK-NEXT:    [[R:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
 ; CHECK-NEXT:    ret double [[R]]
 ;
   %bc = bitcast float* %p to double*
@@ -35,8 +40,9 @@
 
 define float @smaller_fp_scalar(double* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @smaller_fp_scalar(
-; CHECK-NEXT:    [[BC:%.*]] = bitcast double* [[P:%.*]] to float*
-; CHECK-NEXT:    [[R:%.*]] = load float, float* [[BC]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %bc = bitcast double* %p to float*
@@ -46,8 +52,8 @@
 
 define float @matching_fp_vector(<4 x float>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @matching_fp_vector(
-; CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x float>* [[P:%.*]] to float*
-; CHECK-NEXT:    [[R:%.*]] = load float, float* [[BC]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 16
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %bc = bitcast <4 x float>* %p to float*
@@ -55,10 +61,12 @@
   ret float %r
 }
 
+; This is canonical form for load of vector element 0.
+
 define float @matching_fp_vector_gep0(<4 x float>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @matching_fp_vector_gep0(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 0, i64 0
-; CHECK-NEXT:    [[R:%.*]] = load float, float* [[GEP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 16
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 0
@@ -66,10 +74,13 @@
   ret float %r
 }
 
+; Pointer types do not affect the transform - only the loaded type is considered for extract cost.
+
 define float @nonmatching_int_vector(<2 x i64>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @nonmatching_int_vector(
-; CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64>* [[P:%.*]] to float*
-; CHECK-NEXT:    [[R:%.*]] = load float, float* [[BC]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64>* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %bc = bitcast <2 x i64>* %p to float*
@@ -79,13 +90,17 @@
 
 define double @less_aligned(double* align 4 dereferenceable(16) %p) {
 ; CHECK-LABEL: @less_aligned(
-; CHECK-NEXT:    [[R:%.*]] = load double, double* [[P:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[P:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
+; CHECK-NEXT:    [[R:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
 ; CHECK-NEXT:    ret double [[R]]
 ;
   %r = load double, double* %p, align 4
   ret double %r
 }
 
+; Negative test - not enough dereferenceable bytes.
+
 define float @matching_fp_scalar_small_deref(float* align 16 dereferenceable(15) %p) {
 ; CHECK-LABEL: @matching_fp_scalar_small_deref(
 ; CHECK-NEXT:    [[R:%.*]] = load float, float* [[P:%.*]], align 16
@@ -95,6 +110,8 @@
   ret float %r
 }
 
+; Negative test - extract from vector to int is not free.
+
 define i64 @larger_int_scalar(<4 x float>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @larger_int_scalar(
 ; CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x float>* [[P:%.*]] to i64*
@@ -106,6 +123,8 @@
   ret i64 %r
 }
 
+; Negative test - extract from vector to int is not free.
+
 define i8 @smaller_int_scalar(<4 x float>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @smaller_int_scalar(
 ; CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x float>* [[P:%.*]] to i8*
@@ -117,10 +136,13 @@
   ret i8 %r
 }
 
+; We could load a 32-byte vector, but we favor the minimum vector size.
+
 define double @larger_fp_scalar_256bit_vec(<8 x float>* align 32 dereferenceable(32) %p) {
 ; CHECK-LABEL: @larger_fp_scalar_256bit_vec(
-; CHECK-NEXT:    [[BC:%.*]] = bitcast <8 x float>* [[P:%.*]] to double*
-; CHECK-NEXT:    [[R:%.*]] = load double, double* [[BC]], align 32
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x float>* [[P:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 32
+; CHECK-NEXT:    [[R:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
 ; CHECK-NEXT:    ret double [[R]]
 ;
   %bc = bitcast <8 x float>* %p to double*
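
A minimal before/after sketch of the rewrite that vectorizeLoad() performs, assuming a target whose minimum vector register width is 128 bits (as in the X86 tests above). The function name @sketch and the value names are illustrative only and not part of the patch; the pass is driven by the test file's RUN line (for example via -passes=vector-combine).

; Input: a simple scalar load from a pointer known to be sufficiently
; aligned and dereferenceable for a full vector access.
define float @sketch(float* align 16 dereferenceable(16) %p) {
  %r = load float, float* %p, align 16
  ret float %r
}

; Expected output: the pointer is bitcast to a vector pointer, a 128-bit
; vector is loaded, and lane 0 is extracted as the original scalar value.
define float @sketch(float* align 16 dereferenceable(16) %p) {
  %vp = bitcast float* %p to <4 x float>*
  %v = load <4 x float>, <4 x float>* %vp, align 16
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

The conversion is gated on the TTI cost check in the patch: the <4 x float> load plus the lane-0 extract must not be modeled as more expensive than the original scalar load, and the backend is expected to invert the transform where it does not pay off.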