diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -83,6 +83,7 @@
   bool foldBitcastShuf(Instruction &I);
   bool scalarizeBinopOrCmp(Instruction &I);
   bool foldExtractedCmps(Instruction &I);
+  bool scalarizeLoadExtract(Instruction &I);
 };
 } // namespace
@@ -754,6 +755,83 @@
   return true;
 }
+/// Try to scalarize vector loads feeding extractelement instructions.
+bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
+  auto *EI = dyn_cast<ExtractElementInst>(&I);
+  if (!EI)
+    return false;
+
+  auto *LI = dyn_cast<LoadInst>(EI->getOperand(0));
+  const DataLayout &DL = I.getModule()->getDataLayout();
+  if (!LI || LI->isVolatile() || !DL.typeSizeEqualsStoreSize(LI->getType()))
+    return false;
+
+  auto *FixedVT = dyn_cast<FixedVectorType>(LI->getType());
+  if (!FixedVT)
+    return false;
+
+  InstructionCost OriginalCost = TTI.getMemoryOpCost(
+      Instruction::Load, LI->getType(), Align(LI->getAlignment()),
+      LI->getPointerAddressSpace());
+  InstructionCost ScalarizedCost = 0;
+
+  Instruction *LastCheckedInst = LI;
+  unsigned NumInstChecked = 0;
+  // Check if all users of the load are extracts with no memory modifications
+  // between the load and the extract. Compute the cost of both the original
+  // code and the scalarized version.
+  for (User *U : LI->users()) {
+    auto *UI = dyn_cast<ExtractElementInst>(U);
+    if (!UI || UI->getParent() != LI->getParent())
+      return false;
+
+    // Check if any instruction between the load and the extract may modify
+    // memory.
+    if (LastCheckedInst->comesBefore(UI)) {
+      for (Instruction &I :
+           make_range(std::next(LI->getIterator()), UI->getIterator())) {
+        // Bail out if we reached the check limit or the instruction may write
+        // to memory.
+        if (NumInstChecked == 6 || I.mayWriteToMemory())
+          return false;
+        NumInstChecked++;
+      }
+      // Remember the latest extract scanned so far, so instructions before it
+      // are not re-scanned for later extracts.
+      LastCheckedInst = UI;
+    }
+
+    auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1));
+    OriginalCost +=
+        TTI.getVectorInstrCost(Instruction::ExtractElement, LI->getType(),
+                               Index ? Index->getZExtValue() : -1);
+    ScalarizedCost +=
+        TTI.getMemoryOpCost(Instruction::Load, FixedVT->getElementType(),
+                            Align(1), LI->getPointerAddressSpace());
+    ScalarizedCost += TTI.getAddressComputationCost(FixedVT->getElementType());
+  }
+
+  if (ScalarizedCost >= OriginalCost)
+    return false;
+
+  // Replace extracts with narrow scalar loads.
+  for (User *U : LI->users()) {
+    auto *EI = cast<ExtractElementInst>(U);
+    IRBuilder<>::InsertPointGuard Guard(Builder);
+    Builder.SetInsertPoint(EI);
+    Value *GEP = Builder.CreateInBoundsGEP(
+        FixedVT, LI->getOperand(0), {Builder.getInt32(0), EI->getOperand(1)});
+    auto *NewLoad = cast<LoadInst>(Builder.CreateLoad(
+        FixedVT->getElementType(), GEP, EI->getName() + ".scalar"));
+    NewLoad->setAlignment(Align(1));
+    replaceValue(*EI, *NewLoad);
+  }
+
+  return true;
+}
+
 /// This is the entry point for all transforms. Pass manager differences are
 /// handled in the callers of this function.
 bool VectorCombine::run() {
@@ -782,6 +860,7 @@
       MadeChange |= foldBitcastShuf(I);
       MadeChange |= scalarizeBinopOrCmp(I);
       MadeChange |= foldExtractedCmps(I);
+      MadeChange |= scalarizeLoadExtract(I);
     }
   }
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll b/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll
--- a/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll
@@ -3,8 +3,8 @@
 define i32 @load_extract_idx_0(<4 x i32>* %x) {
 ; CHECK-LABEL: @load_extract_idx_0(
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, <4 x i32>* [[X:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X:%.*]], i32 0, i32 3
+; CHECK-NEXT:    [[R:%.*]] = load i32, i32* [[TMP1]], align 1
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %lv = load <4 x i32>, <4 x i32>* %x
@@ -14,8 +14,8 @@
 define i32 @load_extract_idx_1(<4 x i32>* %x) {
 ; CHECK-LABEL: @load_extract_idx_1(
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, <4 x i32>* [[X:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X:%.*]], i32 0, i32 1
+; CHECK-NEXT:    [[R:%.*]] = load i32, i32* [[TMP1]], align 1
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %lv = load <4 x i32>, <4 x i32>* %x
@@ -25,8 +25,8 @@
 define i32 @load_extract_idx_2(<4 x i32>* %x) {
 ; CHECK-LABEL: @load_extract_idx_2(
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, <4 x i32>* [[X:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X:%.*]], i32 0, i32 2
+; CHECK-NEXT:    [[R:%.*]] = load i32, i32* [[TMP1]], align 1
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %lv = load <4 x i32>, <4 x i32>* %x
@@ -36,8 +36,8 @@
 define i32 @load_extract_idx_3(<4 x i32>* %x) {
 ; CHECK-LABEL: @load_extract_idx_3(
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, <4 x i32>* [[X:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X:%.*]], i32 0, i32 3
+; CHECK-NEXT:    [[R:%.*]] = load i32, i32* [[TMP1]], align 1
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %lv = load <4 x i32>, <4 x i32>* %x
@@ -47,8 +47,8 @@
 define i32 @load_extract_idx_var_i64(<4 x i32>* %x, i64 %idx) {
 ; CHECK-LABEL: @load_extract_idx_var_i64(
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, <4 x i32>* [[X:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[LV]], i64 [[IDX:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X:%.*]], i32 0, i64 [[IDX:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = load i32, i32* [[TMP1]], align 1
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %lv = load <4 x i32>, <4 x i32>* %x
@@ -58,8 +58,8 @@
 define i32 @load_extract_idx_var_i32(<4 x i32>* %x, i32 %idx) {
 ; CHECK-LABEL: @load_extract_idx_var_i32(
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, <4 x i32>* [[X:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 [[IDX:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X:%.*]], i32 0, i32 [[IDX:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = load i32, i32* [[TMP1]], align 1
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %lv = load <4 x i32>, <4 x i32>* %x
@@ -72,8 +72,8 @@
 define i32 @load_extract_clobber_call_before(<4 x i32>* %x) {
 ; CHECK-LABEL: @load_extract_clobber_call_before(
 ; CHECK-NEXT:    call void @clobber()
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, <4 x i32>* [[X:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X:%.*]], i32 0, i32 2
+; CHECK-NEXT:    [[R:%.*]] = load i32, i32* [[TMP1]], align 1
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   call void @clobber()
@@ -97,8 +97,8 @@
 define i32 @load_extract_clobber_call_after(<4 x i32>* %x) {
 ; CHECK-LABEL: @load_extract_clobber_call_after(
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, <4 x i32>* [[X:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X:%.*]], i32 0, i32 2
+; CHECK-NEXT:    [[R:%.*]] = load i32, i32* [[TMP1]], align 1
 ; CHECK-NEXT:    call void @clobber()
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
@@ -111,8 +111,8 @@
 define i32 @load_extract_clobber_store_before(<4 x i32>* %x, i8* %y) {
 ; CHECK-LABEL: @load_extract_clobber_store_before(
 ; CHECK-NEXT:    store i8 0, i8* [[Y:%.*]], align 1
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, <4 x i32>* [[X:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X:%.*]], i32 0, i32 2
+; CHECK-NEXT:    [[R:%.*]] = load i32, i32* [[TMP1]], align 1
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   store i8 0, i8* %y
@@ -255,9 +255,10 @@
 ; Scalarizing may or may not be profitable, depending on the target.
 define i32 @load_multiple_2_with_variable_indices(<4 x i32>* %x, i64 %idx.0, i64 %idx.1) {
 ; CHECK-LABEL: @load_multiple_2_with_variable_indices(
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, <4 x i32>* [[X:%.*]], align 16
-; CHECK-NEXT:    [[E_0:%.*]] = extractelement <4 x i32> [[LV]], i64 [[IDX_0:%.*]]
-; CHECK-NEXT:    [[E_1:%.*]] = extractelement <4 x i32> [[LV]], i64 [[IDX_1:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X:%.*]], i32 0, i64 [[IDX_0:%.*]]
+; CHECK-NEXT:    [[E_0:%.*]] = load i32, i32* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X]], i32 0, i64 [[IDX_1:%.*]]
+; CHECK-NEXT:    [[E_1:%.*]] = load i32, i32* [[TMP2]], align 1
 ; CHECK-NEXT:    [[RES:%.*]] = add i32 [[E_0]], [[E_1]]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
@@ -270,11 +271,14 @@
 define i32 @load_4_extracts_with_variable_indices_short_vector(<4 x i32>* %x, i64 %idx.0, i64 %idx.1, i64 %idx.2, i64 %idx.3) {
 ; CHECK-LABEL: @load_4_extracts_with_variable_indices_short_vector(
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, <4 x i32>* [[X:%.*]], align 16
-; CHECK-NEXT:    [[E_0:%.*]] = extractelement <4 x i32> [[LV]], i64 [[IDX_0:%.*]]
-; CHECK-NEXT:    [[E_1:%.*]] = extractelement <4 x i32> [[LV]], i64 [[IDX_1:%.*]]
-; CHECK-NEXT:    [[E_2:%.*]] = extractelement <4 x i32> [[LV]], i64 [[IDX_2:%.*]]
-; CHECK-NEXT:    [[E_3:%.*]] = extractelement <4 x i32> [[LV]], i64 [[IDX_3:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X:%.*]], i32 0, i64 [[IDX_0:%.*]]
+; CHECK-NEXT:    [[E_0:%.*]] = load i32, i32* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X]], i32 0, i64 [[IDX_1:%.*]]
+; CHECK-NEXT:    [[E_1:%.*]] = load i32, i32* [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X]], i32 0, i64 [[IDX_2:%.*]]
+; CHECK-NEXT:    [[E_2:%.*]] = load i32, i32* [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[X]], i32 0, i64 [[IDX_3:%.*]]
+; CHECK-NEXT:    [[E_3:%.*]] = load i32, i32* [[TMP4]], align 1
 ; CHECK-NEXT:    [[RES_0:%.*]] = add i32 [[E_0]], [[E_1]]
 ; CHECK-NEXT:    [[RES_1:%.*]] = add i32 [[RES_0]], [[E_2]]
 ; CHECK-NEXT:    [[RES_2:%.*]] = add i32 [[RES_1]], [[E_3]]
@@ -293,9 +297,10 @@
 define i32 @load_multiple_extracts_with_variable_indices_large_vector(<16 x i32>* %x, i64 %idx.0, i64 %idx.1) {
 ; CHECK-LABEL: @load_multiple_extracts_with_variable_indices_large_vector(
-; CHECK-NEXT:    [[LV:%.*]] = load <16 x i32>, <16 x i32>* [[X:%.*]], align 64
-; CHECK-NEXT:    [[E_0:%.*]] = extractelement <16 x i32> [[LV]], i64 [[IDX_0:%.*]]
-; CHECK-NEXT:    [[E_1:%.*]] = extractelement <16 x i32> [[LV]], i64 [[IDX_1:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <16 x i32>, <16 x i32>* [[X:%.*]], i32 0, i64 [[IDX_0:%.*]]
+; CHECK-NEXT:    [[E_0:%.*]] = load i32, i32* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <16 x i32>, <16 x i32>* [[X]], i32 0, i64 [[IDX_1:%.*]]
+; CHECK-NEXT:    [[E_1:%.*]] = load i32, i32* [[TMP2]], align 1
 ; CHECK-NEXT:    [[RES:%.*]] = add i32 [[E_0]], [[E_1]]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -630,14 +630,14 @@
 define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(<2 x i16>* align 1 dereferenceable(16) %p) {
 ; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
 ; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1
-; SSE2-NEXT:    [[L:%.*]] = load <2 x i16>, <2 x i16>* [[GEP]], align 8
-; SSE2-NEXT:    [[S:%.*]] = extractelement <2 x i16> [[L]], i32 0
+; SSE2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[GEP]], i32 0, i32 0
+; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[TMP1]], align 1
 ; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0
 ; SSE2-NEXT:    ret <8 x i16> [[R]]
 ;
 ; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
 ; AVX2-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16>* [[P:%.*]] to <8 x i16>*
-; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 4
+; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
 ; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX2-NEXT:    ret <8 x i16> [[R]]
 ;
diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll
--- a/llvm/test/Transforms/VectorCombine/X86/load.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -630,14 +630,14 @@
 define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(<2 x i16>* align 1 dereferenceable(16) %p) {
 ; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
 ; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1
-; SSE2-NEXT:    [[L:%.*]] = load <2 x i16>, <2 x i16>* [[GEP]], align 8
-; SSE2-NEXT:    [[S:%.*]] = extractelement <2 x i16> [[L]], i32 0
+; SSE2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[GEP]], i32 0, i32 0
+; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[TMP1]], align 1
 ; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
 ; SSE2-NEXT:    ret <8 x i16> [[R]]
 ;
 ; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
 ; AVX2-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16>* [[P:%.*]] to <8 x i16>*
-; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 4
+; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
 ; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX2-NEXT:    ret <8 x i16> [[R]]
 ;
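
Note for reviewers: as a quick, hand-written illustration of the rewrite (mirroring the FileCheck expectations above; not generated output, and the value names are illustrative), an extract from a vector load such as

  %lv = load <4 x i32>, <4 x i32>* %x
  %r = extractelement <4 x i32> %lv, i32 2

is turned into an inbounds address computation plus a narrow scalar load:

  %tmp = getelementptr inbounds <4 x i32>, <4 x i32>* %x, i32 0, i32 2
  %r = load i32, i32* %tmp, align 1

The new scalar load conservatively uses align 1, and the transform only fires when the TTI cost of the per-element loads plus address computation is strictly less than the cost of the original vector load and extracts.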