diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1665,7 +1665,73 @@
   case Instruction::FPExt:
     simplifyAndSetOp(I, 0, DemandedElts, UndefElts);
     break;
+  case Instruction::Load: {
+    // Narrow a fixed-width vector load when only a leading run of its
+    // elements is demanded: load just the first getActiveBits() elements
+    // and rebuild a value of the original type whose remaining lanes are
+    // poison.
+    auto *LI = cast<LoadInst>(I);
+
+    // A volatile or atomic access has to be performed exactly as written.
+    // Narrowing it would change the access, and the original load could
+    // not be erased afterwards, so memory would end up being read twice.
+    if (!LI->isSimple())
+      return nullptr;
+
+    unsigned UsedBegin = DemandedElts.countTrailingZeros();
+    unsigned ActiveWidth = DemandedElts.getActiveBits();
+    auto *OldTy = cast<FixedVectorType>(LI->getType());
+    unsigned OldNumElts = OldTy->getNumElements();
+
+    // Bail out when the last element is demanded (nothing to trim) or when
+    // no element is demanded (a zero-element vector type cannot be built).
+    // TODO: Optimize leading elements, which needs adjusting pointers and
+    // alignment.
+    if (ActiveWidth == 0 || ActiveWidth >= OldNumElts)
+      return nullptr;
+
+    unsigned NewNumElts = ActiveWidth;
+    Type *EltTy = OldTy->getElementType();
+    Type *NewTy =
+        (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
+
+    // Typed pointers need a cast to the narrowed pointee type; opaque
+    // pointers are reused as they are.
+    auto *OldPtr = LI->getPointerOperand();
+    Instruction *NewPtr = nullptr;
+    if (!cast<PointerType>(OldPtr->getType())->isOpaque()) {
+      NewPtr = CastInst::CreatePointerCast(
+          OldPtr, PointerType::get(NewTy, LI->getPointerAddressSpace()));
+    }
+
+    auto *NewLI = new LoadInst(NewTy, NewPtr ? NewPtr : OldPtr, LI->getName(),
+                               LI->isVolatile(), LI->getAlign(),
+                               LI->getOrdering(), LI->getSyncScopeID());
+    Instruction *NewI = nullptr;
+    if (NewNumElts == 1) {
+      // Only element 0 is demanded: load a scalar and insert it back at
+      // that position of a poison vector.
+      NewI = InsertElementInst::Create(
+          PoisonValue::get(OldTy), NewLI,
+          ConstantInt::get(Type::getInt32Ty(EltTy->getContext()), UsedBegin));
+    } else {
+      // Widen back to the original type: demanded lanes read the narrowed
+      // load, every other lane reads lane 0 of the poison operand.
+      SmallVector<int> Mask;
+      for (unsigned Idx = 0; Idx < OldNumElts; ++Idx) {
+        // TODO: This needs to be adjusted when optimizing leading elements.
+        Mask.push_back(DemandedElts[Idx] ? Idx : NewNumElts);
+      }
+      NewI = new ShuffleVectorInst(NewLI, PoisonValue::get(NewTy), Mask);
+    }
+    copyMetadataForLoad(*NewLI, *LI);
+    if (NewPtr)
+      InsertNewInstWith(NewPtr, *LI);
+    InsertNewInstWith(NewLI, *LI);
+    InsertNewInstWith(NewI, *LI);
+    return NewI;
+  }
   case Instruction::Call: {
     IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
     if (!II)
       break;
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/simplify-load.ll b/llvm/test/Transforms/InstCombine/AMDGPU/simplify-load.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/simplify-load.ll
@@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck %s
+
+; Make sure the load simplification works for typed pointers.
+define amdgpu_kernel void @simplify_load_vector(<2 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
+; CHECK-LABEL: @simplify_load_vector(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> addrspace(1)* [[IN0:%.*]] to <2 x i32> addrspace(1)*
+; CHECK-NEXT:    [[A1:%.*]] = load <2 x i32>, <2 x i32> addrspace(1)* [[TMP1]], align 16
+; CHECK-NEXT:    store <2 x i32> [[A1]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    ret void
+;
+  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in0, align 16
+  %r = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  store <2 x i32> %r, <2 x i32> addrspace(1)* %out, align 8
+  ret void
+}
diff --git a/llvm/test/Transforms/InstCombine/obfuscated_splat-inseltpoison.ll b/llvm/test/Transforms/InstCombine/obfuscated_splat-inseltpoison.ll
--- a/llvm/test/Transforms/InstCombine/obfuscated_splat-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/obfuscated_splat-inseltpoison.ll
@@ -5,7 +5,7 @@
   %B = shufflevector <4 x float> %A, <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
   %C = shufflevector <4 x float> %B, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
   %D = shufflevector <4 x float> %C, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
-; CHECK:  %D = shufflevector <4 x float> %A, <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK:  %D = shufflevector <4 x float> {{%.*}}, <4 x float> poison, <4 x i32> zeroinitializer
   store <4 x float> %D, ptr %out_ptr
   ret void
 }
diff --git a/llvm/test/Transforms/InstCombine/obfuscated_splat.ll b/llvm/test/Transforms/InstCombine/obfuscated_splat.ll
--- a/llvm/test/Transforms/InstCombine/obfuscated_splat.ll
+++ b/llvm/test/Transforms/InstCombine/obfuscated_splat.ll
@@ -5,7 +5,7 @@
   %B = shufflevector <4 x float> %A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
   %C = shufflevector <4 x float> %B, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
   %D = shufflevector <4 x float> %C, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
-; CHECK:  %D = shufflevector <4 x float> %A, <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK:  %D = shufflevector <4 x float> {{%.*}}, <4 x float> poison, <4 x i32> zeroinitializer
   store <4 x float> %D, ptr %out_ptr
   ret void
 }
diff --git a/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll b/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll
--- a/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll
@@ -4,8 +4,8 @@
 define i32 @extract_load(ptr %p) {
 ;
 ; CHECK-LABEL: @extract_load(
-; CHECK-NEXT:    [[X:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4
-; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x i32> [[X]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[EXT:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1
 ; CHECK-NEXT:    ret i32 [[EXT]]
 ;
   %x = load <4 x i32>, ptr %p, align 4
@@ -183,10 +183,10 @@
 define float @extract_element_load(<4 x float> %x, ptr %ptr) {
 ;
 ; CHECK-LABEL: @extract_element_load(
-; CHECK-NEXT:    [[LOAD:%.*]] = load <4 x float>, ptr [[PTR:%.*]], align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[LOAD]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[X:%.*]], i64 2
-; CHECK-NEXT:    [[R:%.*]] = fadd float [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[PTR:%.*]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x float> [[TMP1]], i64 2
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[X:%.*]], i64 2
+; CHECK-NEXT:    [[R:%.*]] = fadd float [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %load = load <4 x float>, ptr %ptr
diff --git a/llvm/test/Transforms/InstCombine/scalarization.ll b/llvm/test/Transforms/InstCombine/scalarization.ll
--- a/llvm/test/Transforms/InstCombine/scalarization.ll
+++ b/llvm/test/Transforms/InstCombine/scalarization.ll
@@ -4,8 +4,8 @@
 define i32 @extract_load(ptr %p) {
 ;
 ; CHECK-LABEL: @extract_load(
-; CHECK-NEXT:    [[X:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4
-; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x i32> [[X]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[EXT:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1
 ; CHECK-NEXT:    ret i32 [[EXT]]
 ;
   %x = load <4 x i32>, ptr %p, align 4
@@ -183,10 +183,10 @@
 define float @extract_element_load(<4 x float> %x, ptr %ptr) {
 ;
 ; CHECK-LABEL: @extract_element_load(
-; CHECK-NEXT:    [[LOAD:%.*]] = load <4 x float>, ptr [[PTR:%.*]], align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[LOAD]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[X:%.*]], i64 2
-; CHECK-NEXT:    [[R:%.*]] = fadd float [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[PTR:%.*]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x float> [[TMP1]], i64 2
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[X:%.*]], i64 2
+; CHECK-NEXT:    [[R:%.*]] = fadd float [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %load = load <4 x float>, ptr %ptr
diff --git a/llvm/test/Transforms/InstCombine/shuffle-binop.ll b/llvm/test/Transforms/InstCombine/shuffle-binop.ll
--- a/llvm/test/Transforms/InstCombine/shuffle-binop.ll
+++ b/llvm/test/Transforms/InstCombine/shuffle-binop.ll
@@ -188,10 +188,12 @@
 define <2 x double> @shuffle_op2_0th_element_mask(ptr %a, ptr %b) {
 ;%0 = load <2 x double>, ptr @d, align 16
 ; CHECK-LABEL: @shuffle_op2_0th_element_mask(
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[A:%.*]], align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[B:%.*]], align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr [[A:%.*]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr [[B:%.*]], align 16
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP3]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x double> [[SHUFFLE]]
 ;
   %1 = load <2 x double>, ptr %a, align 16
diff --git a/llvm/test/Transforms/InstCombine/sink-into-catchswitch.ll b/llvm/test/Transforms/InstCombine/sink-into-catchswitch.ll
--- a/llvm/test/Transforms/InstCombine/sink-into-catchswitch.ll
+++ b/llvm/test/Transforms/InstCombine/sink-into-catchswitch.ll
@@ -9,8 +9,7 @@
 define void @test1(ptr %p) personality ptr @__CxxFrameHandler3 {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:  invoke.cont:
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[P:%.*]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[P:%.*]], align 8
 ; CHECK-NEXT:    invoke void @throw()
 ; CHECK-NEXT:    to label [[UNREACHABLE:%.*]] unwind label [[CATCH_DISPATCH:%.*]]
 ; CHECK:       catch.dispatch:
@@ -20,7 +19,7 @@
 ; CHECK-NEXT:    invoke void @throw() [ "funclet"(token [[CATCH]]) ]
 ; CHECK-NEXT:    to label [[UNREACHABLE]] unwind label [[EHCLEANUP]]
 ; CHECK:       ehcleanup:
-; CHECK-NEXT:    [[PHI:%.*]] = phi i64 [ [[TMP2]], [[CATCH_DISPATCH]] ], [ 9, [[INVOKE_CONT1:%.*]] ]
+; CHECK-NEXT:    [[PHI:%.*]] = phi i64 [ [[TMP0]], [[CATCH_DISPATCH]] ], [ 9, [[INVOKE_CONT1:%.*]] ]
 ; CHECK-NEXT:    [[CLEANUP:%.*]] = cleanuppad within none []
 ; CHECK-NEXT:    call void @release(i64 [[PHI]]) [ "funclet"(token [[CLEANUP]]) ]
 ; CHECK-NEXT:    cleanupret from [[CLEANUP]] unwind to caller