diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -766,7 +766,8 @@
 // pattern which suggests that the loads can be combined. The one and only use
 // of the loads is to form a wider load.
 static bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL,
-                                 TargetTransformInfo &TTI, AliasAnalysis &AA) {
+                                 TargetTransformInfo &TTI, AliasAnalysis &AA,
+                                 const DominatorTree &DT) {
   // Only consider load chains of scalar values.
   if (isa<VectorType>(I.getType()))
     return false;
@@ -791,15 +792,17 @@
   if (!Allowed || !Fast)
     return false;
 
-  // Make sure the Load pointer of type GEP/non-GEP is above insert point
-  Instruction *Inst = dyn_cast<Instruction>(LI1->getPointerOperand());
-  if (Inst && Inst->getParent() == LI1->getParent() &&
-      !Inst->comesBefore(LOps.RootInsert))
-    Inst->moveBefore(LOps.RootInsert);
-
-  // New load can be generated
+  // Get the Index and Ptr for the new GEP.
   Value *Load1Ptr = LI1->getPointerOperand();
   Builder.SetInsertPoint(LOps.RootInsert);
+  if (!DT.dominates(Load1Ptr, LOps.RootInsert)) {
+    APInt Offset1(DL.getIndexTypeSizeInBits(Load1Ptr->getType()), 0);
+    Load1Ptr = Load1Ptr->stripAndAccumulateConstantOffsets(
+        DL, Offset1, /* AllowNonInbounds */ true);
+    Load1Ptr = Builder.CreateGEP(Builder.getInt8Ty(), Load1Ptr,
+                                 Builder.getInt32(Offset1.getZExtValue()));
+  }
+  // Generate wider load.
   Value *NewPtr = Builder.CreateBitCast(Load1Ptr, WiderType->getPointerTo(AS));
   NewLoad = Builder.CreateAlignedLoad(WiderType, NewPtr, LI1->getAlign(),
                                       LI1->isVolatile(), "");
@@ -936,7 +939,7 @@
   MadeChange |= tryToRecognizePopCount(I);
   MadeChange |= tryToFPToSat(I, TTI);
   MadeChange |= tryToRecognizeTableBasedCttz(I);
-  MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA);
+  MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
   MadeChange |= foldPatternedLoads(I, DL);
   // NOTE: This function introduces erasing of the instruction `I`, so it
   // needs to be called at the end of this sequence, otherwise we may make
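The hunk above replaces the old "hoist the defining instruction" workaround with a dominance check plus pointer rebasing. A minimal standalone sketch of that rebasing step, assuming a pointer whose GEP/bitcast chain carries only constant offsets; the helper and its name are illustrative, not part of the patch:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Sketch: if Ptr is not already usable at InsertPt, peel its constant GEP
// offsets back to the underlying base pointer and re-materialize a single
// byte-wise GEP at the insert point. Mirrors the hunk above.
static Value *rebaseToDominatingPtr(Value *Ptr, Instruction *InsertPt,
                                    const DataLayout &DL,
                                    const DominatorTree &DT) {
  if (DT.dominates(Ptr, InsertPt))
    return Ptr; // Already defined above the insert point; nothing to do.

  // Walk the GEP/bitcast chain down to its base, accumulating the total
  // constant byte offset into Offset.
  APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
  Value *Base = Ptr->stripAndAccumulateConstantOffsets(
      DL, Offset, /*AllowNonInbounds=*/true);

  // Re-create the addressing as one i8 GEP at the insert point.
  IRBuilder<> Builder(InsertPt);
  return Builder.CreateGEP(Builder.getInt8Ty(), Base,
                           Builder.getInt32(Offset.getZExtValue()));
}

As in the patch, this relies on the stripped base (a function argument, or an instruction above the insert point) dominating InsertPt; stripAndAccumulateConstantOffsets looks through bitcasts and constant GEPs, which is exactly what the nested_gep and bitcast_gep tests below exercise.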
diff --git a/llvm/test/Transforms/AggressiveInstCombine/AArch64/or-load.ll b/llvm/test/Transforms/AggressiveInstCombine/AArch64/or-load.ll
--- a/llvm/test/Transforms/AggressiveInstCombine/AArch64/or-load.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/AArch64/or-load.ll
@@ -1869,8 +1869,8 @@
 
 define i32 @loadCombine_4consecutive_badinsert3(ptr %p) {
 ; LE-LABEL: @loadCombine_4consecutive_badinsert3(
-; LE-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 1
-; LE-NEXT:    [[L1:%.*]] = load i32, ptr [[P1]], align 1
+; LE-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 1
+; LE-NEXT:    [[L1:%.*]] = load i32, ptr [[TMP1]], align 1
 ; LE-NEXT:    ret i32 [[L1]]
 ;
 ; BE-LABEL: @loadCombine_4consecutive_badinsert3(
@@ -2085,3 +2085,82 @@
   %o3 = or i32 %o2, %e1
   ret i32 %o3
 }
+
+define void @nested_gep(ptr %p, ptr %dest) {
+; LE-LABEL: @nested_gep(
+; LE-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 68
+; LE-NEXT:    [[LD2:%.*]] = load i64, ptr [[TMP1]], align 4
+; LE-NEXT:    [[TRUNC:%.*]] = trunc i64 [[LD2]] to i32
+; LE-NEXT:    store i32 [[TRUNC]], ptr [[DEST:%.*]], align 4
+; LE-NEXT:    ret void
+;
+; BE-LABEL: @nested_gep(
+; BE-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 72
+; BE-NEXT:    [[LD1:%.*]] = load i32, ptr [[GEP1]], align 4
+; BE-NEXT:    [[LD1_ZEXT:%.*]] = zext i32 [[LD1]] to i64
+; BE-NEXT:    [[LD1_SHL:%.*]] = shl nuw i64 [[LD1_ZEXT]], 32
+; BE-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 64
+; BE-NEXT:    [[FINAL_PTR:%.*]] = getelementptr inbounds i8, ptr [[GEP2]], i64 4
+; BE-NEXT:    [[LD2:%.*]] = load i32, ptr [[FINAL_PTR]], align 4
+; BE-NEXT:    [[LD2_ZEXT:%.*]] = zext i32 [[LD2]] to i64
+; BE-NEXT:    [[OR:%.*]] = or i64 [[LD1_SHL]], [[LD2_ZEXT]]
+; BE-NEXT:    [[ADD:%.*]] = add i64 [[OR]], 0
+; BE-NEXT:    [[TRUNC:%.*]] = trunc i64 [[ADD]] to i32
+; BE-NEXT:    store i32 [[TRUNC]], ptr [[DEST:%.*]], align 4
+; BE-NEXT:    ret void
+;
+  %gep1 = getelementptr inbounds i8, ptr %p, i64 72
+  %ld1 = load i32, ptr %gep1, align 4
+  %ld1_zext = zext i32 %ld1 to i64
+  %ld1_shl = shl nuw i64 %ld1_zext, 32
+  %gep2 = getelementptr inbounds i8, ptr %p, i64 64
+  ; Don't move final_ptr before gep2
+  %final_ptr = getelementptr inbounds i8, ptr %gep2, i64 4
+  %ld2 = load i32, ptr %final_ptr, align 4
+  %ld2_zext = zext i32 %ld2 to i64
+  %or = or i64 %ld1_shl, %ld2_zext
+  %add = add i64 %or, 0
+  %trunc = trunc i64 %add to i32
+  store i32 %trunc, ptr %dest, align 4
+  ret void
+}
+
+
+define void @bitcast_gep(ptr %p, ptr %dest) {
+; LE-LABEL: @bitcast_gep(
+; LE-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 68
+; LE-NEXT:    [[LD2:%.*]] = load i64, ptr [[TMP1]], align 4
+; LE-NEXT:    [[TRUNC:%.*]] = trunc i64 [[LD2]] to i32
+; LE-NEXT:    store i32 [[TRUNC]], ptr [[DEST:%.*]], align 4
+; LE-NEXT:    ret void
+;
+; BE-LABEL: @bitcast_gep(
+; BE-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 72
+; BE-NEXT:    [[LD1:%.*]] = load i32, ptr [[GEP1]], align 4
+; BE-NEXT:    [[LD1_ZEXT:%.*]] = zext i32 [[LD1]] to i64
+; BE-NEXT:    [[LD1_SHL:%.*]] = shl nuw i64 [[LD1_ZEXT]], 32
+; BE-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 68
+; BE-NEXT:    [[FINAL_PTR:%.*]] = bitcast ptr [[GEP2]] to ptr
+; BE-NEXT:    [[LD2:%.*]] = load i32, ptr [[FINAL_PTR]], align 4
+; BE-NEXT:    [[LD2_ZEXT:%.*]] = zext i32 [[LD2]] to i64
+; BE-NEXT:    [[OR:%.*]] = or i64 [[LD1_SHL]], [[LD2_ZEXT]]
+; BE-NEXT:    [[ADD:%.*]] = add i64 [[OR]], 0
+; BE-NEXT:    [[TRUNC:%.*]] = trunc i64 [[ADD]] to i32
+; BE-NEXT:    store i32 [[TRUNC]], ptr [[DEST:%.*]], align 4
+; BE-NEXT:    ret void
+;
+  %gep1 = getelementptr inbounds i8, ptr %p, i64 72
+  %ld1 = load i32, ptr %gep1, align 4
+  %ld1_zext = zext i32 %ld1 to i64
+  %ld1_shl = shl nuw i64 %ld1_zext, 32
+  %gep2 = getelementptr inbounds i8, ptr %p, i64 68
+  ; Don't move final_ptr before gep2
+  %final_ptr = bitcast ptr %gep2 to ptr
+  %ld2 = load i32, ptr %final_ptr, align 4
+  %ld2_zext = zext i32 %ld2 to i64
+  %or = or i64 %ld1_shl, %ld2_zext
+  %add = add i64 %or, 0
+  %trunc = trunc i64 %add to i32
+  store i32 %trunc, ptr %dest, align 4
+  ret void
+}
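In the LE runs of the two tests above, the i32 loads at offsets 68 and 72 fold into a single i64 load at offset 68. A quick host-side check of that byte-order argument; a sketch assuming a little-endian host, with arbitrary buffer contents standing in for the memory behind %p:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  unsigned char buf[80];
  for (int i = 0; i < 80; ++i)
    buf[i] = static_cast<unsigned char>(i * 7 + 3);

  uint32_t lo, hi;
  std::memcpy(&lo, buf + 68, 4); // %ld2: the i32 load at offset 68
  std::memcpy(&hi, buf + 72, 4); // %ld1: the i32 load at offset 72
  uint64_t combined = (static_cast<uint64_t>(hi) << 32) | lo;

  uint64_t wide;
  std::memcpy(&wide, buf + 68, 8); // the folded i64 load at offset 68

  // On a little-endian host the wide load equals the shl/or combination,
  // which is why the LE prefixes fold while the BE prefixes do not.
  assert(wide == combined);
  return 0;
}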
diff --git a/llvm/test/Transforms/AggressiveInstCombine/X86/or-load.ll b/llvm/test/Transforms/AggressiveInstCombine/X86/or-load.ll
--- a/llvm/test/Transforms/AggressiveInstCombine/X86/or-load.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/X86/or-load.ll
@@ -2005,8 +2005,8 @@
 
 define i32 @loadCombine_4consecutive_badinsert3(ptr %p) {
 ; LE-LABEL: @loadCombine_4consecutive_badinsert3(
-; LE-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 1
-; LE-NEXT:    [[L1:%.*]] = load i32, ptr [[P1]], align 1
+; LE-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 1
+; LE-NEXT:    [[L1:%.*]] = load i32, ptr [[TMP1]], align 1
 ; LE-NEXT:    ret i32 [[L1]]
 ;
 ; BE-LABEL: @loadCombine_4consecutive_badinsert3(
@@ -2303,3 +2303,82 @@
   %o7 = or i64 %s1, %s0
   ret i64 %o7
 }
+
+define void @nested_gep(ptr %p, ptr %dest) {
+; LE-LABEL: @nested_gep(
+; LE-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 68
+; LE-NEXT:    [[LD2:%.*]] = load i64, ptr [[TMP1]], align 4
+; LE-NEXT:    [[TRUNC:%.*]] = trunc i64 [[LD2]] to i32
+; LE-NEXT:    store i32 [[TRUNC]], ptr [[DEST:%.*]], align 4
+; LE-NEXT:    ret void
+;
+; BE-LABEL: @nested_gep(
+; BE-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 72
+; BE-NEXT:    [[LD1:%.*]] = load i32, ptr [[GEP1]], align 4
+; BE-NEXT:    [[LD1_ZEXT:%.*]] = zext i32 [[LD1]] to i64
+; BE-NEXT:    [[LD1_SHL:%.*]] = shl nuw i64 [[LD1_ZEXT]], 32
+; BE-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 64
+; BE-NEXT:    [[FINAL_PTR:%.*]] = getelementptr inbounds i8, ptr [[GEP2]], i64 4
+; BE-NEXT:    [[LD2:%.*]] = load i32, ptr [[FINAL_PTR]], align 4
+; BE-NEXT:    [[LD2_ZEXT:%.*]] = zext i32 [[LD2]] to i64
+; BE-NEXT:    [[OR:%.*]] = or i64 [[LD1_SHL]], [[LD2_ZEXT]]
+; BE-NEXT:    [[ADD:%.*]] = add i64 [[OR]], 0
+; BE-NEXT:    [[TRUNC:%.*]] = trunc i64 [[ADD]] to i32
+; BE-NEXT:    store i32 [[TRUNC]], ptr [[DEST:%.*]], align 4
+; BE-NEXT:    ret void
+;
+  %gep1 = getelementptr inbounds i8, ptr %p, i64 72
+  %ld1 = load i32, ptr %gep1, align 4
+  %ld1_zext = zext i32 %ld1 to i64
+  %ld1_shl = shl nuw i64 %ld1_zext, 32
+  %gep2 = getelementptr inbounds i8, ptr %p, i64 64
+  ; Don't move final_ptr before gep2
+  %final_ptr = getelementptr inbounds i8, ptr %gep2, i64 4
+  %ld2 = load i32, ptr %final_ptr, align 4
+  %ld2_zext = zext i32 %ld2 to i64
+  %or = or i64 %ld1_shl, %ld2_zext
+  %add = add i64 %or, 0
+  %trunc = trunc i64 %add to i32
+  store i32 %trunc, ptr %dest, align 4
+  ret void
+}
+
+
+define void @bitcast_gep(ptr %p, ptr %dest) {
+; LE-LABEL: @bitcast_gep(
+; LE-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 68
+; LE-NEXT:    [[LD2:%.*]] = load i64, ptr [[TMP1]], align 4
+; LE-NEXT:    [[TRUNC:%.*]] = trunc i64 [[LD2]] to i32
+; LE-NEXT:    store i32 [[TRUNC]], ptr [[DEST:%.*]], align 4
+; LE-NEXT:    ret void
+;
+; BE-LABEL: @bitcast_gep(
+; BE-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 72
+; BE-NEXT:    [[LD1:%.*]] = load i32, ptr [[GEP1]], align 4
+; BE-NEXT:    [[LD1_ZEXT:%.*]] = zext i32 [[LD1]] to i64
+; BE-NEXT:    [[LD1_SHL:%.*]] = shl nuw i64 [[LD1_ZEXT]], 32
+; BE-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 68
+; BE-NEXT:    [[FINAL_PTR:%.*]] = bitcast ptr [[GEP2]] to ptr
+; BE-NEXT:    [[LD2:%.*]] = load i32, ptr [[FINAL_PTR]], align 4
+; BE-NEXT:    [[LD2_ZEXT:%.*]] = zext i32 [[LD2]] to i64
+; BE-NEXT:    [[OR:%.*]] = or i64 [[LD1_SHL]], [[LD2_ZEXT]]
+; BE-NEXT:    [[ADD:%.*]] = add i64 [[OR]], 0
+; BE-NEXT:    [[TRUNC:%.*]] = trunc i64 [[ADD]] to i32
+; BE-NEXT:    store i32 [[TRUNC]], ptr [[DEST:%.*]], align 4
+; BE-NEXT:    ret void
+;
+  %gep1 = getelementptr inbounds i8, ptr %p, i64 72
+  %ld1 = load i32, ptr %gep1, align 4
+  %ld1_zext = zext i32 %ld1 to i64
+  %ld1_shl = shl nuw i64 %ld1_zext, 32
+  %gep2 = getelementptr inbounds i8, ptr %p, i64 68
+  ; Don't move final_ptr before gep2
+  %final_ptr = bitcast ptr %gep2 to ptr
+  %ld2 = load i32, ptr %final_ptr, align 4
+  %ld2_zext = zext i32 %ld2 to i64
+  %or = or i64 %ld1_shl, %ld2_zext
+  %add = add i64 %or, 0
+  %trunc = trunc i64 %add to i32
+  store i32 %trunc, ptr %dest, align 4
+  ret void
+}
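For completeness, the DominatorTree threaded into foldConsecutiveLoads comes from the pass manager in the caller. A sketch of that wiring with the standard new-pass-manager APIs; the pass name and driver loop here are assumptions for illustration, not the actual AggressiveInstCombine driver:

#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/PassManager.h"

using namespace llvm;

// Stands in for the static helper from the patch; repeated here only so the
// sketch reads as a unit.
bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL,
                          TargetTransformInfo &TTI, AliasAnalysis &AA,
                          const DominatorTree &DT);

namespace {
struct ExamplePass : PassInfoMixin<ExamplePass> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
    // Fetch the analyses the fold needs, including the DominatorTree used
    // for the new dominance check.
    auto &TTI = AM.getResult<TargetIRAnalysis>(F);
    auto &AA = AM.getResult<AAManager>(F);
    auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
    const DataLayout &DL = F.getParent()->getDataLayout();

    bool MadeChange = false;
    for (BasicBlock &BB : F)
      for (Instruction &I : make_early_inc_range(BB))
        MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
    return MadeChange ? PreservedAnalyses::none() : PreservedAnalyses::all();
  }
};
} // namespace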