diff --git a/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/llvm/lib/Transforms/Scalar/MergeICmps.cpp --- a/llvm/lib/Transforms/Scalar/MergeICmps.cpp +++ b/llvm/lib/Transforms/Scalar/MergeICmps.cpp @@ -239,11 +239,13 @@ // If this instruction may clobber the loads and is in middle of the BCE cmp // block instructions, then bail for now. if (Inst->mayWriteToMemory()) { - // Disallow instructions that might modify the BCE operands - MemoryLocation LLoc = MemoryLocation::get(Cmp.Lhs.LoadI); - MemoryLocation RLoc = MemoryLocation::get(Cmp.Rhs.LoadI); - if (isModSet(AA.getModRefInfo(Inst, LLoc)) || - isModSet(AA.getModRefInfo(Inst, RLoc))) + auto MayClobber = [&](LoadInst *LI) { + // If a potentially clobbering instruction comes before the load, + // we can still safely sink the load. + return !Inst->comesBefore(LI) && + isModSet(AA.getModRefInfo(Inst, MemoryLocation::get(LI))); + }; + if (MayClobber(Cmp.Lhs.LoadI) || MayClobber(Cmp.Rhs.LoadI)) return false; } // Make sure this instruction does not use any of the BCE cmp block diff --git a/llvm/test/Transforms/MergeICmps/X86/gep-references-bb.ll b/llvm/test/Transforms/MergeICmps/X86/gep-references-bb.ll --- a/llvm/test/Transforms/MergeICmps/X86/gep-references-bb.ll +++ b/llvm/test/Transforms/MergeICmps/X86/gep-references-bb.ll @@ -11,12 +11,12 @@ define i1 @bug(%Triple* nonnull dereferenceable(16) %lhs, %Triple* nonnull dereferenceable(16) %rhs) nofree nosync { ; CHECK-LABEL: @bug( ; CHECK-NEXT: bb0: -; CHECK-NEXT: store i32 1, i32* @g, align 4 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr [[TRIPLE:%.*]], %Triple* [[RHS:%.*]], i64 0, i32 0 ; CHECK-NEXT: [[L0_ADDR:%.*]] = getelementptr inbounds [[TRIPLE]], %Triple* [[LHS:%.*]], i64 0, i32 0 ; CHECK-NEXT: [[L0:%.*]] = load i32, i32* [[L0_ADDR]], align 4 ; CHECK-NEXT: [[R0_ADDR:%.*]] = getelementptr inbounds [[TRIPLE]], %Triple* [[RHS]], i64 0, i32 0 ; CHECK-NEXT: [[R0:%.*]] = load i32, i32* [[R0_ADDR]], align 4 +; CHECK-NEXT: store i32 1, i32* @g, align 4 ; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i32 [[L0]], [[R0]] ; CHECK-NEXT: br i1 [[CMP0]], label %"bb1+bb2", label [[FINAL:%.*]] ; CHECK: "bb1+bb2": @@ -32,12 +32,12 @@ ; CHECK-NEXT: ret i1 [[RET]] ; bb0: - store i32 1, i32* @g %gep = getelementptr %Triple, %Triple* %rhs, i64 0, i32 0 %l0_addr = getelementptr inbounds %Triple, %Triple* %lhs, i64 0, i32 0 %l0 = load i32, i32* %l0_addr, align 4 %r0_addr = getelementptr inbounds %Triple, %Triple* %rhs, i64 0, i32 0 %r0 = load i32, i32* %r0_addr, align 4 + store i32 1, i32* @g %cmp0 = icmp eq i32 %l0, %r0 br i1 %cmp0, label %bb1, label %final diff --git a/llvm/test/Transforms/MergeICmps/X86/split-block-does-work.ll b/llvm/test/Transforms/MergeICmps/X86/split-block-does-work.ll --- a/llvm/test/Transforms/MergeICmps/X86/split-block-does-work.ll +++ b/llvm/test/Transforms/MergeICmps/X86/split-block-does-work.ll @@ -141,28 +141,20 @@ ret i1 %8 } -; TODO: The call happen before the loads, so it cannot clobber them. +; The call happens before the loads, so it cannot clobber them. define zeroext i1 @opeq1_call_before_loads( ; X86-LABEL: @opeq1_call_before_loads( -; X86-NEXT: entry: +; X86-NEXT: "entry+land.rhs.i+land.rhs.i.2+land.rhs.i.3": ; X86-NEXT: call void (...) @foo() -; X86-NEXT: [[FIRST_I:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 -; X86-NEXT: [[TMP0:%.*]] = load i32, i32* [[FIRST_I]], align 4 -; X86-NEXT: [[FIRST1_I:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 -; X86-NEXT: [[TMP1:%.*]] = load i32, i32* [[FIRST1_I]], align 4 -; X86-NEXT: [[CMP_I:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]] -; X86-NEXT: br i1 [[CMP_I]], label %"land.rhs.i+land.rhs.i.2+land.rhs.i.3", label [[OPEQ1_EXIT:%.*]] -; X86: "land.rhs.i+land.rhs.i.2+land.rhs.i.3": -; X86-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 1 -; X86-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 1 -; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[TMP2]] to i8* -; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[TMP3]] to i8* -; X86-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 12) -; X86-NEXT: [[TMP4:%.*]] = icmp eq i32 [[MEMCMP]], 0 -; X86-NEXT: br label [[OPEQ1_EXIT]] +; X86-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 +; X86-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 +; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[TMP0]] to i8* +; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[TMP1]] to i8* +; X86-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 16) +; X86-NEXT: [[TMP2:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; X86-NEXT: br label [[OPEQ1_EXIT:%.*]] ; X86: opeq1.exit: -; X86-NEXT: [[TMP5:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[TMP4]], %"land.rhs.i+land.rhs.i.2+land.rhs.i.3" ] -; X86-NEXT: ret i1 [[TMP5]] +; X86-NEXT: ret i1 [[TMP2]] ; ; Make sure this call is moved to the beginning of the entry block. %S* nocapture readonly dereferenceable(16) %a,