diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -18,6 +18,7 @@
 #include "llvm/Analysis/Loads.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/PatternMatch.h"
@@ -1601,9 +1602,35 @@
   if (!OtherBr || BBI == OtherBB->begin())
     return false;
 
+  // Decide whether OtherStore can be merged with SI.  Returns
+  // {nullptr, false} when it cannot (null, different pointer operand,
+  // non-castable value type, or different special state),
+  // {OtherStore, false} when the two stores are already the same operation,
+  // and {OtherStore, true} when OtherStore's value must first be
+  // bit-or-pointer cast to SI's value type.
+  auto OtherStoreIsMergeable =
+      [&](StoreInst *OtherStore) -> std::pair<StoreInst *, bool> {
+    if (!OtherStore ||
+        OtherStore->getPointerOperand() != SI.getPointerOperand())
+      return {nullptr, false};
+    if (SI.isSameOperationAs(OtherStore))
+      return {OtherStore, false};
+
+    // A cast is only attempted for single-value, non-vector types on both sides.
+    auto *SIVTy = SI.getValueOperand()->getType();
+    auto *OSVTy = OtherStore->getValueOperand()->getType();
+    if (!SIVTy->isSingleValueType() || SIVTy->isVectorTy() ||
+        !OSVTy->isSingleValueType() || OSVTy->isVectorTy() ||
+        !CastInst::isBitOrNoopPointerCastable(OSVTy, SIVTy, DL) ||
+        !SI.hasSameSpecialState(OtherStore))
+      return {nullptr, false};
+    return {OtherStore, true};
+  };
+
   // If the other block ends in an unconditional branch, check for the 'if then
   // else' case. There is an instruction before the branch.
   StoreInst *OtherStore = nullptr;
+  bool InsertBitcast = false;
   if (OtherBr->isUnconditional()) {
     --BBI;
     // Skip over debugging info and pseudo probes.
@@ -1615,10 +1642,14 @@
     }
     // If this isn't a store, isn't a store to the same location, or is not the
     // right kind of store, bail out.
-    OtherStore = dyn_cast<StoreInst>(BBI);
-    if (!OtherStore || OtherStore->getOperand(1) != SI.getOperand(1) ||
-        !SI.isSameOperationAs(OtherStore))
-      return false;
+    // dyn_cast yields null for a non-store; the helper rejects null, so that
+    // case bails out here instead of leaving OtherStore null.
+    auto [OtherSI, NeedBitcast] =
+        OtherStoreIsMergeable(dyn_cast<StoreInst>(BBI));
+    if (!OtherSI)
+      return false;
+    OtherStore = OtherSI;
+    InsertBitcast = NeedBitcast;
   } else {
     // Otherwise, the other block ended with a conditional branch. If one of the
     // destinations is StoreBB, then we have the if/then case.
@@ -1631,12 +1662,17 @@
     // lives in OtherBB.
     for (;; --BBI) {
       // Check to see if we find the matching store.
-      if ((OtherStore = dyn_cast<StoreInst>(BBI))) {
-        if (OtherStore->getOperand(1) != SI.getOperand(1) ||
-            !SI.isSameOperationAs(OtherStore))
+      if (auto *S = dyn_cast<StoreInst>(BBI)) {
+        auto [OtherSI, NeedBitcast] = OtherStoreIsMergeable(S);
+        if (!OtherSI)
           return false;
+        OtherStore = OtherSI;
+        InsertBitcast = NeedBitcast;
+        // Stop scanning at the matching store; otherwise it falls through to
+        // the bail-out check below and the if/then merge can never happen.
         break;
       }
+
       // If we find something that may be using or overwriting the stored
       // value, or if we run out of instructions, we can't do the transform.
       if (BBI->mayReadFromMemory() || BBI->mayThrow() ||
@@ -1653,6 +1689,23 @@
     }
   }
 
+  if (InsertBitcast) {
+    // Rewrite OtherStore so it stores a value of SI's type.  The cast and the
+    // replacement store are inserted right before the old store (not appended
+    // after the block terminator), and the replacement keeps the old store's
+    // volatility, alignment, ordering, sync scope and debug location.
+    auto *NewVal = CastInst::CreateBitOrPointerCast(
+        OtherStore->getValueOperand(), SI.getValueOperand()->getType(), "",
+        OtherStore);
+    auto *NewOtherStore = new StoreInst(
+        NewVal, OtherStore->getPointerOperand(), OtherStore->isVolatile(),
+        OtherStore->getAlign(), OtherStore->getOrdering(),
+        OtherStore->getSyncScopeID(), OtherStore);
+    NewOtherStore->setDebugLoc(OtherStore->getDebugLoc());
+    eraseInstFromFunction(*OtherStore);
+    OtherStore = NewOtherStore;
+  }
+
   // Insert a PHI node now if we need it.
   Value *MergedVal = OtherStore->getOperand(0);
   // The debug locations of the original instructions might differ. Merge them.
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/merging-stores-into-successor.ll b/llvm/test/Transforms/InstCombine/AMDGPU/merging-stores-into-successor.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AMDGPU/merging-stores-into-successor.ll @@ -0,0 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -S -passes=instcombine -o - %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" +target triple = "amdgcn-amd-amdhsa" + +define ptr @inttoptr_merge(i1 %cond, i64 %a, ptr %b) { +; CHECK-LABEL: define ptr @inttoptr_merge +; CHECK-SAME: (i1 [[COND:%.*]], i64 [[A:%.*]], ptr [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND]], label [[BB0:%.*]], label [[BB1:%.*]] +; CHECK: BB0: +; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i64 [[A]] to ptr +; CHECK-NEXT: br label [[SINK:%.*]] +; CHECK: BB1: +; CHECK-NEXT: br label [[SINK]] +; CHECK: sink: +; CHECK-NEXT: [[STOREMERGE:%.*]] = phi ptr [ [[B]], [[BB1]] ], [ [[TMP0]], [[BB0]] ] +; CHECK-NEXT: ret ptr [[STOREMERGE]] +; +entry: + %alloca = alloca ptr + br i1 %cond, label %BB0, label %BB1 +BB0: + store i64 %a, ptr %alloca + br label %sink +BB1: + store ptr %b, ptr %alloca + br label %sink +sink: + %val = load ptr, ptr %alloca + ret ptr %val +} diff --git a/llvm/test/Transforms/InstCombine/merging-multiple-stores-into-successor.ll b/llvm/test/Transforms/InstCombine/merging-multiple-stores-into-successor.ll --- a/llvm/test/Transforms/InstCombine/merging-multiple-stores-into-successor.ll +++ b/llvm/test/Transforms/InstCombine/merging-multiple-stores-into-successor.ll @@ -71,3 +71,205 @@ bb12: ; preds = %bb10, %bb9 ret void } + +define half @diff_types_same_width_merge(i1 %cond, half %a, i16 %b) { +; CHECK-LABEL: @diff_types_same_width_merge( +; 
CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB0:%.*]], label [[BB1:%.*]] +; CHECK: BB0: +; CHECK-NEXT: br label [[SINK:%.*]] +; CHECK: BB1: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16 [[B:%.*]] to half +; CHECK-NEXT: br label [[SINK]] +; CHECK: sink: +; CHECK-NEXT: [[STOREMERGE:%.*]] = phi half [ [[TMP0]], [[BB1]] ], [ [[A:%.*]], [[BB0]] ] +; CHECK-NEXT: ret half [[STOREMERGE]] +; +entry: + %alloca = alloca half + br i1 %cond, label %BB0, label %BB1 +BB0: + store half %a, ptr %alloca + br label %sink +BB1: + store i16 %b, ptr %alloca + br label %sink +sink: + %val = load half, ptr %alloca + ret half %val +} + +define i32 @diff_types_diff_width_no_merge(i1 %cond, i32 %a, i64 %b) { +; CHECK-LABEL: @diff_types_diff_width_no_merge( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i64, align 8 +; CHECK-NEXT: br i1 [[COND:%.*]], label [[A:%.*]], label [[B:%.*]] +; CHECK: A: +; CHECK-NEXT: store i32 [[A:%.*]], ptr [[ALLOCA]], align 8 +; CHECK-NEXT: br label [[SINK:%.*]] +; CHECK: B: +; CHECK-NEXT: store i64 [[B:%.*]], ptr [[ALLOCA]], align 8 +; CHECK-NEXT: br label [[SINK]] +; CHECK: sink: +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ALLOCA]], align 8 +; CHECK-NEXT: ret i32 [[VAL]] +; +entry: + %alloca = alloca i64 + br i1 %cond, label %A, label %B +A: + store i32 %a, ptr %alloca + br label %sink +B: + store i64 %b, ptr %alloca + br label %sink +sink: + %val = load i32, ptr %alloca + ret i32 %val +} + +define <4 x i32> @vec_no_merge(i1 %cond, <2 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: @vec_no_merge( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i64, align 16 +; CHECK-NEXT: br i1 [[COND:%.*]], label [[A:%.*]], label [[B:%.*]] +; CHECK: A: +; CHECK-NEXT: store <2 x i32> [[A:%.*]], ptr [[ALLOCA]], align 16 +; CHECK-NEXT: br label [[SINK:%.*]] +; CHECK: B: +; CHECK-NEXT: store <4 x i32> [[B:%.*]], ptr [[ALLOCA]], align 16 +; CHECK-NEXT: br label [[SINK]] +; CHECK: sink: +; CHECK-NEXT: [[VAL:%.*]] = load <4 x i32>, ptr [[ALLOCA]], 
align 16 +; CHECK-NEXT: ret <4 x i32> [[VAL]] +; +entry: + %alloca = alloca i64 + br i1 %cond, label %A, label %B +A: + store <2 x i32> %a, ptr %alloca + br label %sink +B: + store <4 x i32> %b, ptr %alloca + br label %sink +sink: + %val = load <4 x i32>, ptr %alloca + ret <4 x i32> %val +} + +%struct.half = type { half }; + +define %struct.half @one_elem_struct_merge(i1 %cond, %struct.half %a, half %b) { +; CHECK-LABEL: @one_elem_struct_merge( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB0:%.*]], label [[BB1:%.*]] +; CHECK: BB0: +; CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_HALF:%.*]] [[A:%.*]], 0 +; CHECK-NEXT: br label [[SINK:%.*]] +; CHECK: BB1: +; CHECK-NEXT: br label [[SINK]] +; CHECK: sink: +; CHECK-NEXT: [[STOREMERGE:%.*]] = phi half [ [[TMP0]], [[BB0]] ], [ [[B:%.*]], [[BB1]] ] +; CHECK-NEXT: [[VAL1:%.*]] = insertvalue [[STRUCT_HALF]] poison, half [[STOREMERGE]], 0 +; CHECK-NEXT: ret [[STRUCT_HALF]] [[VAL1]] +; +entry: + %alloca = alloca i64 + br i1 %cond, label %BB0, label %BB1 +BB0: + store %struct.half %a, ptr %alloca + br label %sink +BB1: + store half %b, ptr %alloca + br label %sink +sink: + %val = load %struct.half, ptr %alloca + ret %struct.half %val +} + +%struct.tup = type { half, i32 }; + +define %struct.tup @multi_elem_struct_no_merge(i1 %cond, %struct.tup %a, half %b) { +; CHECK-LABEL: @multi_elem_struct_no_merge( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i64, align 8 +; CHECK-NEXT: br i1 [[COND:%.*]], label [[A:%.*]], label [[B:%.*]] +; CHECK: A: +; CHECK-NEXT: store [[STRUCT_TUP:%.*]] [[A:%.*]], ptr [[ALLOCA]], align 8 +; CHECK-NEXT: br label [[SINK:%.*]] +; CHECK: B: +; CHECK-NEXT: store half [[B:%.*]], ptr [[ALLOCA]], align 8 +; CHECK-NEXT: br label [[SINK]] +; CHECK: sink: +; CHECK-NEXT: [[VAL:%.*]] = load [[STRUCT_TUP]], ptr [[ALLOCA]], align 8 +; CHECK-NEXT: ret [[STRUCT_TUP]] [[VAL]] +; +entry: + %alloca = alloca i64 + br i1 %cond, label %A, label %B +A: + store %struct.tup %a, ptr %alloca 
+ br label %sink +B: + store half %b, ptr %alloca + br label %sink +sink: + %val = load %struct.tup, ptr %alloca + ret %struct.tup %val +} + +define i16 @same_types_diff_align_no_merge(i1 %cond, i16 %a, i16 %b) { +; CHECK-LABEL: @same_types_diff_align_no_merge( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i16, align 2 +; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB0:%.*]], label [[BB1:%.*]] +; CHECK: BB0: +; CHECK-NEXT: store i16 [[A:%.*]], ptr [[ALLOCA]], align 2 +; CHECK-NEXT: br label [[SINK:%.*]] +; CHECK: BB1: +; CHECK-NEXT: store i16 [[B:%.*]], ptr [[ALLOCA]], align 4 +; CHECK-NEXT: br label [[SINK]] +; CHECK: sink: +; CHECK-NEXT: [[VAL:%.*]] = load i16, ptr [[ALLOCA]], align 2 +; CHECK-NEXT: ret i16 [[VAL]] +; +entry: + %alloca = alloca i16 + br i1 %cond, label %BB0, label %BB1 +BB0: + store i16 %a, ptr %alloca, align 2 + br label %sink +BB1: + store i16 %b, ptr %alloca, align 4 + br label %sink +sink: + %val = load i16, ptr %alloca + ret i16 %val +} + +define i64 @ptrtoint_merge(i1 %cond, i64 %a, ptr %b) { +; CHECK-LABEL: @ptrtoint_merge( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB0:%.*]], label [[BB1:%.*]] +; CHECK: BB0: +; CHECK-NEXT: br label [[SINK:%.*]] +; CHECK: BB1: +; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: br label [[SINK]] +; CHECK: sink: +; CHECK-NEXT: [[STOREMERGE:%.*]] = phi i64 [ [[A:%.*]], [[BB0]] ], [ [[TMP0]], [[BB1]] ] +; CHECK-NEXT: ret i64 [[STOREMERGE]] +; +entry: + %alloca = alloca ptr + br i1 %cond, label %BB0, label %BB1 +BB0: + store i64 %a, ptr %alloca + br label %sink +BB1: + store ptr %b, ptr %alloca + br label %sink +sink: + %val = load i64, ptr %alloca + ret i64 %val +} diff --git a/llvm/test/Transforms/InstCombine/store.ll b/llvm/test/Transforms/InstCombine/store.ll --- a/llvm/test/Transforms/InstCombine/store.ll +++ b/llvm/test/Transforms/InstCombine/store.ll @@ -115,12 +115,15 @@ ; "if then" define i32 @test4(i1 %C) { ; CHECK-LABEL: @test4( +; CHECK-NEXT: 
[[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 47, ptr [[A]], align 4 ; CHECK-NEXT: br i1 [[C:%.*]], label [[COND:%.*]], label [[CONT:%.*]] ; CHECK: Cond: +; CHECK-NEXT: store i32 -987654321, ptr [[A]], align 4 ; CHECK-NEXT: br label [[CONT]] ; CHECK: Cont: -; CHECK-NEXT: [[STOREMERGE:%.*]] = phi i32 [ -987654321, [[COND]] ], [ 47, [[TMP0:%.*]] ] -; CHECK-NEXT: ret i32 [[STOREMERGE]] +; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: ret i32 [[V]] ; %A = alloca i32 store i32 47, ptr %A @@ -138,12 +141,12 @@ ; "if then" define void @test5(i1 %C, ptr %P) { ; CHECK-LABEL: @test5( +; CHECK-NEXT: store i32 47, ptr [[P:%.*]], align 1 ; CHECK-NEXT: br i1 [[C:%.*]], label [[COND:%.*]], label [[CONT:%.*]] ; CHECK: Cond: +; CHECK-NEXT: store i32 -987654321, ptr [[P]], align 1 ; CHECK-NEXT: br label [[CONT]] ; CHECK: Cont: -; CHECK-NEXT: [[STOREMERGE:%.*]] = phi i32 [ -987654321, [[COND]] ], [ 47, [[TMP0:%.*]] ] -; CHECK-NEXT: store i32 [[STOREMERGE]], ptr [[P:%.*]], align 1 ; CHECK-NEXT: ret void ; store i32 47, ptr %P, align 1 diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll --- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll @@ -499,14 +499,15 @@ ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VAR2:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP2]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i32 [[ADD]], 42 ; CHECK-NEXT: br i1 [[TMP3]], label [[COND_STORE:%.*]], label [[LATCH]] ; CHECK: cond_store: ; CHECK-NEXT: [[TMP4:%.*]] = add nsw i32 [[ADD]], 1 +; CHECK-NEXT: store i32 [[TMP4]], ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: 
[[TMP5]] = phi i32 [ [[TMP4]], [[COND_STORE]] ], [ [[ADD]], [[FOR_BODY3]] ] -; CHECK-NEXT: store i32 [[TMP5]], ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[ITR]]