diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/2011-03-25-DSEMiscompile.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/2011-03-25-DSEMiscompile.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/2011-03-25-DSEMiscompile.ll @@ -0,0 +1,23 @@ +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s +; PR9561 +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" +target triple = "i386-apple-darwin9.8" + +@A = external global [0 x i32] + +declare ghccc void @Func2(i32*, i32*, i32*, i32) + +define ghccc void @Func1(i32* noalias %Arg1, i32* noalias %Arg2, i32* %Arg3, i32 %Arg4) { +entry: + store i32 add (i32 ptrtoint ([0 x i32]* @A to i32), i32 1), i32* %Arg2 +; CHECK: store i32 add (i32 ptrtoint ([0 x i32]* @A to i32), i32 1), i32* %Arg2 + %ln2gz = getelementptr i32, i32* %Arg1, i32 14 + %ln2gA = bitcast i32* %ln2gz to double* + %ln2gB = load double, double* %ln2gA + %ln2gD = getelementptr i32, i32* %Arg2, i32 -3 + %ln2gE = bitcast i32* %ln2gD to double* + store double %ln2gB, double* %ln2gE +; CHECK: store double %ln2gB, double* %ln2gE + tail call ghccc void @Func2(i32* %Arg1, i32* %Arg2, i32* %Arg3, i32 %Arg4) nounwind + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/2011-09-06-EndOfFunction.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/2011-09-06-EndOfFunction.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/2011-09-06-EndOfFunction.ll @@ -0,0 +1,24 @@ +; XFAIL: * +; RUN: opt -dse -enable-dse-memoryssa -S < %s | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-apple-darwin" + +%"class.std::auto_ptr" = type { i32* } + +; CHECK-LABEL: @_Z3foov( +define void 
@_Z3foov(%"class.std::auto_ptr"* noalias nocapture sret %agg.result) uwtable ssp { +_ZNSt8auto_ptrIiED1Ev.exit: + %temp.lvalue = alloca %"class.std::auto_ptr", align 8 + call void @_Z3barv(%"class.std::auto_ptr"* sret %temp.lvalue) + %_M_ptr.i.i = getelementptr inbounds %"class.std::auto_ptr", %"class.std::auto_ptr"* %temp.lvalue, i64 0, i32 0 + %tmp.i.i = load i32*, i32** %_M_ptr.i.i, align 8 +; CHECK-NOT: store i32* null + store i32* null, i32** %_M_ptr.i.i, align 8 + %_M_ptr.i.i4 = getelementptr inbounds %"class.std::auto_ptr", %"class.std::auto_ptr"* %agg.result, i64 0, i32 0 + store i32* %tmp.i.i, i32** %_M_ptr.i.i4, align 8 +; CHECK: ret void + ret void +} + +declare void @_Z3barv(%"class.std::auto_ptr"* sret) diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/2011-09-06-MemCpy.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/2011-09-06-MemCpy.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/2011-09-06-MemCpy.ll @@ -0,0 +1,85 @@ +; RUN: opt -dse -enable-dse-memoryssa -S < %s | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +%struct.pair.162 = type { %struct.BasicBlock*, i32, [4 x i8] } +%struct.BasicBlock = type { %struct.Value, %struct.ilist_node.24, %struct.iplist.22, %struct.Function* } +%struct.Value = type { i32 (...)**, i8, i8, i16, %struct.Type*, %struct.Use*, %struct.StringMapEntry* } +%struct.Type = type { %struct.LLVMContext*, i8, [3 x i8], i32, {}* } +%struct.LLVMContext = type { %struct.LLVMContextImpl* } +%struct.LLVMContextImpl = type opaque +%struct.Use = type { %struct.Value*, %struct.Use*, %struct.PointerIntPair } +%struct.PointerIntPair = type { i64 } +%struct.StringMapEntry = type opaque +%struct.ilist_node.24 = type { %struct.ilist_half_node.23, %struct.BasicBlock* } +%struct.ilist_half_node.23 = type 
{ %struct.BasicBlock* } +%struct.iplist.22 = type { %struct.ilist_traits.21, %struct.Instruction* } +%struct.ilist_traits.21 = type { %struct.ilist_half_node.25 } +%struct.ilist_half_node.25 = type { %struct.Instruction* } +%struct.Instruction = type { [52 x i8], %struct.ilist_node.26, %struct.BasicBlock*, %struct.DebugLoc } +%struct.ilist_node.26 = type { %struct.ilist_half_node.25, %struct.Instruction* } +%struct.DebugLoc = type { i32, i32 } +%struct.Function = type { %struct.GlobalValue, %struct.ilist_node.14, %struct.iplist.4, %struct.iplist, %struct.ValueSymbolTable*, %struct.AttrListPtr } +%struct.GlobalValue = type <{ [52 x i8], [4 x i8], %struct.Module*, i8, i16, [5 x i8], %struct.basic_string }> +%struct.Module = type { %struct.LLVMContext*, %struct.iplist.20, %struct.iplist.16, %struct.iplist.12, %struct.vector.2, %struct.ilist, %struct.basic_string, %struct.ValueSymbolTable*, %struct.OwningPtr, %struct.basic_string, %struct.basic_string, %struct.basic_string, i8* } +%struct.iplist.20 = type { %struct.ilist_traits.19, %struct.GlobalVariable* } +%struct.ilist_traits.19 = type { %struct.ilist_node.18 } +%struct.ilist_node.18 = type { %struct.ilist_half_node.17, %struct.GlobalVariable* } +%struct.ilist_half_node.17 = type { %struct.GlobalVariable* } +%struct.GlobalVariable = type { %struct.GlobalValue, %struct.ilist_node.18, i8, [7 x i8] } +%struct.iplist.16 = type { %struct.ilist_traits.15, %struct.Function* } +%struct.ilist_traits.15 = type { %struct.ilist_node.14 } +%struct.ilist_node.14 = type { %struct.ilist_half_node.13, %struct.Function* } +%struct.ilist_half_node.13 = type { %struct.Function* } +%struct.iplist.12 = type { %struct.ilist_traits.11, %struct.GlobalAlias* } +%struct.ilist_traits.11 = type { %struct.ilist_node.10 } +%struct.ilist_node.10 = type { %struct.ilist_half_node.9, %struct.GlobalAlias* } +%struct.ilist_half_node.9 = type { %struct.GlobalAlias* } +%struct.GlobalAlias = type { %struct.GlobalValue, %struct.ilist_node.10 } 
+%struct.vector.2 = type { %struct._Vector_base.1 } +%struct._Vector_base.1 = type { %struct._Vector_impl.0 } +%struct._Vector_impl.0 = type { %struct.basic_string*, %struct.basic_string*, %struct.basic_string* } +%struct.basic_string = type { %struct._Alloc_hider } +%struct._Alloc_hider = type { i8* } +%struct.ilist = type { %struct.iplist.8 } +%struct.iplist.8 = type { %struct.ilist_traits.7, %struct.NamedMDNode* } +%struct.ilist_traits.7 = type { %struct.ilist_node.6 } +%struct.ilist_node.6 = type { %struct.ilist_half_node.5, %struct.NamedMDNode* } +%struct.ilist_half_node.5 = type { %struct.NamedMDNode* } +%struct.NamedMDNode = type { %struct.ilist_node.6, %struct.basic_string, %struct.Module*, i8* } +%struct.ValueSymbolTable = type opaque +%struct.OwningPtr = type { %struct.GVMaterializer* } +%struct.GVMaterializer = type opaque +%struct.iplist.4 = type { %struct.ilist_traits.3, %struct.BasicBlock* } +%struct.ilist_traits.3 = type { %struct.ilist_half_node.23 } +%struct.iplist = type { %struct.ilist_traits, %struct.Argument* } +%struct.ilist_traits = type { %struct.ilist_half_node } +%struct.ilist_half_node = type { %struct.Argument* } +%struct.Argument = type { %struct.Value, %struct.ilist_node, %struct.Function* } +%struct.ilist_node = type { %struct.ilist_half_node, %struct.Argument* } +%struct.AttrListPtr = type { %struct.AttributeListImpl* } +%struct.AttributeListImpl = type opaque + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind + +; CHECK: _ZSt9iter_swapIPSt4pairIPN4llvm10BasicBlockEjES5_EvT_T0_ +; CHECK: store +; CHECK: ret void +define void @_ZSt9iter_swapIPSt4pairIPN4llvm10BasicBlockEjES5_EvT_T0_(%struct.pair.162* %__a, %struct.pair.162* %__b) nounwind uwtable inlinehint { +entry: + %memtmp = alloca %struct.pair.162, align 8 + %0 = getelementptr inbounds %struct.pair.162, %struct.pair.162* %memtmp, i64 0, i32 0 + %1 = getelementptr inbounds %struct.pair.162, %struct.pair.162* %__a, i64 0, i32 0 + %2 = load 
%struct.BasicBlock*, %struct.BasicBlock** %1, align 8 + store %struct.BasicBlock* %2, %struct.BasicBlock** %0, align 8 + %3 = getelementptr inbounds %struct.pair.162, %struct.pair.162* %memtmp, i64 0, i32 1 + %4 = getelementptr inbounds %struct.pair.162, %struct.pair.162* %__a, i64 0, i32 1 + %5 = load i32, i32* %4, align 4 + store i32 %5, i32* %3, align 8 + %6 = bitcast %struct.pair.162* %__a to i8* + %7 = bitcast %struct.pair.162* %__b to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* %7, i64 12, i1 false) + %8 = bitcast %struct.pair.162* %memtmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %7, i8* %8, i64 12, i1 false) + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/2016-07-17-UseAfterFree.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/2016-07-17-UseAfterFree.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/2016-07-17-UseAfterFree.ll @@ -0,0 +1,33 @@ +; XFAIL: * +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S -enable-dse-partial-overwrite-tracking | FileCheck %s +; PR28588 + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind +define void @_UPT_destroy(i8* nocapture %ptr) local_unnamed_addr #0 { +entry: + %edi = getelementptr inbounds i8, i8* %ptr, i64 8 + +; CHECK-NOT: tail call void @llvm.memset.p0i8.i64(i8* align 8 %edi, i8 0, i64 176, i1 false) +; CHECK-NOT: store i32 -1, i32* %addr + + tail call void @llvm.memset.p0i8.i64(i8* align 8 %edi, i8 0, i64 176, i1 false) + %format4.i = getelementptr inbounds i8, i8* %ptr, i64 144 + %addr = bitcast i8* %format4.i to i32* + store i32 -1, i32* %addr, align 8 + +; CHECK: tail call void @free + tail call void @free(i8* nonnull %ptr) + ret void +} + +; Function Attrs: nounwind +declare void @free(i8* nocapture) local_unnamed_addr #0 + +; Function Attrs: argmemonly nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1)
#1 + +attributes #0 = { nounwind } +attributes #1 = { argmemonly nounwind } diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreBegin.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreBegin.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreBegin.ll @@ -0,0 +1,394 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; XFAIL: * +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s + +define void @write4to7(i32* nocapture %p) { +; CHECK-LABEL: @write4to7( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i1 false) +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 +; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: ret void +; +entry: + %arrayidx0 = getelementptr inbounds i32, i32* %p, i64 1 + %p3 = bitcast i32* %arrayidx0 to i8* + call void @llvm.memset.p0i8.i64(i8* align 4 %p3, i8 0, i64 28, i1 false) + %arrayidx1 = getelementptr inbounds i32, i32* %p, i64 1 + store i32 1, i32* %arrayidx1, align 4 + ret void +} + +define void @write4to7_atomic(i32* nocapture %p) { +; CHECK-LABEL: @write4to7_atomic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4 +; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4) +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 +; CHECK-NEXT: store atomic i32 1, i32* [[ARRAYIDX1]] unordered, align 4 +; 
CHECK-NEXT: ret void +; +entry: + %arrayidx0 = getelementptr inbounds i32, i32* %p, i64 1 + %p3 = bitcast i32* %arrayidx0 to i8* + call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 %p3, i8 0, i64 28, i32 4) + %arrayidx1 = getelementptr inbounds i32, i32* %p, i64 1 + store atomic i32 1, i32* %arrayidx1 unordered, align 4 + ret void +} + +define void @write0to3(i32* nocapture %p) { +; CHECK-LABEL: @write0to3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8* +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i1 false) +; CHECK-NEXT: store i32 1, i32* [[P]], align 4 +; CHECK-NEXT: ret void +; +entry: + %p3 = bitcast i32* %p to i8* + call void @llvm.memset.p0i8.i64(i8* align 4 %p3, i8 0, i64 28, i1 false) + store i32 1, i32* %p, align 4 + ret void +} + +define void @write0to3_atomic(i32* nocapture %p) { +; CHECK-LABEL: @write0to3_atomic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8* +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4 +; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4) +; CHECK-NEXT: store atomic i32 1, i32* [[P]] unordered, align 4 +; CHECK-NEXT: ret void +; +entry: + %p3 = bitcast i32* %p to i8* + call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 %p3, i8 0, i64 28, i32 4) + store atomic i32 1, i32* %p unordered, align 4 + ret void +} + +; Atomicity of the store is weaker than that of the memset +define void @write0to3_atomic_weaker(i32* nocapture %p) { +; CHECK-LABEL: @write0to3_atomic_weaker( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8* +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4 +; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4) +;
CHECK-NEXT: store i32 1, i32* [[P]], align 4 +; CHECK-NEXT: ret void +; +entry: + %p3 = bitcast i32* %p to i8* + call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 %p3, i8 0, i64 28, i32 4) + store i32 1, i32* %p, align 4 + ret void +} + +define void @write0to7(i32* nocapture %p) { +; CHECK-LABEL: @write0to7( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8* +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 8 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i1 false) +; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i64* +; CHECK-NEXT: store i64 1, i64* [[P4]], align 8 +; CHECK-NEXT: ret void +; +entry: + %p3 = bitcast i32* %p to i8* + call void @llvm.memset.p0i8.i64(i8* align 4 %p3, i8 0, i64 32, i1 false) + %p4 = bitcast i32* %p to i64* + store i64 1, i64* %p4, align 8 + ret void +} + +; Changing the memset start and length is okay here because the +; store is a multiple of the memset element size +define void @write0to7_atomic(i32* nocapture %p) { +; CHECK-LABEL: @write0to7_atomic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8* +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 8 +; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4) +; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i64* +; CHECK-NEXT: store atomic i64 1, i64* [[P4]] unordered, align 8 +; CHECK-NEXT: ret void +; +entry: + %p3 = bitcast i32* %p to i8* + call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 %p3, i8 0, i64 32, i32 4) + %p4 = bitcast i32* %p to i64* + store atomic i64 1, i64* %p4 unordered, align 8 + ret void +} + +define void @write0to7_2(i32* nocapture %p) { +; CHECK-LABEL: @write0to7_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to 
i8* +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i1 false) +; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i64* +; CHECK-NEXT: store i64 1, i64* [[P4]], align 8 +; CHECK-NEXT: ret void +; +entry: + %arrayidx0 = getelementptr inbounds i32, i32* %p, i64 1 + %p3 = bitcast i32* %arrayidx0 to i8* + call void @llvm.memset.p0i8.i64(i8* align 4 %p3, i8 0, i64 28, i1 false) + %p4 = bitcast i32* %p to i64* + store i64 1, i64* %p4, align 8 + ret void +} + +define void @write0to7_2_atomic(i32* nocapture %p) { +; CHECK-LABEL: @write0to7_2_atomic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4 +; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4) +; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i64* +; CHECK-NEXT: store atomic i64 1, i64* [[P4]] unordered, align 8 +; CHECK-NEXT: ret void +; +entry: + %arrayidx0 = getelementptr inbounds i32, i32* %p, i64 1 + %p3 = bitcast i32* %arrayidx0 to i8* + call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 %p3, i8 0, i64 28, i32 4) + %p4 = bitcast i32* %p to i64* + store atomic i64 1, i64* %p4 unordered, align 8 + ret void +} + +; We do not trim the beginning of the earlier write if the alignment of the +; start pointer is changed.
+define void @dontwrite0to3_align8(i32* nocapture %p) { +; CHECK-LABEL: @dontwrite0to3_align8( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8* +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[P3]], i8 0, i64 32, i1 false) +; CHECK-NEXT: store i32 1, i32* [[P]], align 4 +; CHECK-NEXT: ret void +; +entry: + %p3 = bitcast i32* %p to i8* + call void @llvm.memset.p0i8.i64(i8* align 8 %p3, i8 0, i64 32, i1 false) + store i32 1, i32* %p, align 4 + ret void +} + +define void @dontwrite0to3_align8_atomic(i32* nocapture %p) { +; CHECK-LABEL: @dontwrite0to3_align8_atomic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8* +; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[P3]], i8 0, i64 32, i32 4) +; CHECK-NEXT: store atomic i32 1, i32* [[P]] unordered, align 4 +; CHECK-NEXT: ret void +; +entry: + %p3 = bitcast i32* %p to i8* + call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 %p3, i8 0, i64 32, i32 4) + store atomic i32 1, i32* %p unordered, align 4 + ret void +} + +define void @dontwrite0to1(i32* nocapture %p) { +; CHECK-LABEL: @dontwrite0to1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8* +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 32, i1 false) +; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i16* +; CHECK-NEXT: store i16 1, i16* [[P4]], align 4 +; CHECK-NEXT: ret void +; +entry: + %p3 = bitcast i32* %p to i8* + call void @llvm.memset.p0i8.i64(i8* align 4 %p3, i8 0, i64 32, i1 false) + %p4 = bitcast i32* %p to i16* + store i16 1, i16* %p4, align 4 + ret void +} + +define void @dontwrite0to1_atomic(i32* nocapture %p) { +; CHECK-LABEL: @dontwrite0to1_atomic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8* +; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 32, i32 4) +; CHECK-NEXT: [[P4:%.*]] = bitcast 
i32* [[P]] to i16* +; CHECK-NEXT: store atomic i16 1, i16* [[P4]] unordered, align 4 +; CHECK-NEXT: ret void +; +entry: + %p3 = bitcast i32* %p to i8* + call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 %p3, i8 0, i64 32, i32 4) + %p4 = bitcast i32* %p to i16* + store atomic i16 1, i16* %p4 unordered, align 4 + ret void +} + +define void @dontwrite2to9(i32* nocapture %p) { +; CHECK-LABEL: @dontwrite2to9( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 32, i1 false) +; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i16* +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[P4]], i64 1 +; CHECK-NEXT: [[P5:%.*]] = bitcast i16* [[ARRAYIDX2]] to i64* +; CHECK-NEXT: store i64 1, i64* [[P5]], align 8 +; CHECK-NEXT: ret void +; +entry: + %arrayidx0 = getelementptr inbounds i32, i32* %p, i64 1 + %p3 = bitcast i32* %arrayidx0 to i8* + call void @llvm.memset.p0i8.i64(i8* align 4 %p3, i8 0, i64 32, i1 false) + %p4 = bitcast i32* %p to i16* + %arrayidx2 = getelementptr inbounds i16, i16* %p4, i64 1 + %p5 = bitcast i16* %arrayidx2 to i64* + store i64 1, i64* %p5, align 8 + ret void +} + +define void @dontwrite2to9_atomic(i32* nocapture %p) { +; CHECK-LABEL: @dontwrite2to9_atomic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* +; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 32, i32 4) +; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i16* +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[P4]], i64 1 +; CHECK-NEXT: [[P5:%.*]] = bitcast i16* [[ARRAYIDX2]] to i64* +; CHECK-NEXT: store atomic i64 1, i64* [[P5]] unordered, align 8 +; CHECK-NEXT: ret void +; 
+entry: + %arrayidx0 = getelementptr inbounds i32, i32* %p, i64 1 + %p3 = bitcast i32* %arrayidx0 to i8* + call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 %p3, i8 0, i64 32, i32 4) + %p4 = bitcast i32* %p to i16* + %arrayidx2 = getelementptr inbounds i16, i16* %p4, i64 1 + %p5 = bitcast i16* %arrayidx2 to i64* + store atomic i64 1, i64* %p5 unordered, align 8 + ret void +} + +define void @write8To15AndThen0To7(i64* nocapture %P) { +; CHECK-LABEL: @write8To15AndThen0To7( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8* +; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[MYBASE0]], i64 16 +; CHECK-NEXT: tail call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 16, i1 false) +; CHECK-NEXT: [[BASE64_0:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 0 +; CHECK-NEXT: [[BASE64_1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 1 +; CHECK-NEXT: store i64 1, i64* [[BASE64_1]] +; CHECK-NEXT: store i64 2, i64* [[BASE64_0]] +; CHECK-NEXT: ret void +; +entry: + + %base0 = bitcast i64* %P to i8* + %mybase0 = getelementptr inbounds i8, i8* %base0, i64 0 + tail call void @llvm.memset.p0i8.i64(i8* align 8 %mybase0, i8 0, i64 32, i1 false) + + %base64_0 = getelementptr inbounds i64, i64* %P, i64 0 + %base64_1 = getelementptr inbounds i64, i64* %P, i64 1 + + store i64 1, i64* %base64_1 + store i64 2, i64* %base64_0 + ret void +} + +define void @write8To15AndThen0To7_atomic(i64* nocapture %P) { +; CHECK-LABEL: @write8To15AndThen0To7_atomic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8* +; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[MYBASE0]], i64 16 +; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 16, i32 8) +; CHECK-NEXT: 
[[BASE64_0:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 0 +; CHECK-NEXT: [[BASE64_1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 1 +; CHECK-NEXT: store atomic i64 1, i64* [[BASE64_1]] unordered, align 8 +; CHECK-NEXT: store atomic i64 2, i64* [[BASE64_0]] unordered, align 8 +; CHECK-NEXT: ret void +; +entry: + + %base0 = bitcast i64* %P to i8* + %mybase0 = getelementptr inbounds i8, i8* %base0, i64 0 + tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 %mybase0, i8 0, i64 32, i32 8) + + %base64_0 = getelementptr inbounds i64, i64* %P, i64 0 + %base64_1 = getelementptr inbounds i64, i64* %P, i64 1 + + store atomic i64 1, i64* %base64_1 unordered, align 8 + store atomic i64 2, i64* %base64_0 unordered, align 8 + ret void +} + +define void @write8To15AndThen0To7_atomic_weaker(i64* nocapture %P) { +; CHECK-LABEL: @write8To15AndThen0To7_atomic_weaker( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8* +; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[MYBASE0]], i64 16 +; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 16, i32 8) +; CHECK-NEXT: [[BASE64_0:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 0 +; CHECK-NEXT: [[BASE64_1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 1 +; CHECK-NEXT: store atomic i64 1, i64* [[BASE64_1]] unordered, align 8 +; CHECK-NEXT: store i64 2, i64* [[BASE64_0]], align 8 +; CHECK-NEXT: ret void +; +entry: + + %base0 = bitcast i64* %P to i8* + %mybase0 = getelementptr inbounds i8, i8* %base0, i64 0 + tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 %mybase0, i8 0, i64 32, i32 8) + + %base64_0 = getelementptr inbounds i64, i64* %P, i64 0 + %base64_1 = getelementptr inbounds i64, i64* %P, i64 1 + + store atomic i64 1, i64* %base64_1 unordered, align 8 + store i64 2, i64* %base64_0, align 8 + 
ret void +} + +define void @write8To15AndThen0To7_atomic_weaker_2(i64* nocapture %P) { +; CHECK-LABEL: @write8To15AndThen0To7_atomic_weaker_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8* +; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[MYBASE0]], i64 16 +; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 16, i32 8) +; CHECK-NEXT: [[BASE64_0:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 0 +; CHECK-NEXT: [[BASE64_1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 1 +; CHECK-NEXT: store i64 1, i64* [[BASE64_1]], align 8 +; CHECK-NEXT: store atomic i64 2, i64* [[BASE64_0]] unordered, align 8 +; CHECK-NEXT: ret void +; +entry: + + %base0 = bitcast i64* %P to i8* + %mybase0 = getelementptr inbounds i8, i8* %base0, i64 0 + tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 %mybase0, i8 0, i64 32, i32 8) + + %base64_0 = getelementptr inbounds i64, i64* %P, i64 0 + %base64_1 = getelementptr inbounds i64, i64* %P, i64 1 + + store i64 1, i64* %base64_1, align 8 + store atomic i64 2, i64* %base64_0 unordered, align 8 + ret void +} + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind +declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture, i8, i64, i32) nounwind + diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreEnd.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreEnd.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreEnd.ll @@ -0,0 +1,391 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; XFAIL: * +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s +target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +%struct.vec2 = type { <4 x i32>, <4 x i32> } +%struct.vec2plusi = type { <4 x i32>, <4 x i32>, i32 } + +@glob1 = global %struct.vec2 zeroinitializer, align 16 +@glob2 = global %struct.vec2plusi zeroinitializer, align 16 + +define void @write24to28(i32* nocapture %p) nounwind uwtable ssp { +; CHECK-LABEL: @write24to28( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 24, i1 false) +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: ret void +; +entry: + %arrayidx0 = getelementptr inbounds i32, i32* %p, i64 1 + %p3 = bitcast i32* %arrayidx0 to i8* + call void @llvm.memset.p0i8.i64(i8* align 4 %p3, i8 0, i64 28, i1 false) + %arrayidx1 = getelementptr inbounds i32, i32* %p, i64 7 + store i32 1, i32* %arrayidx1, align 4 + ret void +} + +define void @write24to28_atomic(i32* nocapture %p) nounwind uwtable ssp { +; CHECK-LABEL: @write24to28_atomic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* +; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 24, i32 4) +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; CHECK-NEXT: store atomic i32 1, i32* [[ARRAYIDX1]] unordered, align 4 +; CHECK-NEXT: ret void +; +entry: + %arrayidx0 = getelementptr inbounds i32, i32* %p, i64 1 + %p3 = bitcast i32* %arrayidx0 to i8* + call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 %p3, i8 0, i64 28, i32 4) + %arrayidx1 = getelementptr 
inbounds i32, i32* %p, i64 7 + store atomic i32 1, i32* %arrayidx1 unordered, align 4 + ret void +} + +; Atomicity of the store is weaker than that of the memset +define void @write24to28_atomic_weaker(i32* nocapture %p) nounwind uwtable ssp { +; CHECK-LABEL: @write24to28_atomic_weaker( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* +; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 24, i32 4) +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: ret void +; +entry: + %arrayidx0 = getelementptr inbounds i32, i32* %p, i64 1 + %p3 = bitcast i32* %arrayidx0 to i8* + call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 %p3, i8 0, i64 28, i32 4) + %arrayidx1 = getelementptr inbounds i32, i32* %p, i64 7 + store i32 1, i32* %arrayidx1, align 4 + ret void +} + +define void @write28to32(i32* nocapture %p) nounwind uwtable ssp { +; CHECK-LABEL: @write28to32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8* +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i1 false) +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: ret void +; +entry: + %p3 = bitcast i32* %p to i8* + call void @llvm.memset.p0i8.i64(i8* align 4 %p3, i8 0, i64 32, i1 false) + %arrayidx1 = getelementptr inbounds i32, i32* %p, i64 7 + store i32 1, i32* %arrayidx1, align 4 + ret void +} + +define void @write28to32_atomic(i32* nocapture %p) nounwind uwtable ssp { +; CHECK-LABEL: @write28to32_atomic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8* +; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64
28, i32 4) +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; CHECK-NEXT: store atomic i32 1, i32* [[ARRAYIDX1]] unordered, align 4 +; CHECK-NEXT: ret void +; +entry: + %p3 = bitcast i32* %p to i8* + call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 %p3, i8 0, i64 32, i32 4) + %arrayidx1 = getelementptr inbounds i32, i32* %p, i64 7 + store atomic i32 1, i32* %arrayidx1 unordered, align 4 + ret void +} + +define void @dontwrite28to32memset(i32* nocapture %p) nounwind uwtable ssp { +; CHECK-LABEL: @dontwrite28to32memset( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8* +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 16 [[P3]], i8 0, i64 32, i1 false) +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: ret void +; +entry: + %p3 = bitcast i32* %p to i8* + call void @llvm.memset.p0i8.i64(i8* align 16 %p3, i8 0, i64 32, i1 false) + %arrayidx1 = getelementptr inbounds i32, i32* %p, i64 7 + store i32 1, i32* %arrayidx1, align 4 + ret void +} + +define void @dontwrite28to32memset_atomic(i32* nocapture %p) nounwind uwtable ssp { +; CHECK-LABEL: @dontwrite28to32memset_atomic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8* +; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 16 [[P3]], i8 0, i64 32, i32 4) +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; CHECK-NEXT: store atomic i32 1, i32* [[ARRAYIDX1]] unordered, align 4 +; CHECK-NEXT: ret void +; +entry: + %p3 = bitcast i32* %p to i8* + call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 16 %p3, i8 0, i64 32, i32 4) + %arrayidx1 = getelementptr inbounds i32, i32* %p, i64 7 + store atomic i32 1, i32* %arrayidx1 unordered, align 4 + ret void +} + +define void @write32to36(%struct.vec2plusi* nocapture %p) nounwind uwtable ssp { 
+; CHECK-LABEL: @write32to36( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.vec2plusi* [[P:%.*]] to i8* +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2plusi* @glob2 to i8*), i64 32, i1 false) +; CHECK-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_VEC2PLUSI:%.*]], %struct.vec2plusi* [[P]], i64 0, i32 2 +; CHECK-NEXT: store i32 1, i32* [[C]], align 4 +; CHECK-NEXT: ret void +; +entry: + %0 = bitcast %struct.vec2plusi* %p to i8* + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %0, i8* align 16 bitcast (%struct.vec2plusi* @glob2 to i8*), i64 36, i1 false) + %c = getelementptr inbounds %struct.vec2plusi, %struct.vec2plusi* %p, i64 0, i32 2 + store i32 1, i32* %c, align 4 + ret void +} + +define void @write32to36_atomic(%struct.vec2plusi* nocapture %p) nounwind uwtable ssp { +; CHECK-LABEL: @write32to36_atomic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.vec2plusi* [[P:%.*]] to i8* +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2plusi* @glob2 to i8*), i64 32, i32 4) +; CHECK-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_VEC2PLUSI:%.*]], %struct.vec2plusi* [[P]], i64 0, i32 2 +; CHECK-NEXT: store atomic i32 1, i32* [[C]] unordered, align 4 +; CHECK-NEXT: ret void +; +entry: + %0 = bitcast %struct.vec2plusi* %p to i8* + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 %0, i8* align 16 bitcast (%struct.vec2plusi* @glob2 to i8*), i64 36, i32 4) + %c = getelementptr inbounds %struct.vec2plusi, %struct.vec2plusi* %p, i64 0, i32 2 + store atomic i32 1, i32* %c unordered, align 4 + ret void +} + +; Atomicity of the store is weaker than the memcpy +define void @write32to36_atomic_weaker(%struct.vec2plusi* nocapture %p) nounwind uwtable ssp { +; CHECK-LABEL: @write32to36_atomic_weaker( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] 
= bitcast %struct.vec2plusi* [[P:%.*]] to i8* +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2plusi* @glob2 to i8*), i64 32, i32 4) +; CHECK-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_VEC2PLUSI:%.*]], %struct.vec2plusi* [[P]], i64 0, i32 2 +; CHECK-NEXT: store i32 1, i32* [[C]], align 4 +; CHECK-NEXT: ret void +; +entry: + %0 = bitcast %struct.vec2plusi* %p to i8* + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 %0, i8* align 16 bitcast (%struct.vec2plusi* @glob2 to i8*), i64 36, i32 4) + %c = getelementptr inbounds %struct.vec2plusi, %struct.vec2plusi* %p, i64 0, i32 2 + store i32 1, i32* %c, align 4 + ret void +} + +define void @write16to32(%struct.vec2* nocapture %p) nounwind uwtable ssp { +; CHECK-LABEL: @write16to32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.vec2* [[P:%.*]] to i8* +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2* @glob1 to i8*), i64 16, i1 false) +; CHECK-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_VEC2:%.*]], %struct.vec2* [[P]], i64 0, i32 1 +; CHECK-NEXT: store <4 x i32> , <4 x i32>* [[C]], align 4 +; CHECK-NEXT: ret void +; +entry: + %0 = bitcast %struct.vec2* %p to i8* + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %0, i8* align 16 bitcast (%struct.vec2* @glob1 to i8*), i64 32, i1 false) + %c = getelementptr inbounds %struct.vec2, %struct.vec2* %p, i64 0, i32 1 + store <4 x i32> , <4 x i32>* %c, align 4 + ret void +} + +define void @write16to32_atomic(%struct.vec2* nocapture %p) nounwind uwtable ssp { +; CHECK-LABEL: @write16to32_atomic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.vec2* [[P:%.*]] to i8* +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2* @glob1 to i8*), i64 16, i32 4) +; 
CHECK-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_VEC2:%.*]], %struct.vec2* [[P]], i64 0, i32 1 +; CHECK-NEXT: store <4 x i32> , <4 x i32>* [[C]], align 4 +; CHECK-NEXT: ret void +; +entry: + %0 = bitcast %struct.vec2* %p to i8* + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 %0, i8* align 16 bitcast (%struct.vec2* @glob1 to i8*), i64 32, i32 4) + %c = getelementptr inbounds %struct.vec2, %struct.vec2* %p, i64 0, i32 1 + store <4 x i32> , <4 x i32>* %c, align 4 + ret void +} + +define void @dontwrite28to32memcpy(%struct.vec2* nocapture %p) nounwind uwtable ssp { +; CHECK-LABEL: @dontwrite28to32memcpy( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.vec2* [[P:%.*]] to i8* +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2* @glob1 to i8*), i64 32, i1 false) +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_VEC2:%.*]], %struct.vec2* [[P]], i64 0, i32 0, i64 7 +; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: ret void +; +entry: + %0 = bitcast %struct.vec2* %p to i8* + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %0, i8* align 16 bitcast (%struct.vec2* @glob1 to i8*), i64 32, i1 false) + %arrayidx1 = getelementptr inbounds %struct.vec2, %struct.vec2* %p, i64 0, i32 0, i64 7 + store i32 1, i32* %arrayidx1, align 4 + ret void +} + +define void @dontwrite28to32memcpy_atomic(%struct.vec2* nocapture %p) nounwind uwtable ssp { +; CHECK-LABEL: @dontwrite28to32memcpy_atomic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.vec2* [[P:%.*]] to i8* +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2* @glob1 to i8*), i64 32, i32 4) +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_VEC2:%.*]], %struct.vec2* [[P]], i64 0, i32 0, i64 7 +; CHECK-NEXT: store atomic i32 1, i32* [[ARRAYIDX1]] 
unordered, align 4 +; CHECK-NEXT: ret void +; +entry: + %0 = bitcast %struct.vec2* %p to i8* + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 %0, i8* align 16 bitcast (%struct.vec2* @glob1 to i8*), i64 32, i32 4) + %arrayidx1 = getelementptr inbounds %struct.vec2, %struct.vec2* %p, i64 0, i32 0, i64 7 + store atomic i32 1, i32* %arrayidx1 unordered, align 4 + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind +declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind +declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture, i8, i64, i32) nounwind + +%struct.trapframe = type { i64, i64, i64 } + +; bugzilla 11455 - make sure negative GEP's don't break this optimisation +define void @cpu_lwp_fork(%struct.trapframe* %md_regs, i64 %pcb_rsp0) nounwind uwtable noinline ssp { +; CHECK-LABEL: @cpu_lwp_fork( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i64 [[PCB_RSP0:%.*]] to %struct.trapframe* +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds [[STRUCT_TRAPFRAME:%.*]], %struct.trapframe* [[TMP0]], i64 -1 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast %struct.trapframe* [[ADD_PTR]] to i8* +; CHECK-NEXT: [[TMP2:%.*]] = bitcast %struct.trapframe* [[MD_REGS:%.*]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]], i8* [[TMP2]], i64 24, i1 false) +; CHECK-NEXT: [[TF_TRAPNO:%.*]] = getelementptr inbounds [[STRUCT_TRAPFRAME]], %struct.trapframe* [[TMP0]], i64 -1, i32 1 +; CHECK-NEXT: store i64 3, i64* [[TF_TRAPNO]], align 8 +; CHECK-NEXT: ret void +; +entry: + %0 = inttoptr i64 %pcb_rsp0 to %struct.trapframe* + %add.ptr = getelementptr inbounds %struct.trapframe, %struct.trapframe* %0, i64 -1 + %1 = bitcast %struct.trapframe* %add.ptr to i8* + %2 = bitcast %struct.trapframe* %md_regs to i8* + call void 
@llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 24, i1 false) + %tf_trapno = getelementptr inbounds %struct.trapframe, %struct.trapframe* %0, i64 -1, i32 1 + store i64 3, i64* %tf_trapno, align 8 + ret void +} + +define void @write16To23AndThen24To31(i64* nocapture %P, i64 %n64, i32 %n32, i16 %n16, i8 %n8) { +; CHECK-LABEL: @write16To23AndThen24To31( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8* +; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0 +; CHECK-NEXT: tail call void @llvm.memset.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 16, i1 false) +; CHECK-NEXT: [[BASE64_2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 2 +; CHECK-NEXT: [[BASE64_3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 3 +; CHECK-NEXT: store i64 3, i64* [[BASE64_2]] +; CHECK-NEXT: store i64 3, i64* [[BASE64_3]] +; CHECK-NEXT: ret void +; +entry: + + %base0 = bitcast i64* %P to i8* + %mybase0 = getelementptr inbounds i8, i8* %base0, i64 0 + tail call void @llvm.memset.p0i8.i64(i8* align 8 %mybase0, i8 0, i64 32, i1 false) + + %base64_2 = getelementptr inbounds i64, i64* %P, i64 2 + %base64_3 = getelementptr inbounds i64, i64* %P, i64 3 + + store i64 3, i64* %base64_2 + store i64 3, i64* %base64_3 + ret void +} + +define void @write16To23AndThen24To31_atomic(i64* nocapture %P, i64 %n64, i32 %n32, i16 %n16, i8 %n8) { +; CHECK-LABEL: @write16To23AndThen24To31_atomic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8* +; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0 +; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 16, i32 8) +; CHECK-NEXT: [[BASE64_2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 2 +; CHECK-NEXT: [[BASE64_3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 3 +; CHECK-NEXT: store atomic i64 3, i64* [[BASE64_2]] unordered, align 8 +; CHECK-NEXT: store atomic i64 3, i64* 
[[BASE64_3]] unordered, align 8 +; CHECK-NEXT: ret void +; +entry: + + %base0 = bitcast i64* %P to i8* + %mybase0 = getelementptr inbounds i8, i8* %base0, i64 0 + tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 %mybase0, i8 0, i64 32, i32 8) + + %base64_2 = getelementptr inbounds i64, i64* %P, i64 2 + %base64_3 = getelementptr inbounds i64, i64* %P, i64 3 + + store atomic i64 3, i64* %base64_2 unordered, align 8 + store atomic i64 3, i64* %base64_3 unordered, align 8 + ret void +} + +define void @write16To23AndThen24To31_atomic_weaker1(i64* nocapture %P, i64 %n64, i32 %n32, i16 %n16, i8 %n8) { +; CHECK-LABEL: @write16To23AndThen24To31_atomic_weaker1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8* +; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0 +; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 16, i32 8) +; CHECK-NEXT: [[BASE64_2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 2 +; CHECK-NEXT: [[BASE64_3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 3 +; CHECK-NEXT: store i64 3, i64* [[BASE64_2]], align 8 +; CHECK-NEXT: store atomic i64 3, i64* [[BASE64_3]] unordered, align 8 +; CHECK-NEXT: ret void +; +entry: + + %base0 = bitcast i64* %P to i8* + %mybase0 = getelementptr inbounds i8, i8* %base0, i64 0 + tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 %mybase0, i8 0, i64 32, i32 8) + + %base64_2 = getelementptr inbounds i64, i64* %P, i64 2 + %base64_3 = getelementptr inbounds i64, i64* %P, i64 3 + + store i64 3, i64* %base64_2, align 8 + store atomic i64 3, i64* %base64_3 unordered, align 8 + ret void +} + +define void @write16To23AndThen24To31_atomic_weaker2(i64* nocapture %P, i64 %n64, i32 %n32, i16 %n16, i8 %n8) { +; CHECK-LABEL: @write16To23AndThen24To31_atomic_weaker2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8* +; CHECK-NEXT: 
[[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0 +; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 16, i32 8) +; CHECK-NEXT: [[BASE64_2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 2 +; CHECK-NEXT: [[BASE64_3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 3 +; CHECK-NEXT: store atomic i64 3, i64* [[BASE64_2]] unordered, align 8 +; CHECK-NEXT: store i64 3, i64* [[BASE64_3]], align 8 +; CHECK-NEXT: ret void +; +entry: + + %base0 = bitcast i64* %P to i8* + %mybase0 = getelementptr inbounds i8, i8* %base0, i64 0 + tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 %mybase0, i8 0, i64 32, i32 8) + + %base64_2 = getelementptr inbounds i64, i64* %P, i64 2 + %base64_3 = getelementptr inbounds i64, i64* %P, i64 3 + + store atomic i64 3, i64* %base64_2 unordered, align 8 + store i64 3, i64* %base64_3, align 8 + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/PartialStore.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/PartialStore.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/PartialStore.ll @@ -0,0 +1,87 @@ +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -enable-dse-partial-store-merging=false -S | FileCheck %s +target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" + +; Ensure that the dead store is deleted in this case. It is wholely +; overwritten by the second store. +define void @test1(i32 *%V) { + %V2 = bitcast i32* %V to i8* ; [#uses=1] + store i8 0, i8* %V2 + store i32 1234567, i32* %V + ret void +; CHECK-LABEL: @test1( +; CHECK-NEXT: store i32 1234567 +} + +; Note that we could do better by merging the two stores into one. 
+define void @test2(i32* %P) { +; CHECK-LABEL: @test2( + store i32 0, i32* %P +; CHECK: store i32 + %Q = bitcast i32* %P to i16* + store i16 1, i16* %Q +; CHECK: store i16 + ret void +} + + +define i32 @test3(double %__x) { +; CHECK-LABEL: @test3( +; CHECK: store double + %__u = alloca { [3 x i32] } + %tmp.1 = bitcast { [3 x i32] }* %__u to double* + store double %__x, double* %tmp.1 + %tmp.4 = getelementptr { [3 x i32] }, { [3 x i32] }* %__u, i32 0, i32 0, i32 1 + %tmp.5 = load i32, i32* %tmp.4 + %tmp.6 = icmp slt i32 %tmp.5, 0 + %tmp.7 = zext i1 %tmp.6 to i32 + ret i32 %tmp.7 +} + +; PR6043 +define void @test4(i8* %P) { +; CHECK-LABEL: @test4( +; CHECK-NEXT: bitcast +; CHECK-NEXT: store double + + store i8 19, i8* %P ;; dead + %A = getelementptr i8, i8* %P, i32 3 + + store i8 42, i8* %A ;; dead + + %Q = bitcast i8* %P to double* + store double 0.0, double* %Q + ret void +} + +; PR8657 +declare void @test5a(i32*) +define void @test5(i32 %i) nounwind ssp { + %A = alloca i32 + %B = bitcast i32* %A to i8* + %C = getelementptr i8, i8* %B, i32 %i + store i8 10, i8* %C ;; Dead store to variable index. + store i32 20, i32* %A + + call void @test5a(i32* %A) + ret void +; CHECK-LABEL: @test5( +; CHECK-NEXT: alloca +; CHECK-NEXT: store i32 20 +; CHECK-NEXT: call void @test5a +} + +declare void @test5a_as1(i32*) +define void @test5_addrspacecast(i32 %i) nounwind ssp { + %A = alloca i32 + %B = addrspacecast i32* %A to i8 addrspace(1)* + %C = getelementptr i8, i8 addrspace(1)* %B, i32 %i + store i8 10, i8 addrspace(1)* %C ;; Dead store to variable index. 
+ store i32 20, i32* %A + + call void @test5a(i32* %A) + ret void +; CHECK-LABEL: @test5_addrspacecast( +; CHECK-NEXT: alloca +; CHECK-NEXT: store i32 20 +; CHECK-NEXT: call void @test5a +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/PartialStore2.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/PartialStore2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/PartialStore2.ll @@ -0,0 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s --data-layout "e" -dse -enable-dse-memoryssa -enable-dse-partial-store-merging=true -S | FileCheck --check-prefix CHECK --check-prefix CHECK-LE %s +; RUN: opt < %s --data-layout "E" -dse -enable-dse-memoryssa -enable-dse-partial-store-merging=true -S | FileCheck --check-prefix CHECK --check-prefix CHECK-BE %s + +; This test used to hit an assertion (see PR41949). +; +; Better safe than sorry, do not assume anything about the padding for the +; i28 store that has 32 bits as store size. +define void @test1(i32* %p) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: [[A:%.*]] = alloca i32 +; CHECK-NEXT: [[B:%.*]] = bitcast i32* [[A]] to i28* +; CHECK-NEXT: [[C:%.*]] = bitcast i32* [[A]] to { i16, i16 }* +; CHECK-NEXT: [[C1:%.*]] = getelementptr inbounds { i16, i16 }, { i16, i16 }* [[C]], i32 0, i32 1 +; CHECK-NEXT: store i28 10, i28* [[B]] +; CHECK-NEXT: store i16 20, i16* [[C1]] +; CHECK-NEXT: call void @test1(i32* [[A]]) +; CHECK-NEXT: ret void +; + %a = alloca i32 + %b = bitcast i32* %a to i28* + %c = bitcast i32* %a to { i16, i16 }* + %c1 = getelementptr inbounds { i16, i16 }, { i16, i16 }* %c, i32 0, i32 1 + store i28 10, i28* %b + store i16 20, i16* %c1 + + call void @test1(i32* %a) + ret void +} + + +; This test used to mis-compile (see PR41949). +; +; Better safe than sorry, do not assume anything about the padding for the +; i12 store that has 16 bits as store size. 
+define void @test2(i32* %p) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: [[U:%.*]] = alloca i32 +; CHECK-NEXT: [[A:%.*]] = bitcast i32* [[U]] to i32* +; CHECK-NEXT: [[B:%.*]] = bitcast i32* [[U]] to i12* +; CHECK-NEXT: store i32 -1, i32* [[A]] +; CHECK-NEXT: store i12 20, i12* [[B]] +; CHECK-NEXT: call void @test2(i32* [[U]]) +; CHECK-NEXT: ret void +; + %u = alloca i32 + %a = bitcast i32* %u to i32* + %b = bitcast i32* %u to i12* + store i32 -1, i32* %a + store i12 20, i12* %b + + call void @test2(i32* %u) + ret void +} + diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/X86/gather-null-pointer.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/X86/gather-null-pointer.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/X86/gather-null-pointer.ll @@ -0,0 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -dse -enable-dse-memoryssa -S | FileCheck %s + +; Both stores should be emitted because we can't tell if the gather aliases. 
+ +define <4 x i32> @bar(<4 x i32> %arg, i32* %arg1) { +; CHECK-LABEL: @bar( +; CHECK-NEXT: bb: +; CHECK-NEXT: store i32 5, i32* [[ARG1:%.*]] +; CHECK-NEXT: [[TMP:%.*]] = tail call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> zeroinitializer, i8* null, <4 x i32> [[ARG:%.*]], <4 x i32> , i8 1) +; CHECK-NEXT: store i32 10, i32* [[ARG1]] +; CHECK-NEXT: ret <4 x i32> [[TMP]] +; +bb: + store i32 5, i32* %arg1 + %tmp = tail call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> zeroinitializer, i8* null, <4 x i32> %arg, <4 x i32> , i8 1) + store i32 10, i32* %arg1 + ret <4 x i32> %tmp +} + +declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>, i8) diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/atomic.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/atomic.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/atomic.ll @@ -0,0 +1,133 @@ +; XFAIL: * +; RUN: opt -basicaa -dse -enable-dse-memoryssa -S < %s | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-apple-macosx10.7.0" + +; Sanity tests for atomic stores. +; Note that it turns out essentially every transformation DSE does is legal on +; atomic ops, just some transformations are not allowed across release-acquire pairs. 
+ +@x = common global i32 0, align 4 +@y = common global i32 0, align 4 + +declare void @randomop(i32*) + +; DSE across unordered store (allowed) +define void @test1() { +; CHECK-LABEL: test1 +; CHECK-NOT: store i32 0 +; CHECK: store i32 1 + store i32 0, i32* @x + store atomic i32 0, i32* @y unordered, align 4 + store i32 1, i32* @x + ret void +} + +; DSE remove unordered store (allowed) +define void @test4() { +; CHECK-LABEL: test4 +; CHECK-NOT: store atomic +; CHECK: store i32 1 + store atomic i32 0, i32* @x unordered, align 4 + store i32 1, i32* @x + ret void +} + +; DSE unordered store overwriting non-atomic store (allowed) +define void @test5() { +; CHECK-LABEL: test5 +; CHECK: store atomic i32 1 + store i32 0, i32* @x + store atomic i32 1, i32* @x unordered, align 4 + ret void +} + +; DSE no-op unordered atomic store (allowed) +define void @test6() { +; CHECK-LABEL: test6 +; CHECK-NOT: store +; CHECK: ret void + %x = load atomic i32, i32* @x unordered, align 4 + store atomic i32 %x, i32* @x unordered, align 4 + ret void +} + +; DSE seq_cst store (be conservative; DSE doesn't have infrastructure +; to reason about atomic operations). +define void @test7() { +; CHECK-LABEL: test7 +; CHECK: store atomic + %a = alloca i32 + store atomic i32 0, i32* %a seq_cst, align 4 + ret void +} + +; DSE and seq_cst load (be conservative; DSE doesn't have infrastructure +; to reason about atomic operations). 
+define i32 @test8() { +; CHECK-LABEL: test8 +; CHECK: store +; CHECK: load atomic + %a = alloca i32 + call void @randomop(i32* %a) + store i32 0, i32* %a, align 4 + %x = load atomic i32, i32* @x seq_cst, align 4 + ret i32 %x +} + +; DSE across monotonic load (allowed as long as the eliminated store isUnordered) +define i32 @test9() { +; CHECK-LABEL: test9 +; CHECK-NOT: store i32 0 +; CHECK: store i32 1 + store i32 0, i32* @x + %x = load atomic i32, i32* @y monotonic, align 4 + store i32 1, i32* @x + ret i32 %x +} + +; DSE across monotonic store (allowed as long as the eliminated store isUnordered) +define void @test10() { +; CHECK-LABEL: test10 +; CHECK-NOT: store i32 0 +; CHECK: store i32 1 + store i32 0, i32* @x + store atomic i32 42, i32* @y monotonic, align 4 + store i32 1, i32* @x + ret void +} + +; DSE across monotonic load (forbidden since the eliminated store is atomic) +define i32 @test11() { +; CHECK-LABEL: test11 +; CHECK: store atomic i32 0 +; CHECK: store atomic i32 1 + store atomic i32 0, i32* @x monotonic, align 4 + %x = load atomic i32, i32* @y monotonic, align 4 + store atomic i32 1, i32* @x monotonic, align 4 + ret i32 %x +} + +; DSE across monotonic store (forbidden since the eliminated store is atomic) +define void @test12() { +; CHECK-LABEL: test12 +; CHECK: store atomic i32 0 +; CHECK: store atomic i32 1 + store atomic i32 0, i32* @x monotonic, align 4 + store atomic i32 42, i32* @y monotonic, align 4 + store atomic i32 1, i32* @x monotonic, align 4 + ret void +} + +; But DSE is not allowed across a release-acquire pair. 
+define i32 @test15() { +; CHECK-LABEL: test15 +; CHECK: store i32 0 +; CHECK: store i32 1 + store i32 0, i32* @x + store atomic i32 0, i32* @y release, align 4 + %x = load atomic i32, i32* @y acquire, align 4 + store i32 1, i32* @x + ret i32 %x +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/calloc-store.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/calloc-store.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/calloc-store.ll @@ -0,0 +1,66 @@ +; XFAIL: * +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s + +declare noalias i8* @calloc(i64, i64) + +define i32* @test1() { +; CHECK-LABEL: test1 + %1 = tail call noalias i8* @calloc(i64 1, i64 4) + %2 = bitcast i8* %1 to i32* + ; This store is dead and should be removed + store i32 0, i32* %2, align 4 +; CHECK-NOT: store i32 0, i32* %2, align 4 + ret i32* %2 +} + +define i32* @test2() { +; CHECK-LABEL: test2 + %1 = tail call noalias i8* @calloc(i64 1, i64 4) + %2 = bitcast i8* %1 to i32* + %3 = getelementptr i32, i32* %2, i32 5 + store i32 0, i32* %3, align 4 +; CHECK-NOT: store i32 0, i32* %2, align 4 + ret i32* %2 +} + +define i32* @test3(i32 *%arg) { +; CHECK-LABEL: test3 + store i32 0, i32* %arg, align 4 +; CHECK: store i32 0, i32* %arg, align 4 + ret i32* %arg +} + +declare void @clobber_memory(i8*) +define i8* @test4() { +; CHECK-LABEL: test4 + %1 = tail call noalias i8* @calloc(i64 1, i64 4) + call void @clobber_memory(i8* %1) + store i8 0, i8* %1, align 4 +; CHECK: store i8 0, i8* %1, align 4 + ret i8* %1 +} + +define i32* @test5() { +; CHECK-LABEL: test5 + %1 = tail call noalias i8* @calloc(i64 1, i64 4) + %2 = bitcast i8* %1 to i32* + store volatile i32 0, i32* %2, align 4 +; CHECK: store volatile i32 0, i32* %2, align 4 + ret i32* %2 +} + +define i8* @test6() { +; CHECK-LABEL: test6 + %1 = tail call noalias i8* @calloc(i64 1, i64 4) + store i8 5, i8* %1, align 4 +; CHECK: store i8 5, i8* %1, align 4 + ret i8* %1 +} + +define 
i8* @test7(i8 %arg) { +; CHECK-LABEL: test7 + %1 = tail call noalias i8* @calloc(i64 1, i64 4) + store i8 %arg, i8* %1, align 4 +; CHECK: store i8 %arg, i8* %1, align 4 + ret i8* %1 +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll @@ -0,0 +1,240 @@ +; XFAIL: * +; RUN: opt -S -dse -enable-dse-memoryssa -enable-dse-partial-store-merging=false < %s | FileCheck %s +target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64-bgq-linux" + +%"struct.std::complex" = type { { float, float } } + +define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) { +entry: +; CHECK-LABEL: @_Z4testSt7complexIfE + + %ref.tmp = alloca i64, align 8 + %tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"* + %c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32 + %c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32 + %0 = bitcast i32 %c.sroa.0.0.extract.trunc to float + %c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32 + %1 = bitcast i32 %c.sroa.2.0.extract.trunc to float + call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce) + %2 = bitcast %"struct.std::complex"* %agg.result to i64* + %3 = load i64, i64* %ref.tmp, align 8 + store i64 %3, i64* %2, align 4 +; CHECK-NOT: store i64 + + %_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0 + %4 = lshr i64 %3, 32 + %5 = trunc i64 %4 to i32 + %6 = bitcast i32 %5 to float + %_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1 + %7 = trunc i64 %3 to i32 + %8 = bitcast i32 %7 to float + %mul_ad.i.i = fmul fast float %6, %1 + %mul_bc.i.i = fmul 
fast float %8, %0 + %mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i + %mul_ac.i.i = fmul fast float %6, %0 + %mul_bd.i.i = fmul fast float %8, %1 + %mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i + store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4 + store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4 + ret void +; CHECK: ret void +} + +declare void @_Z3barSt7complexIfE(%"struct.std::complex"* sret, i64) + +define void @test1(i32 *%ptr) { +entry: +; CHECK-LABEL: @test1 + + store i32 5, i32* %ptr + %bptr = bitcast i32* %ptr to i8* + store i8 7, i8* %bptr + %wptr = bitcast i32* %ptr to i16* + store i16 -30062, i16* %wptr + %bptr2 = getelementptr inbounds i8, i8* %bptr, i64 2 + store i8 25, i8* %bptr2 + %bptr3 = getelementptr inbounds i8, i8* %bptr, i64 3 + store i8 47, i8* %bptr3 + %bptr1 = getelementptr inbounds i8, i8* %bptr, i64 1 + %wptrp = bitcast i8* %bptr1 to i16* + store i16 2020, i16* %wptrp, align 1 + ret void + +; CHECK-NOT: store i32 5, i32* %ptr +; CHECK-NOT: store i8 7, i8* %bptr +; CHECK: store i16 -30062, i16* %wptr +; CHECK-NOT: store i8 25, i8* %bptr2 +; CHECK: store i8 47, i8* %bptr3 +; CHECK: store i16 2020, i16* %wptrp, align 1 + +; CHECK: ret void +} + +define void @test2(i32 *%ptr) { +entry: +; CHECK-LABEL: @test2 + + store i32 5, i32* %ptr + + %bptr = bitcast i32* %ptr to i8* + %bptrm1 = getelementptr inbounds i8, i8* %bptr, i64 -1 + %bptr1 = getelementptr inbounds i8, i8* %bptr, i64 1 + %bptr2 = getelementptr inbounds i8, i8* %bptr, i64 2 + %bptr3 = getelementptr inbounds i8, i8* %bptr, i64 3 + + %wptr = bitcast i8* %bptr to i16* + %wptrm1 = bitcast i8* %bptrm1 to i16* + %wptr1 = bitcast i8* %bptr1 to i16* + %wptr2 = bitcast i8* %bptr2 to i16* + %wptr3 = bitcast i8* %bptr3 to i16* + + store i16 1456, i16* %wptrm1, align 1 + store i16 1346, i16* %wptr, align 1 + store i16 1756, i16* %wptr1, align 1 + store i16 1126, i16* %wptr2, align 1 + store i16 5656, i16* %wptr3, align 1 + +; CHECK-NOT: store i32 5, i32* %ptr + 
+; CHECK: store i16 1456, i16* %wptrm1, align 1 +; CHECK: store i16 1346, i16* %wptr, align 1 +; CHECK: store i16 1756, i16* %wptr1, align 1 +; CHECK: store i16 1126, i16* %wptr2, align 1 +; CHECK: store i16 5656, i16* %wptr3, align 1 + + ret void + +; CHECK: ret void +} + +define signext i8 @test3(i32 *%ptr) { +entry: +; CHECK-LABEL: @test3 + + store i32 5, i32* %ptr + + %bptr = bitcast i32* %ptr to i8* + %bptrm1 = getelementptr inbounds i8, i8* %bptr, i64 -1 + %bptr1 = getelementptr inbounds i8, i8* %bptr, i64 1 + %bptr2 = getelementptr inbounds i8, i8* %bptr, i64 2 + %bptr3 = getelementptr inbounds i8, i8* %bptr, i64 3 + + %wptr = bitcast i8* %bptr to i16* + %wptrm1 = bitcast i8* %bptrm1 to i16* + %wptr1 = bitcast i8* %bptr1 to i16* + %wptr2 = bitcast i8* %bptr2 to i16* + %wptr3 = bitcast i8* %bptr3 to i16* + + %v = load i8, i8* %bptr, align 1 + store i16 1456, i16* %wptrm1, align 1 + store i16 1346, i16* %wptr, align 1 + store i16 1756, i16* %wptr1, align 1 + store i16 1126, i16* %wptr2, align 1 + store i16 5656, i16* %wptr3, align 1 + +; CHECK: store i32 5, i32* %ptr + + ret i8 %v + +; CHECK: ret i8 %v +} + +%struct.foostruct = type { +i32 (i8*, i8**, i32, i8, i8*)*, +i32 (i8*, i8**, i32, i8, i8*)*, +i32 (i8*, i8**, i32, i8, i8*)*, +i32 (i8*, i8**, i32, i8, i8*)*, +void (i8*, i32, i32)* +} +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) +declare void @goFunc(%struct.foostruct*) +declare i32 @fa(i8*, i8**, i32, i8, i8*) + +define void @test4() { +entry: +; CHECK-LABEL: @test4 + + %bang = alloca %struct.foostruct, align 8 + %v1 = bitcast %struct.foostruct* %bang to i8* + call void @llvm.memset.p0i8.i64(i8* align 8 %v1, i8 0, i64 40, i1 false) + %v2 = getelementptr inbounds %struct.foostruct, %struct.foostruct* %bang, i64 0, i32 0 + store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** %v2, align 8 + %v3 = getelementptr inbounds %struct.foostruct, %struct.foostruct* %bang, i64 0, i32 1 + store i32 (i8*, i8**, i32, i8, i8*)* 
@fa, i32 (i8*, i8**, i32, i8, i8*)** %v3, align 8 + %v4 = getelementptr inbounds %struct.foostruct, %struct.foostruct* %bang, i64 0, i32 2 + store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** %v4, align 8 + %v5 = getelementptr inbounds %struct.foostruct, %struct.foostruct* %bang, i64 0, i32 3 + store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** %v5, align 8 + %v6 = getelementptr inbounds %struct.foostruct, %struct.foostruct* %bang, i64 0, i32 4 + store void (i8*, i32, i32)* null, void (i8*, i32, i32)** %v6, align 8 + call void @goFunc(%struct.foostruct* %bang) + ret void + +; CHECK-NOT: memset +; CHECK: ret void +} + +define signext i8 @test5(i32 *%ptr) { +entry: +; CHECK-LABEL: @test5 + + store i32 0, i32* %ptr + + %bptr = bitcast i32* %ptr to i8* + %bptr1 = getelementptr inbounds i8, i8* %bptr, i64 1 + %bptr2 = getelementptr inbounds i8, i8* %bptr, i64 2 + %bptr3 = getelementptr inbounds i8, i8* %bptr, i64 3 + + %wptr = bitcast i8* %bptr to i16* + %wptr1 = bitcast i8* %bptr1 to i16* + %wptr2 = bitcast i8* %bptr2 to i16* + + store i16 65535, i16* %wptr2, align 1 + store i16 1456, i16* %wptr1, align 1 + store i16 1346, i16* %wptr, align 1 + +; CHECK-NOT: store i32 0, i32* %ptr + + ret i8 0 +} + +define signext i8 @test6(i32 *%ptr) { +entry: +; CHECK-LABEL: @test6 + + store i32 0, i32* %ptr + + %bptr = bitcast i32* %ptr to i16* + %bptr1 = getelementptr inbounds i16, i16* %bptr, i64 0 + %bptr2 = getelementptr inbounds i16, i16* %bptr, i64 1 + + store i16 1456, i16* %bptr2, align 1 + store i16 65535, i16* %bptr1, align 1 + +; CHECK-NOT: store i32 0, i32* %ptr + + ret i8 0 +} + +define signext i8 @test7(i64 *%ptr) { +entry: +; CHECK-LABEL: @test7 + + store i64 0, i64* %ptr + + %bptr = bitcast i64* %ptr to i16* + %bptr1 = getelementptr inbounds i16, i16* %bptr, i64 0 + %bptr2 = getelementptr inbounds i16, i16* %bptr, i64 1 + %bptr3 = getelementptr inbounds i16, i16* %bptr, i64 2 + %bptr4 = getelementptr inbounds i16, i16* 
%bptr, i64 3 + + store i16 1346, i16* %bptr1, align 1 + store i16 1756, i16* %bptr3, align 1 + store i16 1456, i16* %bptr2, align 1 + store i16 5656, i16* %bptr4, align 1 + +; CHECK-NOT: store i64 0, i64* %ptr + + ret i8 0 +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/const-pointers.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/const-pointers.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/const-pointers.ll @@ -0,0 +1,40 @@ +; RUN: opt -basicaa -dse -enable-dse-memoryssa -S < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%t = type { i32 } + +@g = global i32 42 + +define void @test1(%t* noalias %pp) { + %p = getelementptr inbounds %t, %t* %pp, i32 0, i32 0 + + store i32 1, i32* %p; <-- This is dead + %x = load i32, i32* inttoptr (i32 12345 to i32*) + store i32 %x, i32* %p + ret void +; CHECK-LABEL: define void @test1( +; CHECK: store +; CHECK-NOT: store +; CHECK: ret void +} + +define void @test3() { + store i32 1, i32* @g; <-- This is dead. 
+ store i32 42, i32* @g + ret void +; CHECK-LABEL: define void @test3( +; CHECK: store +; CHECK-NOT: store +; CHECK: ret void +} + +define void @test4(i32* %p) { + store i32 1, i32* %p + %x = load i32, i32* @g; <-- %p and @g could alias + store i32 %x, i32* %p + ret void +; CHECK-LABEL: define void @test4( +; CHECK: store +; CHECK: store +; CHECK: ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/crash.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/crash.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/crash.ll @@ -0,0 +1,74 @@ +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" +target triple = "i386-apple-darwin10.0" + +@g80 = external global i8 ; [#uses=3] + +declare signext i8 @foo(i8 signext, i8 signext) nounwind readnone ssp + +declare i32 @func68(i32) nounwind readonly ssp + +; PR4815 +define void @test1(i32 %int32p54) noreturn nounwind ssp { +entry: + br label %bb + +bb: ; preds = %bb, %entry + %storemerge = phi i8 [ %2, %bb ], [ 1, %entry ] ; [#uses=1] + store i8 %storemerge, i8* @g80 + %0 = tail call i32 @func68(i32 1) nounwind ssp ; [#uses=1] + %1 = trunc i32 %0 to i8 ; [#uses=1] + store i8 %1, i8* @g80, align 1 + store i8 undef, i8* @g80, align 1 + %2 = tail call signext i8 @foo(i8 signext undef, i8 signext 1) nounwind ; [#uses=1] + br label %bb +} + +define fastcc i32 @test2() nounwind ssp { +bb14: ; preds = %bb4 + %0 = bitcast i8* undef to i8** ; [#uses=1] + %1 = getelementptr inbounds i8*, i8** %0, i64 undef ; [#uses=1] + %2 = bitcast i8** %1 to i16* ; [#uses=2] + %3 = getelementptr inbounds i16, i16* %2, i64 undef ; [#uses=1] + %4 = bitcast i16* %3 to i8* ; [#uses=1] + %5 = getelementptr inbounds i8, i8* %4, i64 undef ; [#uses=1] + %6 = getelementptr inbounds i16, i16* %2, i64 undef ; [#uses=1] + store i16 undef, i16* %6, align 2 + %7 
= getelementptr inbounds i8, i8* %5, i64 undef ; [#uses=1] + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %7, i8* undef, i64 undef, i1 false) + unreachable +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind + + +; rdar://7635088 +define i32 @test3() { +entry: + ret i32 0 + +dead: + %P2 = getelementptr i32, i32 *%P2, i32 52 + %Q2 = getelementptr i32, i32 *%Q2, i32 52 + store i32 4, i32* %P2 + store i32 4, i32* %Q2 + br label %dead +} + + +; PR3141 +%struct.ada__tags__dispatch_table = type { [1 x i32] } +%struct.f393a00_1__object = type { %struct.ada__tags__dispatch_table*, i8 } +%struct.f393a00_2__windmill = type { %struct.f393a00_1__object, i16 } + +define void @test4(%struct.f393a00_2__windmill* %a, %struct.f393a00_2__windmill* %b) { +entry: + %t = alloca %struct.f393a00_2__windmill ; <%struct.f393a00_2__windmill*> [#uses=1] + %0 = getelementptr %struct.f393a00_2__windmill, %struct.f393a00_2__windmill* %t, i32 0, i32 0, i32 0 ; <%struct.ada__tags__dispatch_table**> [#uses=1] + %1 = load %struct.ada__tags__dispatch_table*, %struct.ada__tags__dispatch_table** null, align 4 ; <%struct.ada__tags__dispatch_table*> [#uses=1] + %2 = load %struct.ada__tags__dispatch_table*, %struct.ada__tags__dispatch_table** %0, align 8 ; <%struct.ada__tags__dispatch_table*> [#uses=1] + store %struct.ada__tags__dispatch_table* %2, %struct.ada__tags__dispatch_table** null, align 4 + store %struct.ada__tags__dispatch_table* %1, %struct.ada__tags__dispatch_table** null, align 4 + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/cs-cs-aliasing.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/cs-cs-aliasing.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/cs-cs-aliasing.ll @@ -0,0 +1,74 @@ +; RUN: opt -basicaa -dse -enable-dse-memoryssa -S < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%class.basic_string = 
type { %"class.__gnu_cxx::__versa_string" } +%"class.__gnu_cxx::__versa_string" = type { %"class.__gnu_cxx::__sso_string_base" } +%"class.__gnu_cxx::__sso_string_base" = type { %"struct.__gnu_cxx::__vstring_utility, std::allocator >::_Alloc_hider", i64, %union.anon } +%"struct.__gnu_cxx::__vstring_utility, std::allocator >::_Alloc_hider" = type { i8* } +%union.anon = type { i64, [8 x i8] } + +; Function Attrs: nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) #0 + +; Function Attrs: noinline nounwind readonly uwtable +declare zeroext i1 @callee_takes_string(%class.basic_string* nonnull) #1 align 2 + +; Function Attrs: nounwind uwtable +define weak_odr zeroext i1 @test() #2 align 2 { + +; CHECK-LABEL: @test + +bb: + %tmp = alloca %class.basic_string, align 8 + %tmp1 = alloca %class.basic_string, align 8 + %tmp3 = getelementptr inbounds %class.basic_string, %class.basic_string* %tmp, i64 0, i32 0, i32 0, i32 2 + %tmp4 = bitcast %union.anon* %tmp3 to i8* + %tmp5 = getelementptr inbounds %class.basic_string, %class.basic_string* %tmp, i64 0, i32 0, i32 0, i32 0, i32 0 + %tmp6 = getelementptr inbounds %class.basic_string, %class.basic_string* %tmp, i64 0, i32 0, i32 0, i32 1 + %tmp7 = getelementptr inbounds i8, i8* %tmp4, i64 1 + %tmp8 = bitcast %class.basic_string* %tmp to i8* + %tmp9 = bitcast i64 0 to i64 + %tmp10 = getelementptr inbounds %class.basic_string, %class.basic_string* %tmp1, i64 0, i32 0, i32 0, i32 2 + %tmp11 = bitcast %union.anon* %tmp10 to i8* + %tmp12 = getelementptr inbounds %class.basic_string, %class.basic_string* %tmp1, i64 0, i32 0, i32 0, i32 0, i32 0 + %tmp13 = getelementptr inbounds %class.basic_string, %class.basic_string* %tmp1, i64 0, i32 0, i32 0, i32 1 + %tmp14 = getelementptr inbounds i8, i8* %tmp11, i64 1 + %tmp15 = bitcast %class.basic_string* %tmp1 to i8* + br label %_ZN12basic_stringIcSt11char_traitsIcESaIcEEC2EPKcRKS2_.exit + +_ZN12basic_stringIcSt11char_traitsIcESaIcEEC2EPKcRKS2_.exit: ; preds = %bb + store 
i8* %tmp4, i8** %tmp5, align 8 + store i8 62, i8* %tmp4, align 8 + store i64 1, i64* %tmp6, align 8 + store i8 0, i8* %tmp7, align 1 + %tmp16 = call zeroext i1 @callee_takes_string(%class.basic_string* nonnull %tmp) + br label %_ZN9__gnu_cxx17__sso_string_baseIcSt11char_traitsIcESaIcEED2Ev.exit3 + +_ZN9__gnu_cxx17__sso_string_baseIcSt11char_traitsIcESaIcEED2Ev.exit3: ; preds = %_ZN12basic_stringIcSt11char_traitsIcESaIcEEC2EPKcRKS2_.exit + +; CHECK: _ZN9__gnu_cxx17__sso_string_baseIcSt11char_traitsIcESaIcEED2Ev.exit3: + +; The following can be read through the call %tmp17: + store i8* %tmp11, i8** %tmp12, align 8 + store i8 125, i8* %tmp11, align 8 + store i64 1, i64* %tmp13, align 8 + store i8 0, i8* %tmp14, align 1 + +; CHECK: store i8* %tmp11, i8** %tmp12, align 8 +; CHECK: store i8 125, i8* %tmp11, align 8 +; CHECK: store i64 1, i64* %tmp13, align 8 +; CHECK: store i8 0, i8* %tmp14, align 1 + + %tmp17 = call zeroext i1 @callee_takes_string(%class.basic_string* nonnull %tmp1) + call void @llvm.memset.p0i8.i64(i8* align 8 %tmp11, i8 -51, i64 16, i1 false) #0 + call void @llvm.memset.p0i8.i64(i8* align 8 %tmp15, i8 -51, i64 32, i1 false) #0 + call void @llvm.memset.p0i8.i64(i8* align 8 %tmp4, i8 -51, i64 16, i1 false) #0 + call void @llvm.memset.p0i8.i64(i8* align 8 %tmp8, i8 -51, i64 32, i1 false) #0 + ret i1 %tmp17 +} + +attributes #0 = { nounwind } +attributes #1 = { noinline nounwind readonly uwtable } +attributes #2 = { nounwind uwtable } + diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/debuginfo.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/debuginfo.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/debuginfo.ll @@ -0,0 +1,32 @@ +; XFAIL: * +; RUN: opt < %s -debugify -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s + +target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" + +declare noalias i8* @malloc(i32) + +declare void 
@test_f() + +define i32* @test_salvage(i32 %arg) { +; Check that all four original local variables have their values preserved. +; CHECK-LABEL: @test_salvage( +; CHECK-NEXT: malloc +; CHECK-NEXT: @llvm.dbg.value(metadata i8* %p, metadata ![[p:.*]], metadata !DIExpression()) +; CHECK-NEXT: bitcast +; CHECK-NEXT: @llvm.dbg.value(metadata i32* %P, metadata ![[P:.*]], metadata !DIExpression()) +; CHECK-NEXT: @llvm.dbg.value(metadata i32 %arg, metadata ![[DEAD:.*]], metadata !DIExpression(DW_OP_plus_uconst, 1, DW_OP_stack_value)) +; CHECK-NEXT: call void @test_f() +; CHECK-NEXT: store i32 0, i32* %P + + %p = tail call i8* @malloc(i32 4) + %P = bitcast i8* %p to i32* + %DEAD = add i32 %arg, 1 + store i32 %DEAD, i32* %P + call void @test_f() + store i32 0, i32* %P + ret i32* %P +} + +; CHECK: ![[p]] = !DILocalVariable(name: "1" +; CHECK: ![[P]] = !DILocalVariable(name: "2" +; CHECK: ![[DEAD]] = !DILocalVariable(name: "3" diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/dominate.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/dominate.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/dominate.ll @@ -0,0 +1,25 @@ +; RUN: opt -dse -enable-dse-memoryssa -disable-output < %s +; test that we don't crash +declare void @bar() + +define void @foo() { +bb1: + %memtmp3.i = alloca [21 x i8], align 1 + %0 = getelementptr inbounds [21 x i8], [21 x i8]* %memtmp3.i, i64 0, i64 0 + br label %bb3 + +bb2: + call void @llvm.lifetime.end.p0i8(i64 -1, i8* %0) + br label %bb3 + +bb3: + call void @bar() + call void @llvm.lifetime.end.p0i8(i64 -1, i8* %0) + br label %bb4 + +bb4: + ret void + +} + +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/fence.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/fence.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/fence.ll @@ -0,0 +1,96 @@ +; RUN: opt -S -basicaa -dse 
-enable-dse-memoryssa < %s | FileCheck %s + +; We conservatively choose to prevent dead store elimination +; across release or stronger fences. It's not required +; (since there must still be a race on %addr.i), but +; it is conservatively correct. A legal optimization +; could hoist the second store above the fence, and then +; DSE one of them. +define void @test1(i32* %addr.i) { +; CHECK-LABEL: @test1 +; CHECK: store i32 5 +; CHECK: fence +; CHECK: store i32 5 +; CHECK: ret + store i32 5, i32* %addr.i, align 4 + fence release + store i32 5, i32* %addr.i, align 4 + ret void +} + +; Same as previous, but with different values. If we ever optimize +; this more aggressively, this allows us to check that the correct +; store is retained (the 'i32 1' store in this case) +define void @test1b(i32* %addr.i) { +; CHECK-LABEL: @test1b +; CHECK: store i32 42 +; CHECK: fence release +; CHECK: store i32 1 +; CHECK: ret + store i32 42, i32* %addr.i, align 4 + fence release + store i32 1, i32* %addr.i, align 4 + ret void +} + +; We *could* DSE across this fence, but don't. No other thread can +; observe the order of the acquire fence and the store. +define void @test2(i32* %addr.i) { +; CHECK-LABEL: @test2 +; CHECK: store +; CHECK: fence +; CHECK: store +; CHECK: ret + store i32 5, i32* %addr.i, align 4 + fence acquire + store i32 5, i32* %addr.i, align 4 + ret void +} + +; We DSE stack alloc'ed and byval locations, in the presence of fences. +; Fence does not make an otherwise thread local store visible. +; Right now the DSE in presence of fence is only done in end blocks (with no successors), +; but the same logic applies to other basic blocks as well.
+; The store to %addr.i can be removed since it is a byval attribute +define void @test3(i32* byval %addr.i) { +; CHECK-LABEL: @test3 +; CHECK-NOT: store +; CHECK: fence +; CHECK: ret + store i32 5, i32* %addr.i, align 4 + fence release + ret void +} + +declare void @foo(i8* nocapture %p) + +declare noalias i8* @malloc(i32) + +; DSE of stores in locations allocated through library calls. +define void @test_nocapture() { +; CHECK-LABEL: @test_nocapture +; CHECK: malloc +; CHECK: foo +; CHECK-NOT: store +; CHECK: fence + %m = call i8* @malloc(i32 24) + call void @foo(i8* %m) + store i8 4, i8* %m + fence release + ret void +} + + +; This is a full fence, but it does not make a thread local store visible. +; We can DSE the store in presence of the fence. +define void @fence_seq_cst() { +; CHECK-LABEL: @fence_seq_cst +; CHECK-NEXT: fence seq_cst +; CHECK-NEXT: ret void + %P1 = alloca i32 + store i32 0, i32* %P1, align 4 + fence seq_cst + store i32 4, i32* %P1, align 4 + ret void +} + diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/free.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/free.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/free.ll @@ -0,0 +1,71 @@ +; XFAIL: * +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s + +target datalayout = "e-p:64:64:64" + +declare void @free(i8* nocapture) +declare noalias i8* @malloc(i64) + +; CHECK-LABEL: @test( +; CHECK-NEXT: bitcast +; CHECK-NEXT: @free +; CHECK-NEXT: ret void +define void @test(i32* %Q, i32* %P) { + %DEAD = load i32, i32* %Q ; [#uses=1] + store i32 %DEAD, i32* %P + %1 = bitcast i32* %P to i8* + tail call void @free(i8* %1) nounwind + ret void +} + +; CHECK-LABEL: @test2( +; CHECK-NEXT: bitcast +; CHECK-NEXT: @free +; CHECK-NEXT: ret void +define void @test2({i32, i32}* %P) { + %Q = getelementptr {i32, i32}, {i32, i32} *%P, i32 0, i32 1 + store i32 4, i32* %Q + %1 = bitcast {i32, i32}* %P to i8* + tail call void @free(i8* %1) 
nounwind + ret void +} + +; CHECK-LABEL: @test3( +; CHECK-NOT: store +; CHECK: ret void +define void @test3() { + %m = call i8* @malloc(i64 24) + store i8 0, i8* %m + %m1 = getelementptr i8, i8* %m, i64 1 + store i8 1, i8* %m1 + call void @free(i8* %m) nounwind + ret void +} + +; PR11240 +; CHECK-LABEL: @test4( +; CHECK-NOT: store +; CHECK: ret void +define void @test4(i1 %x) nounwind { +entry: + %alloc1 = tail call noalias i8* @malloc(i64 4) nounwind + br i1 %x, label %skipinit1, label %init1 + +init1: + store i8 1, i8* %alloc1 + br label %skipinit1 + +skipinit1: + tail call void @free(i8* %alloc1) nounwind + ret void +} + +; CHECK-LABEL: @test5( +define void @test5() { + br label %bb + +bb: + tail call void @free(i8* undef) nounwind + br label %bb +} + diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/inst-limits.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/inst-limits.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/inst-limits.ll @@ -0,0 +1,262 @@ +; XFAIL: * +; RUN: opt -S -dse -enable-dse-memoryssa < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; If there are two stores to the same location, DSE should be able to remove +; the first store if the two stores are separated by no more than 98 +; instructions. The existence of debug intrinsics between the stores should +; not affect this instruction limit. + +@x = global i32 0, align 4 + +; Function Attrs: nounwind +define i32 @test_within_limit() !dbg !4 { +entry: + ; The first store; later there is a second store to the same location, + ; so this store should be optimized away by DSE. 
+ ; CHECK-NOT: store i32 1, i32* @x, align 4 + store i32 1, i32* @x, align 4 + + ; Insert 98 dummy instructions between the two stores + %0 = bitcast i32 0 to i32 + %1 = bitcast i32 0 to i32 + %2 = bitcast i32 0 to i32 + %3 = bitcast i32 0 to i32 + %4 = bitcast i32 0 to i32 + %5 = bitcast i32 0 to i32 + %6 = bitcast i32 0 to i32 + %7 = bitcast i32 0 to i32 + %8 = bitcast i32 0 to i32 + %9 = bitcast i32 0 to i32 + %10 = bitcast i32 0 to i32 + %11 = bitcast i32 0 to i32 + %12 = bitcast i32 0 to i32 + %13 = bitcast i32 0 to i32 + %14 = bitcast i32 0 to i32 + %15 = bitcast i32 0 to i32 + %16 = bitcast i32 0 to i32 + %17 = bitcast i32 0 to i32 + %18 = bitcast i32 0 to i32 + %19 = bitcast i32 0 to i32 + %20 = bitcast i32 0 to i32 + %21 = bitcast i32 0 to i32 + %22 = bitcast i32 0 to i32 + %23 = bitcast i32 0 to i32 + %24 = bitcast i32 0 to i32 + %25 = bitcast i32 0 to i32 + %26 = bitcast i32 0 to i32 + %27 = bitcast i32 0 to i32 + %28 = bitcast i32 0 to i32 + %29 = bitcast i32 0 to i32 + %30 = bitcast i32 0 to i32 + %31 = bitcast i32 0 to i32 + %32 = bitcast i32 0 to i32 + %33 = bitcast i32 0 to i32 + %34 = bitcast i32 0 to i32 + %35 = bitcast i32 0 to i32 + %36 = bitcast i32 0 to i32 + %37 = bitcast i32 0 to i32 + %38 = bitcast i32 0 to i32 + %39 = bitcast i32 0 to i32 + %40 = bitcast i32 0 to i32 + %41 = bitcast i32 0 to i32 + %42 = bitcast i32 0 to i32 + %43 = bitcast i32 0 to i32 + %44 = bitcast i32 0 to i32 + %45 = bitcast i32 0 to i32 + %46 = bitcast i32 0 to i32 + %47 = bitcast i32 0 to i32 + %48 = bitcast i32 0 to i32 + %49 = bitcast i32 0 to i32 + %50 = bitcast i32 0 to i32 + %51 = bitcast i32 0 to i32 + %52 = bitcast i32 0 to i32 + %53 = bitcast i32 0 to i32 + %54 = bitcast i32 0 to i32 + %55 = bitcast i32 0 to i32 + %56 = bitcast i32 0 to i32 + %57 = bitcast i32 0 to i32 + %58 = bitcast i32 0 to i32 + %59 = bitcast i32 0 to i32 + %60 = bitcast i32 0 to i32 + %61 = bitcast i32 0 to i32 + %62 = bitcast i32 0 to i32 + %63 = bitcast i32 0 to i32 + %64 = bitcast 
i32 0 to i32 + %65 = bitcast i32 0 to i32 + %66 = bitcast i32 0 to i32 + %67 = bitcast i32 0 to i32 + %68 = bitcast i32 0 to i32 + %69 = bitcast i32 0 to i32 + %70 = bitcast i32 0 to i32 + %71 = bitcast i32 0 to i32 + %72 = bitcast i32 0 to i32 + %73 = bitcast i32 0 to i32 + %74 = bitcast i32 0 to i32 + %75 = bitcast i32 0 to i32 + %76 = bitcast i32 0 to i32 + %77 = bitcast i32 0 to i32 + %78 = bitcast i32 0 to i32 + %79 = bitcast i32 0 to i32 + %80 = bitcast i32 0 to i32 + %81 = bitcast i32 0 to i32 + %82 = bitcast i32 0 to i32 + %83 = bitcast i32 0 to i32 + %84 = bitcast i32 0 to i32 + %85 = bitcast i32 0 to i32 + %86 = bitcast i32 0 to i32 + %87 = bitcast i32 0 to i32 + %88 = bitcast i32 0 to i32 + %89 = bitcast i32 0 to i32 + %90 = bitcast i32 0 to i32 + %91 = bitcast i32 0 to i32 + %92 = bitcast i32 0 to i32 + %93 = bitcast i32 0 to i32 + %94 = bitcast i32 0 to i32 + %95 = bitcast i32 0 to i32 + %96 = bitcast i32 0 to i32 + %97 = bitcast i32 0 to i32 + + ; Insert a meaningless dbg.value intrinsic; it should have no + ; effect on the working of DSE in any way. + call void @llvm.dbg.value(metadata i32 undef, metadata !10, metadata !DIExpression()), !dbg !DILocation(scope: !4) + + ; CHECK: store i32 -1, i32* @x, align 4 + store i32 -1, i32* @x, align 4 + ret i32 0 +} + +; Function Attrs: nounwind +define i32 @test_outside_limit() { +entry: + ; The first store; later there is a second store to the same location + ; CHECK: store i32 1, i32* @x, align 4 + store i32 1, i32* @x, align 4 + + ; Insert 99 dummy instructions between the two stores; this is + ; one instruction too many for the DSE to take place.
+ %0 = bitcast i32 0 to i32 + %1 = bitcast i32 0 to i32 + %2 = bitcast i32 0 to i32 + %3 = bitcast i32 0 to i32 + %4 = bitcast i32 0 to i32 + %5 = bitcast i32 0 to i32 + %6 = bitcast i32 0 to i32 + %7 = bitcast i32 0 to i32 + %8 = bitcast i32 0 to i32 + %9 = bitcast i32 0 to i32 + %10 = bitcast i32 0 to i32 + %11 = bitcast i32 0 to i32 + %12 = bitcast i32 0 to i32 + %13 = bitcast i32 0 to i32 + %14 = bitcast i32 0 to i32 + %15 = bitcast i32 0 to i32 + %16 = bitcast i32 0 to i32 + %17 = bitcast i32 0 to i32 + %18 = bitcast i32 0 to i32 + %19 = bitcast i32 0 to i32 + %20 = bitcast i32 0 to i32 + %21 = bitcast i32 0 to i32 + %22 = bitcast i32 0 to i32 + %23 = bitcast i32 0 to i32 + %24 = bitcast i32 0 to i32 + %25 = bitcast i32 0 to i32 + %26 = bitcast i32 0 to i32 + %27 = bitcast i32 0 to i32 + %28 = bitcast i32 0 to i32 + %29 = bitcast i32 0 to i32 + %30 = bitcast i32 0 to i32 + %31 = bitcast i32 0 to i32 + %32 = bitcast i32 0 to i32 + %33 = bitcast i32 0 to i32 + %34 = bitcast i32 0 to i32 + %35 = bitcast i32 0 to i32 + %36 = bitcast i32 0 to i32 + %37 = bitcast i32 0 to i32 + %38 = bitcast i32 0 to i32 + %39 = bitcast i32 0 to i32 + %40 = bitcast i32 0 to i32 + %41 = bitcast i32 0 to i32 + %42 = bitcast i32 0 to i32 + %43 = bitcast i32 0 to i32 + %44 = bitcast i32 0 to i32 + %45 = bitcast i32 0 to i32 + %46 = bitcast i32 0 to i32 + %47 = bitcast i32 0 to i32 + %48 = bitcast i32 0 to i32 + %49 = bitcast i32 0 to i32 + %50 = bitcast i32 0 to i32 + %51 = bitcast i32 0 to i32 + %52 = bitcast i32 0 to i32 + %53 = bitcast i32 0 to i32 + %54 = bitcast i32 0 to i32 + %55 = bitcast i32 0 to i32 + %56 = bitcast i32 0 to i32 + %57 = bitcast i32 0 to i32 + %58 = bitcast i32 0 to i32 + %59 = bitcast i32 0 to i32 + %60 = bitcast i32 0 to i32 + %61 = bitcast i32 0 to i32 + %62 = bitcast i32 0 to i32 + %63 = bitcast i32 0 to i32 + %64 = bitcast i32 0 to i32 + %65 = bitcast i32 0 to i32 + %66 = bitcast i32 0 to i32 + %67 = bitcast i32 0 to i32 + %68 = bitcast i32 0 to i32 + %69 = 
bitcast i32 0 to i32 + %70 = bitcast i32 0 to i32 + %71 = bitcast i32 0 to i32 + %72 = bitcast i32 0 to i32 + %73 = bitcast i32 0 to i32 + %74 = bitcast i32 0 to i32 + %75 = bitcast i32 0 to i32 + %76 = bitcast i32 0 to i32 + %77 = bitcast i32 0 to i32 + %78 = bitcast i32 0 to i32 + %79 = bitcast i32 0 to i32 + %80 = bitcast i32 0 to i32 + %81 = bitcast i32 0 to i32 + %82 = bitcast i32 0 to i32 + %83 = bitcast i32 0 to i32 + %84 = bitcast i32 0 to i32 + %85 = bitcast i32 0 to i32 + %86 = bitcast i32 0 to i32 + %87 = bitcast i32 0 to i32 + %88 = bitcast i32 0 to i32 + %89 = bitcast i32 0 to i32 + %90 = bitcast i32 0 to i32 + %91 = bitcast i32 0 to i32 + %92 = bitcast i32 0 to i32 + %93 = bitcast i32 0 to i32 + %94 = bitcast i32 0 to i32 + %95 = bitcast i32 0 to i32 + %96 = bitcast i32 0 to i32 + %97 = bitcast i32 0 to i32 + %98 = bitcast i32 0 to i32 + + ; CHECK: store i32 -1, i32* @x, align 4 + store i32 -1, i32* @x, align 4 + ret i32 0 +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.value(metadata, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!11, !13} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.4", isOptimized: true, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!1 = !DIFile(filename: "test.c", directory: "/home/tmp") +!2 = !{} +!4 = distinct !DISubprogram(name: "test_within_limit", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 4, file: !1, scope: !5, type: !6, retainedNodes: !2) +!5 = !DIFile(filename: "test.c", directory: "/home/tmp") +!6 = !DISubroutineType(types: !7) +!7 = !{!8} +!8 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!9 = !{!10} +!10 = !DILocalVariable(name: "x", scope: !4, type: !8) +!11 = !{i32 2, !"Dwarf Version", i32 4} +!12 = !{i32* undef} + +!13 = !{i32 1, !"Debug Info Version", 
i32 3} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/int_sideeffect.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/int_sideeffect.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/int_sideeffect.ll @@ -0,0 +1,15 @@ +; RUN: opt -S < %s -dse -enable-dse-memoryssa | FileCheck %s + +declare void @llvm.sideeffect() + +; Dead store elimination across a @llvm.sideeffect. + +; CHECK-LABEL: dse +; CHECK: store +; CHECK-NOT: store +define void @dse(float* %p) { + store float 0.0, float* %p + call void @llvm.sideeffect() + store float 0.0, float* %p + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/invariant.start.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/invariant.start.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/invariant.start.ll @@ -0,0 +1,34 @@ +; Test to make sure llvm.invariant.start calls are not treated as clobbers. +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s + +declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) nounwind readonly + +; We cannot remove the store 1 to %p. +; FIXME: By the semantics of invariant.start, the store 3 to p is unreachable. +define void @test(i8 *%p) { + store i8 1, i8* %p, align 4 + %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %p) + store i8 3, i8* %p, align 4 + ret void +; CHECK-LABEL: @test( +; CHECK-NEXT: store i8 1, i8* %p, align 4 +; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %p) +; CHECK-NEXT: store i8 3, i8* %p, align 4 +; CHECK-NEXT: ret void +} + +; FIXME: We should be able to remove the first store to p, even though p and q +; may alias. 
+define void @test2(i8* %p, i8* %q) { + store i8 1, i8* %p, align 4 + store i8 2, i8* %q, align 4 + %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %q) + store i8 3, i8* %p, align 4 + ret void +; CHECK-LABEL: @test2( +; CHECK-NEXT: store i8 1, i8* %p, align 4 +; CHECK-NEXT: store i8 2, i8* %q, align 4 +; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %q) +; CHECK-NEXT: store i8 3, i8* %p, align 4 +; CHECK-NEXT: ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/launder.invariant.group.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/launder.invariant.group.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/launder.invariant.group.ll @@ -0,0 +1,65 @@ +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s + +; CHECK-LABEL: void @skipBarrier(i8* %ptr) +define void @skipBarrier(i8* %ptr) { +; CHECK-NOT: store i8 42 + store i8 42, i8* %ptr +; CHECK: %ptr2 = call i8* @llvm.launder.invariant.group.p0i8(i8* %ptr) + %ptr2 = call i8* @llvm.launder.invariant.group.p0i8(i8* %ptr) +; CHECK: store i8 43 + store i8 43, i8* %ptr2 + ret void +} + +; CHECK-LABEL: void @skip2Barriers(i8* %ptr) +define void @skip2Barriers(i8* %ptr) { +; CHECK-NOT: store i8 42 + store i8 42, i8* %ptr +; CHECK: %ptr2 = call i8* @llvm.launder.invariant.group.p0i8(i8* %ptr) + %ptr2 = call i8* @llvm.launder.invariant.group.p0i8(i8* %ptr) +; CHECK-NOT: store i8 43 + store i8 43, i8* %ptr2 + %ptr3 = call i8* @llvm.launder.invariant.group.p0i8(i8* %ptr2) + %ptr4 = call i8* @llvm.launder.invariant.group.p0i8(i8* %ptr3) + +; CHECK: store i8 44 + store i8 44, i8* %ptr4 + ret void +} + +; CHECK-LABEL: void @skip3Barriers(i8* %ptr) +define void @skip3Barriers(i8* %ptr) { +; CHECK-NOT: store i8 42 + store i8 42, i8* %ptr +; CHECK: %ptr2 = call i8* @llvm.strip.invariant.group.p0i8(i8* %ptr) + %ptr2 = call i8* @llvm.strip.invariant.group.p0i8(i8* %ptr) +; CHECK-NOT: store i8 43 + store i8 43, i8* %ptr2 + %ptr3 = call 
i8* @llvm.strip.invariant.group.p0i8(i8* %ptr2) + %ptr4 = call i8* @llvm.strip.invariant.group.p0i8(i8* %ptr3) + +; CHECK: store i8 44 + store i8 44, i8* %ptr4 + ret void +} + +; CHECK-LABEL: void @skip4Barriers(i8* %ptr) +define void @skip4Barriers(i8* %ptr) { +; CHECK-NOT: store i8 42 + store i8 42, i8* %ptr +; CHECK: %ptr2 = call i8* @llvm.strip.invariant.group.p0i8(i8* %ptr) + %ptr2 = call i8* @llvm.strip.invariant.group.p0i8(i8* %ptr) +; CHECK-NOT: store i8 43 + store i8 43, i8* %ptr2 + %ptr3 = call i8* @llvm.launder.invariant.group.p0i8(i8* %ptr2) + %ptr4 = call i8* @llvm.strip.invariant.group.p0i8(i8* %ptr3) + %ptr5 = call i8* @llvm.launder.invariant.group.p0i8(i8* %ptr3) + +; CHECK: store i8 44 + store i8 44, i8* %ptr5 + ret void +} + + +declare i8* @llvm.launder.invariant.group.p0i8(i8*) +declare i8* @llvm.strip.invariant.group.p0i8(i8*) diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/libcalls.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/libcalls.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/libcalls.ll @@ -0,0 +1,71 @@ +; XFAIL: * +; RUN: opt -S -basicaa -dse -enable-dse-memoryssa < %s | FileCheck %s + +declare i8* @strcpy(i8* %dest, i8* %src) nounwind +define void @test1(i8* %src) { +; CHECK-LABEL: @test1( + %B = alloca [16 x i8] + %dest = getelementptr inbounds [16 x i8], [16 x i8]* %B, i64 0, i64 0 +; CHECK-NOT: @strcpy + %call = call i8* @strcpy(i8* %dest, i8* %src) +; CHECK: ret void + ret void +} + +declare i8* @strncpy(i8* %dest, i8* %src, i32 %n) nounwind +define void @test2(i8* %src) { +; CHECK-LABEL: @test2( + %B = alloca [16 x i8] + %dest = getelementptr inbounds [16 x i8], [16 x i8]* %B, i64 0, i64 0 +; CHECK-NOT: @strncpy + %call = call i8* @strncpy(i8* %dest, i8* %src, i32 12) +; CHECK: ret void + ret void +} + +declare i8* @strcat(i8* %dest, i8* %src) nounwind +define void @test3(i8* %src) { +; CHECK-LABEL: @test3( + %B = alloca [16 x i8] + %dest = getelementptr inbounds [16 x 
i8], [16 x i8]* %B, i64 0, i64 0 +; CHECK-NOT: @strcat + %call = call i8* @strcat(i8* %dest, i8* %src) +; CHECK: ret void + ret void +} + +declare i8* @strncat(i8* %dest, i8* %src, i32 %n) nounwind +define void @test4(i8* %src) { +; CHECK-LABEL: @test4( + %B = alloca [16 x i8] + %dest = getelementptr inbounds [16 x i8], [16 x i8]* %B, i64 0, i64 0 +; CHECK-NOT: @strncat + %call = call i8* @strncat(i8* %dest, i8* %src, i32 12) +; CHECK: ret void + ret void +} + +define void @test5(i8* nocapture %src) { +; CHECK-LABEL: @test5( + %dest = alloca [100 x i8], align 16 + %arraydecay = getelementptr inbounds [100 x i8], [100 x i8]* %dest, i64 0, i64 0 + %call = call i8* @strcpy(i8* %arraydecay, i8* %src) +; CHECK: %call = call i8* @strcpy + %arrayidx = getelementptr inbounds i8, i8* %call, i64 10 + store i8 97, i8* %arrayidx, align 1 + ret void +} + +declare void @user(i8* %p) +define void @test6(i8* %src) { +; CHECK-LABEL: @test6( + %B = alloca [16 x i8] + %dest = getelementptr inbounds [16 x i8], [16 x i8]* %B, i64 0, i64 0 +; CHECK: @strcpy + %call = call i8* @strcpy(i8* %dest, i8* %src) +; CHECK: @user + call void @user(i8* %dest) +; CHECK: ret void + ret void +} + diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/lifetime.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/lifetime.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/lifetime.ll @@ -0,0 +1,38 @@ +; XFAIL: * +; RUN: opt -S -basicaa -dse -enable-dse-memoryssa < %s | FileCheck %s + +target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" + +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind +declare void @llvm.memset.p0i8.i8(i8* nocapture, i8, i8, i1) nounwind + +define void @test1() { +; CHECK-LABEL: @test1( + %A = alloca i8 + + store i8 0, i8* %A ;; Written to by memset + call void 
@llvm.lifetime.end.p0i8(i64 1, i8* %A) +; CHECK: lifetime.end + + call void @llvm.memset.p0i8.i8(i8* %A, i8 0, i8 -1, i1 false) +; CHECK-NOT: memset + + ret void +; CHECK: ret void +} + +define void @test2(i32* %P) { +; CHECK: test2 + %Q = getelementptr i32, i32* %P, i32 1 + %R = bitcast i32* %Q to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %R) +; CHECK: lifetime.start + store i32 0, i32* %Q ;; This store is dead. +; CHECK-NOT: store + call void @llvm.lifetime.end.p0i8(i64 4, i8* %R) +; CHECK: lifetime.end + ret void +} + + diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/mda-with-dbg-values.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/mda-with-dbg-values.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/mda-with-dbg-values.ll @@ -0,0 +1,72 @@ +; RUN: opt -S -dse -enable-dse-memoryssa -memdep-block-scan-limit=3 < %s | FileCheck %s +; RUN: opt -S -strip-debug -dse -enable-dse-memoryssa -memdep-block-scan-limit=3 < %s | FileCheck %s + +; Test case to check that the memory dependency analysis gets the same +; result even if we have a dbg value between the memcpy and +; store. The memory dependency is then used by DSE to remove the store. + +; We use -memdep-block-scan-limit=3 to be able to create a small test case. +; Without it, we would need to squeeze in 100 instructions since the default +; limit is 100. 
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@g = common global [1 x i8] zeroinitializer, align 1, !dbg !0 + +; Function Attrs: noinline nounwind uwtable +define void @foo() #0 !dbg !14 { +entry: + %i = alloca i8, align 1 + store i8 1, i8* %i, align 1, !dbg !19 + call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !17, metadata !DIExpression()), !dbg !18 + %0 = bitcast [1 x i8]* @g to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %i, i8* %0, i64 1, i1 false), !dbg !20 + br label %bb2 + +bb2: ; preds = %0 + ret void, !dbg !21 +} + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1 + +; Function Attrs: argmemonly nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #2 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone speculatable } +attributes #2 = { argmemonly nounwind } + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!10, !11, !12} +!llvm.ident = !{!13} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "g", scope: !2, file: !3, line: 3, type: !6, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 6.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5) +!3 = !DIFile(filename: "foo.c", directory: "/bar") +!4 = !{} +!5 = !{!0} +!6 = !DICompositeType(tag: 
DW_TAG_array_type, baseType: !7, size: 8, elements: !8) +!7 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char) +!8 = !{!9} +!9 = !DISubrange(count: 1) +!10 = !{i32 2, !"Dwarf Version", i32 4} +!11 = !{i32 2, !"Debug Info Version", i32 3} +!12 = !{i32 1, !"wchar_size", i32 4} +!13 = !{!"clang version 6.0.0"} +!14 = distinct !DISubprogram(name: "foo", scope: !3, file: !3, line: 5, type: !15, isLocal: false, isDefinition: true, scopeLine: 6, isOptimized: false, unit: !2, retainedNodes: !4) +!15 = !DISubroutineType(types: !16) +!16 = !{null} +!17 = !DILocalVariable(name: "i", scope: !14, file: !3, line: 7, type: !7) +!18 = !DILocation(line: 7, column: 10, scope: !14) +!19 = !DILocation(line: 8, column: 7, scope: !14) +!20 = !DILocation(line: 9, column: 5, scope: !14) +!21 = !DILocation(line: 10, column: 1, scope: !14) + +; Check that the store is removed and that the memcpy is still there +; CHECK-LABEL: foo +; CHECK-NOT: store i8 +; CHECK: call void @llvm.memcpy +; CHECK: ret void diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/memintrinsics.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/memintrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/memintrinsics.ll @@ -0,0 +1,98 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; XFAIL: * +; RUN: opt -S -dse -enable-dse-memoryssa < %s | FileCheck %s + +declare void @llvm.memcpy.p0i8.p0i8.i8(i8* nocapture, i8* nocapture, i8, i1) nounwind +declare void @llvm.memmove.p0i8.p0i8.i8(i8* nocapture, i8* nocapture, i8, i1) nounwind +declare void @llvm.memset.p0i8.i8(i8* nocapture, i8, i8, i1) nounwind + +define void @test1() { +; CHECK-LABEL: @test1( +; CHECK-NEXT: ret void +; + %A = alloca i8 + %B = alloca i8 + + store i8 0, i8* %A ;; Written to by memcpy + + call void @llvm.memcpy.p0i8.p0i8.i8(i8* %A, i8* %B, i8 -1, i1 false) + + ret void +} + +define void @test2() { +; CHECK-LABEL: @test2( +; CHECK-NEXT: ret void +; + 
%A = alloca i8 + %B = alloca i8 + + store i8 0, i8* %A ;; Written to by memmove + + call void @llvm.memmove.p0i8.p0i8.i8(i8* %A, i8* %B, i8 -1, i1 false) + + ret void +} + +define void @test3() { +; CHECK-LABEL: @test3( +; CHECK-NEXT: ret void +; + %A = alloca i8 + %B = alloca i8 + + store i8 0, i8* %A ;; Written to by memset + + call void @llvm.memset.p0i8.i8(i8* %A, i8 0, i8 -1, i1 false) + + ret void +} + +declare void @llvm.memcpy.element.unordered.atomic.p0i16.p0i16.i16(i16* nocapture, i16* nocapture, i16, i32) nounwind +declare void @llvm.memmove.element.unordered.atomic.p0i16.p0i16.i16(i16* nocapture, i16* nocapture, i16, i32) nounwind +declare void @llvm.memset.element.unordered.atomic.p0i16.i16(i16* nocapture, i8, i16, i32) nounwind + + +define void @test4() { +; CHECK-LABEL: @test4( +; CHECK-NEXT: ret void +; + %A = alloca i16, i16 1024, align 2 + %B = alloca i16, i16 1024, align 2 + + store atomic i16 0, i16* %A unordered, align 2 ;; Written to by memcpy + store atomic i16 0, i16* %B unordered, align 2 ;; Read by memcpy + + call void @llvm.memcpy.element.unordered.atomic.p0i16.p0i16.i16(i16* align 2 %A, i16* align 2 %B, i16 1024, i32 2) + + ret void +} + +define void @test5() { +; CHECK-LABEL: @test5( +; CHECK-NEXT: ret void +; + %A = alloca i16, i16 1024, align 2 + %B = alloca i16, i16 1024, align 2 + + store atomic i16 0, i16* %A unordered, align 2 ;; Written to by memmove + store atomic i16 0, i16* %B unordered, align 2 ;; Read by memmove + + call void @llvm.memmove.element.unordered.atomic.p0i16.p0i16.i16(i16* align 2 %A, i16* align 2 %B, i16 1024, i32 2) + + ret void +} + +define void @test6() { +; CHECK-LABEL: @test6( +; CHECK-NEXT: ret void +; + %A = alloca i16, i16 1024, align 2 + %B = alloca i16, i16 1024, align 2 + + store atomic i16 0, i16* %A unordered, align 2 ;; Written to by memset + + call void @llvm.memset.element.unordered.atomic.p0i16.i16(i16* align 2 %A, i8 0, i16 1024, i32 2) + + ret void +} diff --git 
a/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-missing-debugloc.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-missing-debugloc.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-missing-debugloc.ll @@ -0,0 +1,90 @@ +; Test that the getelementptr generated when the dse pass determines that +; a memset can be shortened has the debugloc carried over from the memset. + +; XFAIL: * +; RUN: opt -S -march=native -dse -enable-dse-memoryssa < %s| FileCheck %s +; CHECK: bitcast [5 x i64]* %{{[a-zA-Z_][a-zA-Z0-9_]*}} to i8*, !dbg +; CHECK-NEXT: %{{[0-9]+}} = getelementptr inbounds i8, i8* %0, i64 32, !dbg ![[DBG:[0-9]+]] +; CHECK: ![[DBG]] = !DILocation(line: 2, + +; The test IR is generated by running: +; +; clang Debugify_Dead_Store_Elimination.cpp -Wno-c++11-narrowing -S \ +; -emit-llvm -O0 -w -Xclang -disable-O0-optnone -march=native -fdeclspec \ +; --target=x86_64-gnu-linux-unknown -Werror=unreachable-code -o - +; +; Where Debugify_Dead_Store_Elimination.cpp contains: +; +; int a() { +; long b[]{2, 2, 2, 2, 0}; +; if (a()) +; ; +; } + + +define dso_local i32 @_Z1av() !dbg !7 { +entry: + %retval = alloca i32, align 4 + %b = alloca [5 x i64], align 16 + call void @llvm.dbg.declare(metadata [5 x i64]* %b, metadata !11, metadata !DIExpression()), !dbg !16 + %0 = bitcast [5 x i64]* %b to i8*, !dbg !16 + call void @llvm.memset.p0i8.i64(i8* align 16 %0, i8 0, i64 40, i1 false), !dbg !16 + %1 = bitcast i8* %0 to [5 x i64]*, !dbg !16 + %2 = getelementptr inbounds [5 x i64], [5 x i64]* %1, i32 0, i32 0, !dbg !16 + store i64 2, i64* %2, align 16, !dbg !16 + %3 = getelementptr inbounds [5 x i64], [5 x i64]* %1, i32 0, i32 1, !dbg !16 + store i64 2, i64* %3, align 8, !dbg !16 + %4 = getelementptr inbounds [5 x i64], [5 x i64]* %1, i32 0, i32 2, !dbg !16 + store i64 2, i64* %4, align 16, !dbg !16 + %5 = getelementptr inbounds [5 x i64], [5 x i64]* %1, i32 0, i32 3, !dbg !16 + store i64 2, i64* %5, align 8, !dbg !16 + 
%call = call i32 @_Z1av(), !dbg !17 + %tobool = icmp ne i32 %call, 0, !dbg !17 + br i1 %tobool, label %if.then, label %if.end, !dbg !19 + +if.then: ; preds = %entry + br label %if.end, !dbg !19 + +if.end: ; preds = %if.then, %entry + call void @llvm.trap(), !dbg !20 + unreachable, !dbg !20 + +return: ; No predecessors! + %6 = load i32, i32* %retval, align 4, !dbg !21 + ret i32 %6, !dbg !21 +} + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.declare(metadata, metadata, metadata) + +; Function Attrs: argmemonly nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) + +; Function Attrs: cold noreturn nounwind +declare void @llvm.trap() + +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 9.0.0 (https://github.com/llvm/llvm-project.git eb1a156d7f7ba56ea8f9a26da36e6a93d1e98bda)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) +!1 = !DIFile(filename: "Debugify_Dead_Store_Elimination.cpp", directory: "") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 9.0.0 (https://github.com/llvm/llvm-project.git eb1a156d7f7ba56ea8f9a26da36e6a93d1e98bda)"} +!7 = distinct !DISubprogram(name: "a", linkageName: "_Z1av", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!8 = !DISubroutineType(types: !9) +!9 = !{!10} +!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!11 = !DILocalVariable(name: "b", scope: !7, file: !1, line: 2, type: !12) +!12 = !DICompositeType(tag: DW_TAG_array_type, baseType: !13, size: 320, elements: !14) +!13 = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed) +!14 = !{!15} +!15 = !DISubrange(count: 5) +!16 = !DILocation(line: 2, 
column: 8, scope: !7) +!17 = !DILocation(line: 3, column: 7, scope: !18) +!18 = distinct !DILexicalBlock(scope: !7, file: !1, line: 3, column: 7) +!19 = !DILocation(line: 3, column: 7, scope: !7) +!20 = !DILocation(line: 3, column: 9, scope: !18) +!21 = !DILocation(line: 5, column: 1, scope: !7) diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/merge-stores-big-endian.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/merge-stores-big-endian.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/merge-stores-big-endian.ll @@ -0,0 +1,173 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; XFAIL: * +; RUN: opt -dse -enable-dse-memoryssa -enable-dse-partial-store-merging -S < %s | FileCheck %s +target datalayout = "E-m:e-i64:64-i128:128-n32:64-S128" + +define void @byte_by_byte_replacement(i32 *%ptr) { +; CHECK-LABEL: @byte_by_byte_replacement( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i32 151653132, i32* [[PTR:%.*]] +; CHECK-NEXT: ret void +; +entry: + ;; This store's value should be modified as it should be better to use one + ;; larger store than several smaller ones. 
+ ;; store will turn into 0x090A0B0C == 151653132 + store i32 305419896, i32* %ptr ; 0x12345678 + %bptr = bitcast i32* %ptr to i8* + %bptr1 = getelementptr inbounds i8, i8* %bptr, i64 1 + %bptr2 = getelementptr inbounds i8, i8* %bptr, i64 2 + %bptr3 = getelementptr inbounds i8, i8* %bptr, i64 3 + + ;; We should be able to merge these four stores with the i32 above + ; value (and bytes) stored before ; 0x12345678 + store i8 9, i8* %bptr ; 09 + store i8 10, i8* %bptr1 ; 0A + store i8 11, i8* %bptr2 ; 0B + store i8 12, i8* %bptr3 ; 0C + ; 0x090A0B0C + + ret void +} + +define void @word_replacement(i64 *%ptr) { +; CHECK-LABEL: @word_replacement( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i64 72638273700655232, i64* [[PTR:%.*]] +; CHECK-NEXT: ret void +; +entry: + store i64 72623859790382856, i64* %ptr ; 0x0102030405060708 + + %wptr = bitcast i64* %ptr to i16* + %wptr1 = getelementptr inbounds i16, i16* %wptr, i64 1 + %wptr3 = getelementptr inbounds i16, i16* %wptr, i64 3 + + ;; We should be able to merge these two stores with the i64 one above + ; value (and bytes) stored before ; 0x0102030405060708 + store i16 4128, i16* %wptr1 ; 1020 + store i16 28800, i16* %wptr3 ; 7080 + ; 0x0102102005067080 + + ret void +} + + +define void @differently_sized_replacements(i64 *%ptr) { +; CHECK-LABEL: @differently_sized_replacements( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i64 289077004501059343, i64* [[PTR:%.*]] +; CHECK-NEXT: ret void +; +entry: + store i64 579005069656919567, i64* %ptr ; 0x08090a0b0c0d0e0f + + %bptr = bitcast i64* %ptr to i8* + %bptr6 = getelementptr inbounds i8, i8* %bptr, i64 6 + %wptr = bitcast i64* %ptr to i16* + %wptr2 = getelementptr inbounds i16, i16* %wptr, i64 2 + %dptr = bitcast i64* %ptr to i32* + + ;; We should be able to merge all these stores with the i64 one above + ; value (and bytes) stored before ; 0x08090a0b0c0d0e0f + store i8 7, i8* %bptr6 ; 07 + store i16 1541, i16* %wptr2 ; 0605 + store i32 67305985, i32* %dptr ; 04030201 + ; 
0x040302010605070f + ret void +} + + +define void @multiple_replacements_to_same_byte(i64 *%ptr) { +; CHECK-LABEL: @multiple_replacements_to_same_byte( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i64 289077004602248719, i64* [[PTR:%.*]] +; CHECK-NEXT: ret void +; +entry: + store i64 579005069656919567, i64* %ptr ; 0x08090a0b0c0d0e0f + + %bptr = bitcast i64* %ptr to i8* + %bptr3 = getelementptr inbounds i8, i8* %bptr, i64 3 + %wptr = bitcast i64* %ptr to i16* + %wptr1 = getelementptr inbounds i16, i16* %wptr, i64 1 + %dptr = bitcast i64* %ptr to i32* + + ;; We should be able to merge all these stores with the i64 one above + ; value (and bytes) stored before ; 0x08090a0b0c0d0e0f + store i8 7, i8* %bptr3 ; 07 + store i16 1541, i16* %wptr1 ; 0605 + store i32 67305985, i32* %dptr ; 04030201 + ; 0x040302010c0d0e0f + ret void +} + +define void @merged_merges(i64 *%ptr) { +; CHECK-LABEL: @merged_merges( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i64 289081428418563599, i64* [[PTR:%.*]] +; CHECK-NEXT: ret void +; +entry: + store i64 579005069656919567, i64* %ptr ; 0x08090a0b0c0d0e0f + + %bptr = bitcast i64* %ptr to i8* + %bptr3 = getelementptr inbounds i8, i8* %bptr, i64 3 + %wptr = bitcast i64* %ptr to i16* + %wptr1 = getelementptr inbounds i16, i16* %wptr, i64 1 + %dptr = bitcast i64* %ptr to i32* + + ;; We should be able to merge all these stores with the i64 one above + ; value (not bytes) stored before ; 0x08090a0b0c0d0e0f + store i32 67305985, i32* %dptr ; 04030201 + store i16 1541, i16* %wptr1 ; 0605 + store i8 7, i8* %bptr3 ; 07 + ; 0x040306070c0d0e0f + ret void +} + +define signext i8 @shouldnt_merge_since_theres_a_full_overlap(i64 *%ptr) { +; CHECK-LABEL: @shouldnt_merge_since_theres_a_full_overlap( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[BPTR:%.*]] = bitcast i64* [[PTR:%.*]] to i8* +; CHECK-NEXT: [[BPTRM1:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 -1 +; CHECK-NEXT: [[BPTR3:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 3 +; CHECK-NEXT: [[DPTR:%.*]] 
= bitcast i8* [[BPTRM1]] to i32* +; CHECK-NEXT: [[QPTR:%.*]] = bitcast i8* [[BPTR3]] to i64* +; CHECK-NEXT: store i32 1234, i32* [[DPTR]], align 1 +; CHECK-NEXT: store i64 5678, i64* [[QPTR]], align 1 +; CHECK-NEXT: ret i8 0 +; +entry: + + store i64 0, i64* %ptr + + %bptr = bitcast i64* %ptr to i8* + %bptrm1 = getelementptr inbounds i8, i8* %bptr, i64 -1 + %bptr3 = getelementptr inbounds i8, i8* %bptr, i64 3 + %dptr = bitcast i8* %bptrm1 to i32* + %qptr = bitcast i8* %bptr3 to i64* + + store i32 1234, i32* %dptr, align 1 + store i64 5678, i64* %qptr, align 1 + + ret i8 0 +} + +;; Test case from PR31777 +%union.U = type { i64 } + +define void @foo(%union.U* nocapture %u) { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[I:%.*]] = getelementptr inbounds [[UNION_U:%.*]], %union.U* [[U:%.*]], i64 0, i32 0 +; CHECK-NEXT: store i64 11821949021847552, i64* [[I]], align 8 +; CHECK-NEXT: ret void +; +entry: + %i = getelementptr inbounds %union.U, %union.U* %u, i64 0, i32 0 + store i64 0, i64* %i, align 8 + %s = bitcast %union.U* %u to i16* + store i16 42, i16* %s, align 8 + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/merge-stores.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/merge-stores.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/merge-stores.ll @@ -0,0 +1,237 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; XFAIL: * +; RUN: opt -dse -enable-dse-memoryssa -enable-dse-partial-store-merging -S < %s | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64" + +define void @byte_by_byte_replacement(i32 *%ptr) { +; CHECK-LABEL: @byte_by_byte_replacement( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i32 202050057, i32* [[PTR:%.*]] +; CHECK-NEXT: ret void +; +entry: + ;; This store's value should be modified as it should be 
better to use one + ;; larger store than several smaller ones. + ;; store will turn into 0x0C0B0A09 == 202050057 + store i32 305419896, i32* %ptr ; 0x12345678 + %bptr = bitcast i32* %ptr to i8* + %bptr1 = getelementptr inbounds i8, i8* %bptr, i64 1 + %bptr2 = getelementptr inbounds i8, i8* %bptr, i64 2 + %bptr3 = getelementptr inbounds i8, i8* %bptr, i64 3 + + ;; We should be able to merge these four stores with the i32 above + ; value (and bytes) stored before ; 0x12345678 + store i8 9, i8* %bptr ; 09 + store i8 10, i8* %bptr1 ; 0A + store i8 11, i8* %bptr2 ; 0B + store i8 12, i8* %bptr3 ; 0C + ; 0x0C0B0A09 + ret void +} + +define void @word_replacement(i64 *%ptr) { +; CHECK-LABEL: @word_replacement( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i64 8106482645252179720, i64* [[PTR:%.*]] +; CHECK-NEXT: ret void +; +entry: + store i64 72623859790382856, i64* %ptr ; 0x0102030405060708 + + %wptr = bitcast i64* %ptr to i16* + %wptr1 = getelementptr inbounds i16, i16* %wptr, i64 1 + %wptr3 = getelementptr inbounds i16, i16* %wptr, i64 3 + + ;; We should be able to merge these two stores with the i64 one above + ; value (not bytes) stored before ; 0x0102030405060708 + store i16 4128, i16* %wptr1 ; 1020 + store i16 28800, i16* %wptr3 ; 7080 + ; 0x7080030410200708 + ret void +} + + +define void @differently_sized_replacements(i64 *%ptr) { +; CHECK-LABEL: @differently_sized_replacements( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i64 578437695752307201, i64* [[PTR:%.*]] +; CHECK-NEXT: ret void +; +entry: + store i64 579005069656919567, i64* %ptr ; 0x08090a0b0c0d0e0f + + %bptr = bitcast i64* %ptr to i8* + %bptr6 = getelementptr inbounds i8, i8* %bptr, i64 6 + %wptr = bitcast i64* %ptr to i16* + %wptr2 = getelementptr inbounds i16, i16* %wptr, i64 2 + %dptr = bitcast i64* %ptr to i32* + + ;; We should be able to merge all these stores with the i64 one above + ; value (not bytes) stored before ; 0x08090a0b0c0d0e0f + store i8 7, i8* %bptr6 ; 07 + store i16 1541, i16* %wptr2 ; 0605 
+ store i32 67305985, i32* %dptr ; 04030201 + ; 0x0807060504030201 + ret void +} + + +define void @multiple_replacements_to_same_byte(i64 *%ptr) { +; CHECK-LABEL: @multiple_replacements_to_same_byte( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i64 579005069522043393, i64* [[PTR:%.*]] +; CHECK-NEXT: ret void +; +entry: + store i64 579005069656919567, i64* %ptr ; 0x08090a0b0c0d0e0f + + %bptr = bitcast i64* %ptr to i8* + %bptr3 = getelementptr inbounds i8, i8* %bptr, i64 3 + %wptr = bitcast i64* %ptr to i16* + %wptr1 = getelementptr inbounds i16, i16* %wptr, i64 1 + %dptr = bitcast i64* %ptr to i32* + + ;; We should be able to merge all these stores with the i64 one above + ; value (not bytes) stored before ; 0x08090a0b0c0d0e0f + store i8 7, i8* %bptr3 ; 07 + store i16 1541, i16* %wptr1 ; 0605 + store i32 67305985, i32* %dptr ; 04030201 + ; 0x08090a0b04030201 + ret void +} + +define void @merged_merges(i64 *%ptr) { +; CHECK-LABEL: @merged_merges( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i64 579005069572506113, i64* [[PTR:%.*]] +; CHECK-NEXT: ret void +; +entry: + store i64 579005069656919567, i64* %ptr ; 0x08090a0b0c0d0e0f + + %bptr = bitcast i64* %ptr to i8* + %bptr3 = getelementptr inbounds i8, i8* %bptr, i64 3 + %wptr = bitcast i64* %ptr to i16* + %wptr1 = getelementptr inbounds i16, i16* %wptr, i64 1 + %dptr = bitcast i64* %ptr to i32* + + ;; We should be able to merge all these stores with the i64 one above + ; value (not bytes) stored before ; 0x08090a0b0c0d0e0f + store i32 67305985, i32* %dptr ; 04030201 + store i16 1541, i16* %wptr1 ; 0605 + store i8 7, i8* %bptr3 ; 07 + ; 0x08090a0b07050201 + ret void +} + +define signext i8 @shouldnt_merge_since_theres_a_full_overlap(i64 *%ptr) { +; CHECK-LABEL: @shouldnt_merge_since_theres_a_full_overlap( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[BPTR:%.*]] = bitcast i64* [[PTR:%.*]] to i8* +; CHECK-NEXT: [[BPTRM1:%.*]] = getelementptr inbounds i8, i8* [[BPTR]], i64 -1 +; CHECK-NEXT: [[BPTR3:%.*]] = getelementptr inbounds i8, 
i8* [[BPTR]], i64 3 +; CHECK-NEXT: [[DPTR:%.*]] = bitcast i8* [[BPTRM1]] to i32* +; CHECK-NEXT: [[QPTR:%.*]] = bitcast i8* [[BPTR3]] to i64* +; CHECK-NEXT: store i32 1234, i32* [[DPTR]], align 1 +; CHECK-NEXT: store i64 5678, i64* [[QPTR]], align 1 +; CHECK-NEXT: ret i8 0 +; +entry: + + ; Also check that alias.scope metadata doesn't get dropped + store i64 0, i64* %ptr, !alias.scope !32 + + %bptr = bitcast i64* %ptr to i8* + %bptrm1 = getelementptr inbounds i8, i8* %bptr, i64 -1 + %bptr3 = getelementptr inbounds i8, i8* %bptr, i64 3 + %dptr = bitcast i8* %bptrm1 to i32* + %qptr = bitcast i8* %bptr3 to i64* + + store i32 1234, i32* %dptr, align 1 + store i64 5678, i64* %qptr, align 1 + + ret i8 0 +} + +;; Test case from PR31777 +%union.U = type { i64 } + +define void @foo(%union.U* nocapture %u) { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[I:%.*]] = getelementptr inbounds [[UNION_U:%.*]], %union.U* [[U:%.*]], i64 0, i32 0 +; CHECK-NEXT: store i64 42, i64* [[I]], align 8 +; CHECK-NEXT: ret void +; +entry: + %i = getelementptr inbounds %union.U, %union.U* %u, i64 0, i32 0 + store i64 0, i64* %i, align 8, !dbg !22, !tbaa !26, !noalias !30, !nontemporal !29 + %s = bitcast %union.U* %u to i16* + store i16 42, i16* %s, align 8 + ret void +} + +; Don't crash by operating on stale data if we merge (kill) the last 2 stores. + +define void @PR34074(i32* %x, i64* %y) { +; CHECK-LABEL: @PR34074( +; CHECK-NEXT: store i64 42, i64* %y +; CHECK-NEXT: store i32 4, i32* %x +; CHECK-NEXT: ret void +; + store i64 42, i64* %y ; independent store + %xbc = bitcast i32* %x to i8* + store i32 0, i32* %x ; big store of constant + store i8 4, i8* %xbc ; small store with mergeable constant + ret void +} + +; We can't eliminate the last store because P and Q may alias. 
+ +define void @PR36129(i32* %P, i32* %Q) { +; CHECK-LABEL: @PR36129( +; CHECK-NEXT: store i32 1, i32* [[P:%.*]] +; CHECK-NEXT: [[P2:%.*]] = bitcast i32* [[P]] to i8* +; CHECK-NEXT: store i32 2, i32* [[Q:%.*]] +; CHECK-NEXT: store i8 3, i8* [[P2]] +; CHECK-NEXT: ret void +; + store i32 1, i32* %P + %P2 = bitcast i32* %P to i8* + store i32 2, i32* %Q + store i8 3, i8* %P2 + ret void +} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 5.0.0 (trunk 306512)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "me.cpp", directory: "/compiler-explorer") +!2 = !{} +!7 = distinct !DISubprogram(name: "foo", linkageName: "foo(U*)", scope: !1, file: !1, line: 9, type: !8, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !20) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10} +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) +!11 = distinct !DICompositeType(tag: DW_TAG_union_type, name: "U", file: !1, line: 4, size: 64, elements: !12, identifier: "typeinfo name for U") +!12 = !{!13, !17} +!13 = !DIDerivedType(tag: DW_TAG_member, name: "i", scope: !11, file: !1, line: 5, baseType: !14, size: 64) +!14 = !DIDerivedType(tag: DW_TAG_typedef, name: "uint64_t", file: !15, line: 55, baseType: !16) +!15 = !DIFile(filename: "/usr/include/stdint.h", directory: "/compiler-explorer") +!16 = !DIBasicType(name: "long unsigned int", size: 64, encoding: DW_ATE_unsigned) +!17 = !DIDerivedType(tag: DW_TAG_member, name: "s", scope: !11, file: !1, line: 6, baseType: !18, size: 16) +!18 = !DIDerivedType(tag: DW_TAG_typedef, name: "uint16_t", file: !15, line: 49, baseType: !19) +!19 = !DIBasicType(name: "unsigned short", size: 16, encoding: DW_ATE_unsigned) +!20 = !{!21} +!21 = !DILocalVariable(name: "u", arg: 1, scope: !7, file: !1, line: 9, type: !10) +!22 = !DILocation(line: 10, column: 8, scope: !7) + +!26 
= !{!27, !27, i64 0} +!27 = !{!"omnipotent char", !28, i64 0} +!28 = !{!"Simple C++ TBAA"} + +!29 = !{i32 1} + +; Domains and scopes which might alias +!30 = !{!30} +!31 = !{!31, !30} + +!32 = !{!32} +!33 = !{!33, !32} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/no-targetdata.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/no-targetdata.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/no-targetdata.ll @@ -0,0 +1,21 @@ +; RUN: opt -basicaa -dse -enable-dse-memoryssa -S < %s | FileCheck %s + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind + +define void @fn(i8* nocapture %buf) #0 { +entry: + +; We would not eliminate the first memcpy with data layout, and we should not +; eliminate it without data layout. +; CHECK-LABEL: @fn +; CHECK: tail call void @llvm.memcpy.p0i8.p0i8.i64 +; CHECK: tail call void @llvm.memcpy.p0i8.p0i8.i64 +; CHECK: ret void + + %arrayidx = getelementptr i8, i8* %buf, i64 18 + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arrayidx, i8* %buf, i64 18, i1 false) + store i8 1, i8* %arrayidx, align 1 + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %buf, i8* %arrayidx, i64 18, i1 false) + ret void +} + diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/operand-bundles.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/operand-bundles.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/operand-bundles.ll @@ -0,0 +1,56 @@ +; XFAIL: * +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s + +declare noalias i8* @malloc(i64) "malloc-like" + +declare void @foo() +declare void @bar(i8*) + +define void @test() { + %obj = call i8* @malloc(i64 8) + store i8 0, i8* %obj + ; don't remove store. %obj should be treated like it will be read by the @foo. 
+ ; CHECK: store i8 0, i8* %obj + call void @foo() ["deopt" (i8* %obj)] + ret void +} + +define void @test1() { + %obj = call i8* @malloc(i64 8) + store i8 0, i8* %obj + ; CHECK: store i8 0, i8* %obj + call void @bar(i8* nocapture %obj) + ret void +} + +define void @test2() { + %obj = call i8* @malloc(i64 8) + store i8 0, i8* %obj + ; CHECK-NOT: store i8 0, i8* %obj + call void @foo() + ret void +} + +define void @test3() { + ; CHECK-LABEL: @test3( + %s = alloca i64 + ; Verify that this first store is not considered killed by the second one + ; since it could be observed from the deopt continuation. + ; CHECK: store i64 1, i64* %s + store i64 1, i64* %s + call void @foo() [ "deopt"(i64* %s) ] + store i64 0, i64* %s + ret void +} + +declare noalias i8* @calloc(i64, i64) + +define void @test4() { +; CHECK-LABEL: @test4 + %local_obj = call i8* @calloc(i64 1, i64 4) + call void @foo() ["deopt" (i8* %local_obj)] + store i8 0, i8* %local_obj, align 4 + ; CHECK-NOT: store i8 0, i8* %local_obj, align 4 + call void @bar(i8* nocapture %local_obj) + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/pr11390.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/pr11390.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/pr11390.ll @@ -0,0 +1,38 @@ +; RUN: opt -basicaa -dse -enable-dse-memoryssa -S < %s | FileCheck %s +; PR11390 +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define fastcc void @cat_domain(i8* nocapture %name, i8* nocapture %domain, i8** +nocapture %s) nounwind uwtable { +entry: + %call = tail call i64 @strlen(i8* %name) nounwind readonly + %call1 = tail call i64 @strlen(i8* %domain) nounwind readonly + %add = add i64 %call, 1 + %add2 = add i64 %add, %call1 + %add3 = add i64 %add2, 1 + %call4 = tail call noalias i8* @malloc(i64 %add3) 
nounwind + store i8* %call4, i8** %s, align 8 + %tobool = icmp eq i8* %call4, null + br i1 %tobool, label %return, label %if.end + +if.end: ; preds = %entry + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %call4, i8* %name, i64 %call, i1 false) + %arrayidx = getelementptr inbounds i8, i8* %call4, i64 %call + store i8 46, i8* %arrayidx, align 1 +; CHECK: store i8 46 + %add.ptr5 = getelementptr inbounds i8, i8* %call4, i64 %add + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %add.ptr5, i8* %domain, i64 %call1, i1 false) + %arrayidx8 = getelementptr inbounds i8, i8* %call4, i64 %add2 + store i8 0, i8* %arrayidx8, align 1 + br label %return + +return: ; preds = %if.end, %entry + ret void +} + +declare i64 @strlen(i8* nocapture) nounwind readonly + +declare noalias i8* @malloc(i64) nounwind + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple-todo.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple-todo.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple-todo.ll @@ -0,0 +1,419 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; XFAIL: * +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s +; RUN: opt < %s -aa-pipeline=basic-aa -passes=dse -enable-dse-memoryssa -S | FileCheck %s +target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind +declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture, i8, i64, i32) nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind +declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind +declare void @llvm.init.trampoline(i8*, i8*, i8*) + +; PR8576 - Should delete store of 10 
even though p/q are may aliases. +define void @test2(i32 *%p, i32 *%q) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: store i32 20, i32* [[Q:%.*]], align 4 +; CHECK-NEXT: store i32 30, i32* [[P:%.*]], align 4 +; CHECK-NEXT: ret void +; + store i32 10, i32* %p, align 4 + store i32 20, i32* %q, align 4 + store i32 30, i32* %p, align 4 + ret void +} + +define void @test5(i32* %Q) { +; CHECK-LABEL: @test5( +; CHECK-NEXT: [[A:%.*]] = load volatile i32, i32* [[Q:%.*]] +; CHECK-NEXT: ret void +; + %a = load volatile i32, i32* %Q + store i32 %a, i32* %Q + ret void +} + +; Should delete store of 10 even though memset is a may-store to P (P and Q may +; alias). +define void @test6(i32 *%p, i8 *%q) { +; CHECK-LABEL: @test6( +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[Q:%.*]], i8 42, i64 900, i1 false) +; CHECK-NEXT: store i32 30, i32* [[P:%.*]], align 4 +; CHECK-NEXT: ret void +; + store i32 10, i32* %p, align 4 ;; dead. + call void @llvm.memset.p0i8.i64(i8* %q, i8 42, i64 900, i1 false) + store i32 30, i32* %p, align 4 + ret void +} + +; Should delete store of 10 even though memset is a may-store to P (P and Q may +; alias). +define void @test6_atomic(i32* align 4 %p, i8* align 4 %q) { +; CHECK-LABEL: @test6_atomic( +; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[Q:%.*]], i8 42, i64 900, i32 4) +; CHECK-NEXT: store atomic i32 30, i32* [[P:%.*]] unordered, align 4 +; CHECK-NEXT: ret void +; + store atomic i32 10, i32* %p unordered, align 4 ;; dead. + call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 %q, i8 42, i64 900, i32 4) + store atomic i32 30, i32* %p unordered, align 4 + ret void +} + +; Should delete store of 10 even though memcpy is a may-store to P (P and Q may +; alias). 
+define void @test7(i32 *%p, i8 *%q, i8* noalias %r) { +; CHECK-LABEL: @test7( +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[Q:%.*]], i8* [[R:%.*]], i64 900, i1 false) +; CHECK-NEXT: store i32 30, i32* [[P:%.*]], align 4 +; CHECK-NEXT: ret void +; + store i32 10, i32* %p, align 4 ;; dead. + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %q, i8* %r, i64 900, i1 false) + store i32 30, i32* %p, align 4 + ret void +} + +; Should delete store of 10 even though memcpy is a may-store to P (P and Q may +; alias). +define void @test7_atomic(i32* align 4 %p, i8* align 4 %q, i8* noalias align 4 %r) { +; CHECK-LABEL: @test7_atomic( +; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 4 [[Q:%.*]], i8* align 4 [[R:%.*]], i64 900, i32 4) +; CHECK-NEXT: store atomic i32 30, i32* [[P:%.*]] unordered, align 4 +; CHECK-NEXT: ret void +; + store atomic i32 10, i32* %p unordered, align 4 ;; dead. + call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 4 %q, i8* align 4 %r, i64 900, i32 4) + store atomic i32 30, i32* %p unordered, align 4 + ret void +} + +; Do not delete stores that are only partially killed. +define i32 @test8() { +; CHECK-LABEL: @test8( +; CHECK-NEXT: [[V:%.*]] = alloca i32 +; CHECK-NEXT: store i32 1234567, i32* [[V]] +; CHECK-NEXT: [[X:%.*]] = load i32, i32* [[V]] +; CHECK-NEXT: ret i32 [[X]] +; + %V = alloca i32 + store i32 1234567, i32* %V + %V2 = bitcast i32* %V to i8* + store i8 0, i8* %V2 + %X = load i32, i32* %V + ret i32 %X + +} + +; Test for byval handling. +%struct.x = type { i32, i32, i32, i32 } +define void @test9(%struct.x* byval %a) nounwind { +; CHECK-LABEL: @test9( +; CHECK-NEXT: ret void +; + %tmp2 = getelementptr %struct.x, %struct.x* %a, i32 0, i32 0 + store i32 1, i32* %tmp2, align 4 + ret void +} + +; Test for inalloca handling. 
+define void @test9_2(%struct.x* inalloca %a) nounwind { +; CHECK-LABEL: @test9_2( +; CHECK-NEXT: ret void +; + %tmp2 = getelementptr %struct.x, %struct.x* %a, i32 0, i32 0 + store i32 1, i32* %tmp2, align 4 + ret void +} + +; DSE should delete the dead trampoline. +declare void @test11f() +define void @test11() { +; CHECK-LABEL: @test11( +; CHECK-NEXT: ret void +; + %storage = alloca [10 x i8], align 16 ; <[10 x i8]*> [#uses=1] + %cast = getelementptr [10 x i8], [10 x i8]* %storage, i32 0, i32 0 ; [#uses=1] + call void @llvm.init.trampoline( i8* %cast, i8* bitcast (void ()* @test11f to i8*), i8* null ) ; [#uses=1] + ret void +} + +; PR2599 - load -> store to same address. +define void @test12({ i32, i32 }* %x) nounwind { +; CHECK-LABEL: @test12( +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr { i32, i32 }, { i32, i32 }* [[X:%.*]], i32 0, i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = sub i32 0, [[TMP8]] +; CHECK-NEXT: store i32 [[TMP17]], i32* [[TMP7]], align 4 +; CHECK-NEXT: ret void +; + %tmp4 = getelementptr { i32, i32 }, { i32, i32 }* %x, i32 0, i32 0 + %tmp5 = load i32, i32* %tmp4, align 4 + %tmp7 = getelementptr { i32, i32 }, { i32, i32 }* %x, i32 0, i32 1 + %tmp8 = load i32, i32* %tmp7, align 4 + %tmp17 = sub i32 0, %tmp8 + store i32 %tmp5, i32* %tmp4, align 4 + store i32 %tmp17, i32* %tmp7, align 4 + ret void +} + + +; %P doesn't escape, the DEAD instructions should be removed. 
+declare void @test13f() +define i32* @test13() { +; CHECK-LABEL: @test13( +; CHECK-NEXT: [[PTR:%.*]] = tail call i8* @malloc(i32 4) +; CHECK-NEXT: [[P:%.*]] = bitcast i8* [[PTR]] to i32* +; CHECK-NEXT: call void @test13f() +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: ret i32* [[P]] +; + %ptr = tail call i8* @malloc(i32 4) + %P = bitcast i8* %ptr to i32* + %DEAD = load i32, i32* %P + %DEAD2 = add i32 %DEAD, 1 + store i32 %DEAD2, i32* %P + call void @test13f( ) + store i32 0, i32* %P + ret i32* %P +} + +define i32 addrspace(1)* @test13_addrspacecast() { +; CHECK-LABEL: @test13_addrspacecast( +; CHECK-NEXT: [[P:%.*]] = tail call i8* @malloc(i32 4) +; CHECK-NEXT: [[P_BC:%.*]] = bitcast i8* [[P]] to i32* +; CHECK-NEXT: [[P:%.*]] = addrspacecast i32* [[P_BC]] to i32 addrspace(1)* +; CHECK-NEXT: call void @test13f() +; CHECK-NEXT: store i32 0, i32 addrspace(1)* [[P]] +; CHECK-NEXT: ret i32 addrspace(1)* [[P]] +; + %p = tail call i8* @malloc(i32 4) + %p.bc = bitcast i8* %p to i32* + %P = addrspacecast i32* %p.bc to i32 addrspace(1)* + %DEAD = load i32, i32 addrspace(1)* %P + %DEAD2 = add i32 %DEAD, 1 + store i32 %DEAD2, i32 addrspace(1)* %P + call void @test13f( ) + store i32 0, i32 addrspace(1)* %P + ret i32 addrspace(1)* %P +} + +declare noalias i8* @malloc(i32) +declare noalias i8* @calloc(i32, i32) + +define void @test14(i32* %Q) { +; CHECK-LABEL: @test14( +; CHECK-NEXT: ret void +; + %P = alloca i32 + %DEAD = load i32, i32* %Q + store i32 %DEAD, i32* %P + ret void + +} + +define void @test20() { +; CHECK-LABEL: @test20( +; CHECK-NEXT: ret void +; + %m = call i8* @malloc(i32 24) + store i8 0, i8* %m + ret void +} + +define void @test21() { +; CHECK-LABEL: @test21( +; CHECK-NEXT: ret void +; + %m = call i8* @calloc(i32 9, i32 7) + store i8 0, i8* %m + ret void +} + +define void @test22(i1 %i, i32 %k, i32 %m) nounwind { +; CHECK-LABEL: @test22( +; CHECK-NEXT: ret void +; + %k.addr = alloca i32 + %m.addr = alloca i32 + %k.addr.m.addr = select i1 %i, i32* 
%k.addr, i32* %m.addr + store i32 0, i32* %k.addr.m.addr, align 4 + ret void +} + +; Make sure same sized store to later element is deleted +define void @test24([2 x i32]* %a, i32 %b, i32 %c) nounwind { +; CHECK-LABEL: @test24( +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A:%.*]], i64 0, i64 0 +; CHECK-NEXT: store i32 [[B:%.*]], i32* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 1 +; CHECK-NEXT: store i32 [[C:%.*]], i32* [[TMP2]], align 4 +; CHECK-NEXT: ret void +; + %1 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 0 + store i32 0, i32* %1, align 4 + %2 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 1 + store i32 0, i32* %2, align 4 + %3 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 0 + store i32 %b, i32* %3, align 4 + %4 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 1 + store i32 %c, i32* %4, align 4 + ret void +} + +; Remove redundant store if loaded value is in another block. +define i32 @test26(i1 %c, i32* %p) { +; CHECK-LABEL: @test26( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: ret i32 0 +; +entry: + %v = load i32, i32* %p, align 4 + br i1 %c, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: + store i32 %v, i32* %p, align 4 + br label %bb3 +bb3: + ret i32 0 +} + +; Remove redundant store if loaded value is in another block. 
+define i32 @test27(i1 %c, i32* %p) { +; CHECK-LABEL: @test27( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: ret i32 0 +; +entry: + %v = load i32, i32* %p, align 4 + br i1 %c, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: + br label %bb3 +bb3: + store i32 %v, i32* %p, align 4 + ret i32 0 +} + +declare void @unknown_func() + +; Don't remove redundant store because of unknown call. +define i32 @test30(i1 %c, i32* %p, i32 %i) { +; CHECK-LABEL: @test30( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[V:%.*]] = load i32, i32* [[P:%.*]], align 4 +; CHECK-NEXT: br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: call void @unknown_func() +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: store i32 [[V]], i32* [[P]], align 4 +; CHECK-NEXT: ret i32 0 +; +entry: + %v = load i32, i32* %p, align 4 + br i1 %c, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: + ; Might overwrite value at %p + call void @unknown_func() + br label %bb3 +bb3: + store i32 %v, i32* %p, align 4 + ret i32 0 +} + +; Remove redundant store if loaded value is in another block inside a loop. +define i32 @test31(i1 %c, i32* %p, i32 %i) { +; CHECK-LABEL: @test31( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: ret i32 0 +; +entry: + %v = load i32, i32* %p, align 4 + br label %bb1 +bb1: + store i32 %v, i32* %p, align 4 + br i1 undef, label %bb1, label %bb2 +bb2: + ret i32 0 +} + +; Remove redundant store, which is in the same loop as the load.
+define i32 @test33(i1 %c, i32* %p, i32 %i) { +; CHECK-LABEL: @test33( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: call void @unknown_func() +; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB3:%.*]] +; CHECK: bb3: +; CHECK-NEXT: ret i32 0 +; +entry: + br label %bb1 +bb1: + %v = load i32, i32* %p, align 4 + br label %bb2 +bb2: + store i32 %v, i32* %p, align 4 + ; Might read and overwrite value at %p, but doesn't matter. + call void @unknown_func() + br i1 undef, label %bb1, label %bb3 +bb3: + ret i32 0 +} + +define void @test43(i32* %P, i32* noalias %Q) { +; CHECK-LABEL: @test43( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i32 50331649, i32* [[P:%.*]] +; CHECK-NEXT: store i32 2, i32* [[Q:%.*]] +; CHECK-NEXT: ret void +; +entry: + store i32 1, i32* %P + %P2 = bitcast i32* %P to i8* + store i32 2, i32* %Q + store i8 3, i8* %P2 + ret void +} + +define void @test43a(i32* %P, i32* noalias %Q) { +; CHECK-LABEL: @test43a( +; CHECK-NEXT: entry: +; CHECK-NEXT: store atomic i32 50331649, i32* [[P:%.*]] unordered, align 4 +; CHECK-NEXT: store atomic i32 2, i32* [[Q:%.*]] unordered, align 4 +; CHECK-NEXT: ret void +; +entry: + store atomic i32 1, i32* %P unordered, align 4 + %P2 = bitcast i32* %P to i8* + store atomic i32 2, i32* %Q unordered, align 4 + store atomic i8 3, i8* %P2 unordered, align 4 + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll @@ -0,0 +1,603 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s +; RUN: opt < %s -aa-pipeline=basic-aa -passes=dse -enable-dse-memoryssa -S | FileCheck %s +target datalayout = 
"E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind +declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture, i8, i64, i32) nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind +declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind +declare void @llvm.init.trampoline(i8*, i8*, i8*) + +define void @test1(i32* %Q, i32* %P) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: ret void +; + %DEAD = load i32, i32* %Q + store i32 %DEAD, i32* %P + store i32 0, i32* %P + ret void +} + +; PR8677 +@g = global i32 1 + +define i32 @test3(i32* %g_addr) nounwind { +; CHECK-LABEL: @test3( +; CHECK-NEXT: [[G_VALUE:%.*]] = load i32, i32* [[G_ADDR:%.*]], align 4 +; CHECK-NEXT: store i32 -1, i32* @g, align 4 +; CHECK-NEXT: store i32 [[G_VALUE]], i32* [[G_ADDR]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* @g, align 4 +; CHECK-NEXT: ret i32 [[TMP3]] +; + %g_value = load i32, i32* %g_addr, align 4 + store i32 -1, i32* @g, align 4 + store i32 %g_value, i32* %g_addr, align 4 + %tmp3 = load i32, i32* @g, align 4 + ret i32 %tmp3 +} + + +define void @test4(i32* %Q) { +; CHECK-LABEL: @test4( +; CHECK-NEXT: [[A:%.*]] = load i32, i32* [[Q:%.*]] +; CHECK-NEXT: store volatile i32 [[A]], i32* [[Q]] +; CHECK-NEXT: ret void +; + %a = load i32, i32* %Q + store volatile i32 %a, i32* %Q + ret void +} + +; va_arg has fuzzy dependence, the store shouldn't be zapped. 
+define double @test10(i8* %X) { +; CHECK-LABEL: @test10( +; CHECK-NEXT: [[X_ADDR:%.*]] = alloca i8* +; CHECK-NEXT: store i8* [[X:%.*]], i8** [[X_ADDR]] +; CHECK-NEXT: [[TMP_0:%.*]] = va_arg i8** [[X_ADDR]], double +; CHECK-NEXT: ret double [[TMP_0]] +; + %X_addr = alloca i8* + store i8* %X, i8** %X_addr + %tmp.0 = va_arg i8** %X_addr, double + ret double %tmp.0 +} + + +declare noalias i8* @malloc(i32) +declare noalias i8* @calloc(i32, i32) + + +; PR8701 + +;; Fully dead overwrite of memcpy. +define void @test15(i8* %P, i8* %Q) nounwind ssp { +; CHECK-LABEL: @test15( +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[Q:%.*]], i64 12, i1 false) +; CHECK-NEXT: ret void +; + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) + ret void +} + +;; Fully dead overwrite of memcpy. +define void @test15_atomic(i8* %P, i8* %Q) nounwind ssp { +; CHECK-LABEL: @test15_atomic( +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) +; CHECK-NEXT: ret void +; + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) + ret void +} + +;; Fully dead overwrite of memcpy. +define void @test15_atomic_weaker(i8* %P, i8* %Q) nounwind ssp { +; CHECK-LABEL: @test15_atomic_weaker( +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) +; CHECK-NEXT: ret void +; + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i1 false) + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) + ret void +} + +;; Fully dead overwrite of memcpy. 
+define void @test15_atomic_weaker_2(i8* %P, i8* %Q) nounwind ssp { +; CHECK-LABEL: @test15_atomic_weaker_2( +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i1 false) +; CHECK-NEXT: ret void +; + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i1 false) + ret void +} + +;; Full overwrite of smaller memcpy. +define void @test16(i8* %P, i8* %Q) nounwind ssp { +; CHECK-LABEL: @test16( +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[Q:%.*]], i64 12, i1 false) +; CHECK-NEXT: ret void +; + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 8, i1 false) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) + ret void +} + +;; Full overwrite of smaller memcpy. +define void @test16_atomic(i8* %P, i8* %Q) nounwind ssp { +; CHECK-LABEL: @test16_atomic( +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) +; CHECK-NEXT: ret void +; + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 8, i32 1) + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) + ret void +} + +;; Full overwrite of smaller memory where overwrite has stronger atomicity +define void @test16_atomic_weaker(i8* %P, i8* %Q) nounwind ssp { +; CHECK-LABEL: @test16_atomic_weaker( +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) +; CHECK-NEXT: ret void +; + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 8, i1 false) + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, 
i32 1) + ret void +} + +;; Full overwrite of smaller memory where overwrite has weaker atomicity. +define void @test16_atomic_weaker_2(i8* %P, i8* %Q) nounwind ssp { +; CHECK-LABEL: @test16_atomic_weaker_2( +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i1 false) +; CHECK-NEXT: ret void +; + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 8, i32 1) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i1 false) + ret void +} + +;; Overwrite of memset by memcpy. +define void @test17(i8* %P, i8* noalias %Q) nounwind ssp { +; CHECK-LABEL: @test17( +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[Q:%.*]], i64 12, i1 false) +; CHECK-NEXT: ret void +; + tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i1 false) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) + ret void +} + +;; Overwrite of memset by memcpy. +define void @test17_atomic(i8* %P, i8* noalias %Q) nounwind ssp { +; CHECK-LABEL: @test17_atomic( +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) +; CHECK-NEXT: ret void +; + tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %P, i8 42, i64 8, i32 1) + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) + ret void +} + +;; Overwrite of memset by memcpy. Overwrite is stronger atomicity. We can +;; remove the memset. 
+define void @test17_atomic_weaker(i8* %P, i8* noalias %Q) nounwind ssp { +; CHECK-LABEL: @test17_atomic_weaker( +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) +; CHECK-NEXT: ret void +; + tail call void @llvm.memset.p0i8.i64(i8* align 1 %P, i8 42, i64 8, i1 false) + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) + ret void +} + +;; Overwrite of memset by memcpy. Overwrite is weaker atomicity. We can remove +;; the memset. +define void @test17_atomic_weaker_2(i8* %P, i8* noalias %Q) nounwind ssp { +; CHECK-LABEL: @test17_atomic_weaker_2( +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i1 false) +; CHECK-NEXT: ret void +; + tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %P, i8 42, i64 8, i32 1) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i1 false) + ret void +} + +; Should not delete the volatile memset. +define void @test17v(i8* %P, i8* %Q) nounwind ssp { +; CHECK-LABEL: @test17v( +; CHECK-NEXT: tail call void @llvm.memset.p0i8.i64(i8* [[P:%.*]], i8 42, i64 8, i1 true) +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P]], i8* [[Q:%.*]], i64 12, i1 false) +; CHECK-NEXT: ret void +; + tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i1 true) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) + ret void +} + +; PR8728 +; Do not delete instruction where possible situation is: +; A = B +; A = A +; +; NB! See PR11763 - currently LLVM allows memcpy's source and destination to be +; equal (but not unequal and overlapping).
+define void @test18(i8* %P, i8* %Q, i8* %R) nounwind ssp { +; CHECK-LABEL: @test18( +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[Q:%.*]], i64 12, i1 false) +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P]], i8* [[R:%.*]], i64 12, i1 false) +; CHECK-NEXT: ret void +; + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i1 false) + ret void +} + +define void @test18_atomic(i8* %P, i8* %Q, i8* %R) nounwind ssp { +; CHECK-LABEL: @test18_atomic( +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P]], i8* align 1 [[R:%.*]], i64 12, i32 1) +; CHECK-NEXT: ret void +; + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %R, i64 12, i32 1) + ret void +} + + +; The store here is not dead because the byval call reads it. 
+declare void @test19f({i32}* byval align 4 %P) + +define void @test19({i32} * nocapture byval align 4 %arg5) nounwind ssp { +; CHECK-LABEL: @test19( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds { i32 }, { i32 }* [[ARG5:%.*]], i32 0, i32 0 +; CHECK-NEXT: store i32 912, i32* [[TMP7]] +; CHECK-NEXT: call void @test19f({ i32 }* byval align 4 [[ARG5]]) +; CHECK-NEXT: ret void +; +bb: + %tmp7 = getelementptr inbounds {i32}, {i32}* %arg5, i32 0, i32 0 + store i32 912, i32* %tmp7 + call void @test19f({i32}* byval align 4 %arg5) + ret void + +} + +; PR13547 +declare noalias i8* @strdup(i8* nocapture) nounwind +define noalias i8* @test23() nounwind uwtable ssp { +; CHECK-LABEL: @test23( +; CHECK-NEXT: [[X:%.*]] = alloca [2 x i8], align 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i8], [2 x i8]* [[X]], i64 0, i64 0 +; CHECK-NEXT: store i8 97, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [2 x i8], [2 x i8]* [[X]], i64 0, i64 1 +; CHECK-NEXT: store i8 0, i8* [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CALL:%.*]] = call i8* @strdup(i8* [[ARRAYIDX]]) #2 +; CHECK-NEXT: ret i8* [[CALL]] +; + %x = alloca [2 x i8], align 1 + %arrayidx = getelementptr inbounds [2 x i8], [2 x i8]* %x, i64 0, i64 0 + store i8 97, i8* %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds [2 x i8], [2 x i8]* %x, i64 0, i64 1 + store i8 0, i8* %arrayidx1, align 1 + %call = call i8* @strdup(i8* %arrayidx) nounwind + ret i8* %call +} + +; Check another case like PR13547 where strdup is not like malloc. 
+define i8* @test25(i8* %p) nounwind { +; CHECK-LABEL: @test25( +; CHECK-NEXT: [[P_4:%.*]] = getelementptr i8, i8* [[P:%.*]], i64 4 +; CHECK-NEXT: [[TMP:%.*]] = load i8, i8* [[P_4]], align 1 +; CHECK-NEXT: store i8 0, i8* [[P_4]], align 1 +; CHECK-NEXT: [[Q:%.*]] = call i8* @strdup(i8* [[P]]) #5 +; CHECK-NEXT: store i8 [[TMP]], i8* [[P_4]], align 1 +; CHECK-NEXT: ret i8* [[Q]] +; + %p.4 = getelementptr i8, i8* %p, i64 4 + %tmp = load i8, i8* %p.4, align 1 + store i8 0, i8* %p.4, align 1 + %q = call i8* @strdup(i8* %p) nounwind optsize + store i8 %tmp, i8* %p.4, align 1 + ret i8* %q +} + +; Don't remove redundant store because of may-aliased store. +define i32 @test28(i1 %c, i32* %p, i32* %p2, i32 %i) { +; CHECK-LABEL: @test28( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[V:%.*]] = load i32, i32* [[P:%.*]], align 4 +; CHECK-NEXT: store i32 [[I:%.*]], i32* [[P2:%.*]], align 4 +; CHECK-NEXT: br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: store i32 [[V]], i32* [[P]], align 4 +; CHECK-NEXT: ret i32 0 +; +entry: + %v = load i32, i32* %p, align 4 + + ; Might overwrite value at %p + store i32 %i, i32* %p2, align 4 + br i1 %c, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: + br label %bb3 +bb3: + store i32 %v, i32* %p, align 4 + ret i32 0 +} + +; Don't remove redundant store because of may-aliased store. 
+define i32 @test29(i1 %c, i32* %p, i32* %p2, i32 %i) { +; CHECK-LABEL: @test29( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[V:%.*]] = load i32, i32* [[P:%.*]], align 4 +; CHECK-NEXT: br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i32 [[I:%.*]], i32* [[P2:%.*]], align 4 +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: store i32 [[V]], i32* [[P]], align 4 +; CHECK-NEXT: ret i32 0 +; +entry: + %v = load i32, i32* %p, align 4 + br i1 %c, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: + ; Might overwrite value at %p + store i32 %i, i32* %p2, align 4 + br label %bb3 +bb3: + store i32 %v, i32* %p, align 4 + ret i32 0 +} + +declare void @unknown_func() + +; Don't remove redundant store in a loop with a may-alias store. +define i32 @test32(i1 %c, i32* %p, i32 %i) { +; CHECK-LABEL: @test32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[V:%.*]] = load i32, i32* [[P:%.*]], align 4 +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 [[V]], i32* [[P]], align 4 +; CHECK-NEXT: call void @unknown_func() +; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: ret i32 0 +; +entry: + %v = load i32, i32* %p, align 4 + br label %bb1 +bb1: + store i32 %v, i32* %p, align 4 + ; Might read and overwrite value at %p + call void @unknown_func() + br i1 undef, label %bb1, label %bb2 +bb2: + ret i32 0 +} + +; TODO +; We can remove redundant store, as noalias %p guarantees that the function does +; only access it via %p. 
This also holds for the call to unknown_func even though +; it could unwind +define void @test34(i32* noalias %p) { +; CHECK-LABEL: @test34( +; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: call void @unknown_func() +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: ret void +; + store i32 1, i32* %p + call void @unknown_func() + store i32 0, i32* %p + ret void +} + +; TODO +; Remove redundant store even with an unwinding function in the same block +define void @test35(i32* noalias %p) { +; CHECK-LABEL: @test35( +; CHECK-NEXT: call void @unknown_func() +; CHECK-NEXT: store i32 1, i32* [[P:%.*]] +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: ret void +; + call void @unknown_func() + store i32 1, i32* %p + store i32 0, i32* %p + ret void +} + +; We cannot optimize away the first memmove since %P could overlap with %Q. +define void @test36(i8* %P, i8* %Q) { +; CHECK-LABEL: @test36( +; CHECK-NEXT: tail call void @llvm.memmove.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[Q:%.*]], i64 12, i1 false) +; CHECK-NEXT: tail call void @llvm.memmove.p0i8.p0i8.i64(i8* [[P]], i8* [[Q]], i64 12, i1 false) +; CHECK-NEXT: ret void +; + + tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) + tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) + ret void +} + +define void @test36_atomic(i8* %P, i8* %Q) { +; CHECK-LABEL: @test36_atomic( +; CHECK-NEXT: tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) +; CHECK-NEXT: tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P]], i8* align 1 [[Q]], i64 12, i32 1) +; CHECK-NEXT: ret void +; + + tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) + tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) + ret void +} + +define void @test37(i8* %P, i8* %Q, i8* %R) 
{ +; CHECK-LABEL: @test37( +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[Q:%.*]], i64 12, i1 false) +; CHECK-NEXT: tail call void @llvm.memmove.p0i8.p0i8.i64(i8* [[P]], i8* [[R:%.*]], i64 12, i1 false) +; CHECK-NEXT: ret void +; + + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false) + tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i1 false) + ret void +} + +define void @test37_atomic(i8* %P, i8* %Q, i8* %R) { +; CHECK-LABEL: @test37_atomic( +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1) +; CHECK-NEXT: tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P]], i8* align 1 [[R:%.*]], i64 12, i32 1) +; CHECK-NEXT: ret void +; + + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1) + tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %R, i64 12, i32 1) + ret void +} + +; Same caveat about memcpy as in @test18 applies here. 
+define void @test38(i8* %P, i8* %Q, i8* %R) { ; memmove into %P, then a same-sized memcpy into %P; the checks keep both calls
+; CHECK-LABEL: @test38(
+; CHECK-NEXT: tail call void @llvm.memmove.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[Q:%.*]], i64 12, i1 false)
+; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P]], i8* [[R:%.*]], i64 12, i1 false)
+; CHECK-NEXT: ret void
+;
+
+  tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false)
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i1 false)
+  ret void
+}
+
+define void @test38_atomic(i8* %P, i8* %Q, i8* %R) { ; element-wise unordered-atomic variant of test38; the checks keep both calls
+; CHECK-LABEL: @test38_atomic(
+; CHECK-NEXT: tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1)
+; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P]], i8* align 1 [[R:%.*]], i64 12, i32 1)
+; CHECK-NEXT: ret void
+;
+
+  tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1)
+  tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %R, i64 12, i32 1)
+  ret void
+}
+
+define void @test39(i8* %P, i8* %Q, i8* %R) { ; 12-byte memcpy into %P, then an 8-byte memcpy into %P; the checks keep both calls
+; CHECK-LABEL: @test39(
+; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[Q:%.*]], i64 12, i1 false)
+; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P]], i8* [[R:%.*]], i64 8, i1 false)
+; CHECK-NEXT: ret void
+;
+
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false)
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 8, i1 false) ; covers only the first 8 of the 12 bytes written above
+  ret void
+}
+
+define void @test39_atomic(i8* %P, i8* %Q, i8* %R) { ; element-wise unordered-atomic variant of test39; the checks keep both calls
+; CHECK-LABEL: @test39_atomic(
+; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1)
+; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P]], i8* align 1 [[R:%.*]], i64 8, i32 1)
+; CHECK-NEXT: ret void
+;
+
+  tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1)
+  tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %R, i64 8, i32 1)
+  ret void
+}
+
+declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1)
+declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32)
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
+define void @test40(i32** noalias %Pp, i32* noalias %Q) { ; the checks list every instruction, i.e. nothing is expected to be removed
+; CHECK-LABEL: @test40(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[AC:%.*]] = bitcast i32* [[A]] to i8*
+; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull [[AC]])
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32** [[PP:%.*]] to i8**
+; CHECK-NEXT: [[PC:%.*]] = load i8*, i8** [[TMP0]], align 8
+; CHECK-NEXT: [[QC:%.*]] = bitcast i32* [[Q:%.*]] to i8*
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 [[AC]], i8* align 4 [[QC]], i64 4, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[PC]], i8* nonnull align 4 [[AC]], i64 4, i1 true)
+; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[AC]])
+; CHECK-NEXT: ret void
+;
+entry:
+  %A = alloca i32, align 4
+  %Ac = bitcast i32* %A to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %Ac)
+  %0 = bitcast i32** %Pp to i8**
+  %Pc = load i8*, i8** %0, align 8
+  %Qc = bitcast i32* %Q to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 %Ac, i8* align 4 %Qc, i64 4, i1 false) ; fills the alloca from %Q
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %Pc, i8* nonnull align 4 %Ac, i64 4, i1 true) ; final i1 true marks this copy out of the alloca as volatile
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %Ac)
+  ret void
+}
+
+; I think this case is currently handled incorrectly by the memdep-based DSE:
+; if @unknown_func throws, store i32 1 must remain; the free only kills store i32 2.
+declare void @free(i8* nocapture)
+define void @test41(i32* noalias %P) { ; NOTE(review): the NOCHECK- lines are presumably skipped by FileCheck; confirm against this file's RUN line
+; NOCHECK-LABEL: @test41(
+; NOCHECK-NEXT: [[P2:%.*]] = bitcast i32* [[P:%.*]] to i8*
+; NOCHECK-NEXT: store i32 1, i32* [[P]]
+; NOCHECK-NEXT: call void @unknown_func()
+; NOCHECK-NEXT: call void @free(i8* [[P2]])
+; NOCHECK-NEXT: ret void
+;
+  %P2 = bitcast i32* %P to i8*
+  store i32 1, i32* %P ; listed in the expectations above, so it is meant to be kept
+  call void @unknown_func()
+  store i32 2, i32* %P ; absent from the expectations above, so it is meant to be removed
+  call void @free(i8* %P2)
+  ret void
+}
+
+define void @test42(i32* %P, i32* %Q) { ; the listed expectations keep all three stores
+; NOCHECK-LABEL: @test42(
+; NOCHECK-NEXT: store i32 1, i32* [[P:%.*]]
+; NOCHECK-NEXT: [[P2:%.*]] = bitcast i32* [[P]] to i8*
+; NOCHECK-NEXT: store i32 2, i32* [[Q:%.*]]
+; NOCHECK-NEXT: store i8 3, i8* [[P2]]
+; NOCHECK-NEXT: ret void
+;
+  store i32 1, i32* %P
+  %P2 = bitcast i32* %P to i8*
+  store i32 2, i32* %Q
+  store i8 3, i8* %P2 ; one-byte store through the i8* view of %P, after the 4-byte store to %P above
+  ret void
+}
+
+define void @test42a(i32* %P, i32* %Q) { ; unordered-atomic variant of test42; the listed expectations keep all three stores
+; NOCHECK-LABEL: @test42a(
+; NOCHECK-NEXT: store atomic i32 1, i32* [[P:%.*]] unordered, align 4
+; NOCHECK-NEXT: [[P2:%.*]] = bitcast i32* [[P]] to i8*
+; NOCHECK-NEXT: store atomic i32 2, i32* [[Q:%.*]] unordered, align 4
+; NOCHECK-NEXT: store atomic i8 3, i8* [[P2]] unordered, align 4
+; NOCHECK-NEXT: ret void
+;
+  store atomic i32 1, i32* %P unordered, align 4
+  %P2 = bitcast i32* %P to i8*
+  store atomic i32 2, i32* %Q unordered, align 4
+  store atomic i8 3, i8* %P2 unordered, align 4
+  ret void
+}
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/tail-byval.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/tail-byval.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/tail-byval.ll
@@ -0,0 +1,23 @@
+; RUN: opt -dse -enable-dse-memoryssa -S < %s | FileCheck %s
+
+; Don't eliminate stores to allocas before tail calls to functions that use
+; byval. It's correct to mark calls like these as 'tail'. To implement this tail
+; call, the backend should copy the bytes from the alloca into the argument area
+; before clearing the stack.
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+declare void @g(i32* byval %p)
+
+define void @f(i32* byval %x) {
+entry:
+  %p = alloca i32
+  %v = load i32, i32* %x
+  store i32 %v, i32* %p ; the store under test: the checks below require it to stay ahead of the byval tail call
+  tail call void @g(i32* byval %p)
+  ret void
+}
+; CHECK-LABEL: define void @f(i32* byval %x)
+; CHECK: store i32 %v, i32* %p
+; CHECK: tail call void @g(i32* byval %p)