Index: include/llvm/IR/IRBuilder.h =================================================================== --- include/llvm/IR/IRBuilder.h +++ include/llvm/IR/IRBuilder.h @@ -415,6 +415,29 @@ MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr); + /// \brief Create and insert an element unordered-atomic memset to the + /// specified pointer and the specified value. + /// If the pointer isn't an i8*, it will be converted. If a TBAA tag is + /// specified, it will be added to the instruction. Likewise with alias.scope + /// and noalias tags. + CallInst *CreateElementUnorderedAtomicMemSet(Value *Ptr, unsigned PtrAlign, + Value *Val, uint64_t Size, + uint32_t ElementSize, + MDNode *TBAATag = nullptr, + MDNode *ScopeTag = nullptr, + MDNode *NoAliasTag = nullptr) { + return CreateElementUnorderedAtomicMemSet(Ptr, PtrAlign, Val, + getInt64(Size), ElementSize, + TBAATag, ScopeTag, NoAliasTag); + } + + CallInst *CreateElementUnorderedAtomicMemSet(Value *Ptr, unsigned PtrAlign, + Value *Val, Value *Size, + uint32_t ElementSize, + MDNode *TBAATag = nullptr, + MDNode *ScopeTag = nullptr, + MDNode *NoAliasTag = nullptr); + /// \brief Create and insert a memcpy between the specified pointers. /// /// If the pointers aren't i8*, they will be converted. 
If a TBAA tag is Index: lib/IR/IRBuilder.cpp =================================================================== --- lib/IR/IRBuilder.cpp +++ lib/IR/IRBuilder.cpp @@ -103,6 +103,37 @@ return CI; } +CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemSet( + Value *Ptr, unsigned PtrAlign, Value *Val, Value *Size, + uint32_t ElementSize, MDNode *TBAATag, MDNode *ScopeTag, + MDNode *NoAliasTag) { + assert(PtrAlign >= ElementSize && + "Specification requires alignment of at least element size"); + Ptr = getCastedInt8PtrValue(Ptr); + Value *Ops[] = {Ptr, Val, Size, getInt32(ElementSize)}; + Type *Tys[] = {Ptr->getType(), Size->getType()}; + Module *M = BB->getParent()->getParent(); + Value *TheFn = Intrinsic::getDeclaration( + M, Intrinsic::memset_element_unordered_atomic, Tys); + + CallInst *CI = createCallHelper(TheFn, Ops, this); + + // Set the alignment of the pointer arg. + CI->addParamAttr(0, Attribute::getWithAlignment(CI->getContext(), PtrAlign)); + + // Set the TBAA info if present. + if (TBAATag) + CI->setMetadata(LLVMContext::MD_tbaa, TBAATag); + + if (ScopeTag) + CI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag); + + if (NoAliasTag) + CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag); + + return CI; +} + CallInst *IRBuilderBase:: CreateMemCpy(Value *Dst, Value *Src, Value *Size, unsigned Align, bool isVolatile, MDNode *TBAATag, MDNode *TBAAStructTag, Index: lib/Transforms/Scalar/LoopIdiomRecognize.cpp =================================================================== --- lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -150,6 +150,7 @@ MemsetPattern, Memcpy, UnorderedAtomicMemcpy, + UnorderedAtomicMemset, DontUse // Dummy retval never to be used. Allows catching errors in retval // handling. }; @@ -432,12 +433,15 @@ // If we're allowed to form a memset, and the stored value would be // acceptable for memset, use it. 
- if (!UnorderedAtomic && HasMemset && SplatValue && + if (HasMemset && SplatValue && // Verify that the stored value is loop invariant. If not, we can't // promote the memset. CurLoop->isLoopInvariant(SplatValue)) { // It looks like we can use SplatValue. - return LegalStoreKind::Memset; + if (UnorderedAtomic) + return LegalStoreKind::UnorderedAtomicMemset; + else + return LegalStoreKind::Memset; } else if (!UnorderedAtomic && HasMemsetPattern && // Don't create memset_pattern16s with address spaces. StorePtr->getType()->getPointerAddressSpace() == 0 && @@ -500,7 +504,8 @@ case LegalStoreKind::None: // Nothing to do break; - case LegalStoreKind::Memset: { + case LegalStoreKind::Memset: + case LegalStoreKind::UnorderedAtomicMemset: { // Find the base pointer. Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL); StoreRefsForMemset[Ptr].push_back(SI); @@ -583,7 +588,8 @@ // all of the pairs of stores that follow each other. SmallVector IndexQueue; for (unsigned i = 0, e = SL.size(); i < e; ++i) { - assert(SL[i]->isSimple() && "Expected only non-volatile stores."); + assert(SL[i]->isUnordered() && + "Expected only non-volatile non-ordered stores."); Value *FirstStoredVal = SL[i]->getValueOperand(); Value *FirstStorePtr = SL[i]->getPointerOperand(); @@ -621,7 +627,8 @@ IndexQueue.push_back(j - 1); for (auto &k : IndexQueue) { - assert(SL[k]->isSimple() && "Expected only non-volatile stores."); + assert(SL[k]->isUnordered() && + "Expected only non-volatile non-ordered stores."); Value *SecondStorePtr = SL[k]->getPointerOperand(); const SCEVAddRecExpr *SecondStoreEv = cast(SE->getSCEV(SecondStorePtr)); @@ -912,8 +919,31 @@ CallInst *NewCall; if (SplatValue) { - NewCall = - Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment); + const auto *SI = dyn_cast(TheStore); + if (!(SI && SI->isAtomic())) + NewCall = + Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment); + else { + // Candidate for llvm.memset.element.unordered.atomic + + 
// The alignment must be at least the element size + if (StoreAlignment < StoreSize) + return false; + + // If the memset is not lowered into explicit stores later, + // then it will be lowered into an element-size specific lib call. + // If the lib call doesn't exist for our store size, then we + // shouldn't generate the memset. + if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize()) + return false; + + // Create the call. + // Note that unordered + // atomic stores are *required* by the spec to have an alignment + // but non-atomic loads/stores may not. + NewCall = Builder.CreateElementUnorderedAtomicMemSet( + BasePtr, StoreAlignment, SplatValue, NumBytes, StoreSize); + } } else { // Everything is emitted in default address space Type *Int8PtrTy = DestInt8PtrTy; Index: test/Transforms/LoopIdiom/X86/unordered-atomic-memcpy.ll =================================================================== --- test/Transforms/LoopIdiom/X86/unordered-atomic-memcpy.ll +++ test/Transforms/LoopIdiom/X86/unordered-atomic-memcpy.ll @@ -407,50 +407,3 @@ for.end: ; preds = %for.body, %entry ret void } - - - -; Make sure that atomic memset doesn't get recognized by mistake -define void @test_nomemset(i8* %Base, i64 %Size) nounwind ssp { -; CHECK-LABEL: @test_nomemset( -; CHECK-NOT: call void @llvm.memset -; CHECK: store -; CHECK: ret void -bb.nph: ; preds = %entry - br label %for.body - -for.body: ; preds = %bb.nph, %for.body - %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ] - %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar - store atomic i8 0, i8* %I.0.014 unordered, align 1 - %indvar.next = add i64 %indvar, 1 - %exitcond = icmp eq i64 %indvar.next, %Size - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - -; Verify that unordered memset_pattern isn't recognized. 
-; This is a replica of test11_pattern from basic.ll -define void @test_nomemset_pattern(i32* nocapture %P) nounwind ssp { -; CHECK-LABEL: @test_nomemset_pattern( -; CHECK-NEXT: entry: -; CHECK-NOT: bitcast -; CHECK-NOT: memset_pattern -; CHECK: store atomic -; CHECK: ret void -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %for.body ] - %arrayidx = getelementptr i32, i32* %P, i64 %indvar - store atomic i32 1, i32* %arrayidx unordered, align 4 - %indvar.next = add i64 %indvar, 1 - %exitcond = icmp eq i64 %indvar.next, 10000 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret void -} Index: test/Transforms/LoopIdiom/X86/unordered-atomic-memset.ll =================================================================== --- /dev/null +++ test/Transforms/LoopIdiom/X86/unordered-atomic-memset.ll @@ -0,0 +1,277 @@ +; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +;; memset.atomic formation (atomic store) +define void @test1(i64 %Size) nounwind ssp { +; CHECK-LABEL: @test1( +; CHECK: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %Dest, i8 0, i64 %Size, i32 1) +; CHECK-NOT: store +; CHECK: ret void +bb.nph: + %Dest = alloca i8, i32 10000 + br label %for.body + +for.body: ; preds = %bb.nph, %for.body + %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ] + %DestI = getelementptr i8, i8* %Dest, i64 %indvar + store atomic i8 0, i8* %DestI unordered, align 1 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, %Size + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;; memset.atomic formation rejection (atomic store w/ bad align) +define void 
@test2(i64 %Size) nounwind ssp { +; CHECK-LABEL: @test2( +; CHECK-NOT: call void @llvm.memset.element.unordered.atomic +; CHECK: store +; CHECK: ret void +bb.nph: + %Dest = alloca i32, i32 10000 + br label %for.body + +for.body: ; preds = %bb.nph, %for.body + %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ] + %DestI = getelementptr i32, i32* %Dest, i64 %indvar + store atomic i32 0, i32* %DestI unordered, align 2 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, %Size + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + + +;; memset.atomic formation rejection (ordered-atomic store) +define void @test3(i64 %Size) nounwind ssp { +; CHECK-LABEL: @test3( +; CHECK-NOT: call void @llvm.memset.element.unordered.atomic +; CHECK: store +; CHECK: ret void +bb.nph: + %Dest = alloca i8, i32 10000 + br label %for.body + +for.body: ; preds = %bb.nph, %for.body + %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ] + %DestI = getelementptr i8, i8* %Dest, i64 %indvar + store atomic i8 0, i8* %DestI monotonic, align 1 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, %Size + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;; memset.atomic formation (atomic store) -- element size 2 +define void @test4(i64 %Size) nounwind ssp { +; CHECK-LABEL: @test4( +; CHECK: [[Sz:%[0-9]+]] = shl i64 %Size, 1 +; CHECK: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 2 %Dest{{[0-9]*}}, i8 0, i64 [[Sz]], i32 2) +; CHECK-NOT: store +; CHECK: ret void +bb.nph: + %Dest = alloca i16, i32 10000 + br label %for.body + +for.body: ; preds = %bb.nph, %for.body + %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ] + %DestI = getelementptr i16, i16* %Dest, i64 %indvar + store atomic i16 0, i16* %DestI unordered, align 2 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, %Size 
+ br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;; memset.atomic formation (atomic store) -- element size 2, non-zero value +define void @test4b(i64 %Size) nounwind ssp { +; CHECK-LABEL: @test4b( +; CHECK: [[Sz:%[0-9]+]] = shl i64 %Size, 1 +; CHECK: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 2 %Dest{{[0-9]*}}, i8 -128, i64 [[Sz]], i32 2) +; CHECK-NOT: store +; CHECK: ret void +bb.nph: + %Dest = alloca i16, i32 10000 + br label %for.body + +for.body: ; preds = %bb.nph, %for.body + %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ] + %DestI = getelementptr i16, i16* %Dest, i64 %indvar + store atomic i16 32896, i16* %DestI unordered, align 2 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, %Size + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;; memset.atomic formation (atomic store) -- element size 4 +define void @test5(i64 %Size) nounwind ssp { +; CHECK-LABEL: @test5( +; CHECK: [[Sz:%[0-9]+]] = shl i64 %Size, 2 +; CHECK: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 %Dest{{[0-9]*}}, i8 0, i64 [[Sz]], i32 4) +; CHECK-NOT: store +; CHECK: ret void +bb.nph: + %Dest = alloca i32, i32 10000 + br label %for.body + +for.body: ; preds = %bb.nph, %for.body + %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ] + %DestI = getelementptr i32, i32* %Dest, i64 %indvar + store atomic i32 0, i32* %DestI unordered, align 4 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, %Size + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;; memset.atomic formation (atomic store) -- element size 4, non-zero value +define void @test5b(i64 %Size) nounwind ssp { +; CHECK-LABEL: @test5b( +; CHECK: [[Sz:%[0-9]+]] = shl i64 %Size, 2 +; CHECK: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 
4 %Dest{{[0-9]*}}, i8 -128, i64 [[Sz]], i32 4) +; CHECK-NOT: store +; CHECK: ret void +bb.nph: + %Dest = alloca i32, i32 10000 + br label %for.body + +for.body: ; preds = %bb.nph, %for.body + %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ] + %DestI = getelementptr i32, i32* %Dest, i64 %indvar + store atomic i32 2155905152, i32* %DestI unordered, align 4 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, %Size + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;; memset.atomic formation (atomic store) -- element size 8 +define void @test6(i64 %Size) nounwind ssp { +; CHECK-LABEL: @test6( +; CHECK: [[Sz:%[0-9]+]] = shl i64 %Size, 3 +; CHECK: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 %Dest{{[0-9]*}}, i8 0, i64 [[Sz]], i32 8) +; CHECK-NOT: store +; CHECK: ret void +bb.nph: + %Dest = alloca i64, i32 10000 + br label %for.body + +for.body: ; preds = %bb.nph, %for.body + %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ] + %DestI = getelementptr i64, i64* %Dest, i64 %indvar + store atomic i64 0, i64* %DestI unordered, align 8 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, %Size + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;; memset.atomic formation (atomic store) -- element size 8, non-zero value +define void @test6b(i64 %Size) nounwind ssp { +; CHECK-LABEL: @test6b( +; CHECK: [[Sz:%[0-9]+]] = shl i64 %Size, 3 +; CHECK: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 %Dest{{[0-9]*}}, i8 -128, i64 [[Sz]], i32 8) +; CHECK-NOT: store +; CHECK: ret void +bb.nph: + %Dest = alloca i64, i32 10000 + br label %for.body + +for.body: ; preds = %bb.nph, %for.body + %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ] + %DestI = getelementptr i64, i64* %Dest, i64 %indvar + store atomic i64 9259542123273814144, i64* %DestI unordered, 
align 8 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, %Size + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;; memset.atomic formation (atomic store) -- element size 16 +define void @test7(i64 %Size) nounwind ssp { +; CHECK-LABEL: @test7( +; CHECK: [[Sz:%[0-9]+]] = shl i64 %Size, 4 +; CHECK: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 16 %Dest{{[0-9]*}}, i8 0, i64 [[Sz]], i32 16) +; CHECK-NOT: store +; CHECK: ret void +bb.nph: + %Dest = alloca i128, i32 10000 + br label %for.body + +for.body: ; preds = %bb.nph, %for.body + %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ] + %DestI = getelementptr i128, i128* %Dest, i64 %indvar + store atomic i128 0, i128* %DestI unordered, align 16 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, %Size + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;; memset.atomic formation rejection (atomic store) -- element size 32 +define void @test8(i64 %Size) nounwind ssp { +; CHECK-LABEL: @test8( +; CHECK-NOT: call void @llvm.memset.element.unordered.atomic +; CHECK: store +; CHECK: ret void +bb.nph: + %Dest = alloca i256, i32 10000 + br label %for.body + +for.body: ; preds = %bb.nph, %for.body + %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ] + %DestI = getelementptr i256, i256* %Dest, i64 %indvar + store atomic i256 0, i256* %DestI unordered, align 32 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, %Size + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; Verify that unordered memset_pattern isn't recognized. 
+; This is a replica of test11_pattern from basic.ll +define void @test_nomemset_pattern(i32* nocapture %P) nounwind ssp { +; CHECK-LABEL: @test_nomemset_pattern( +; CHECK-NEXT: entry: +; CHECK-NOT: bitcast +; CHECK-NOT: memset_pattern +; CHECK: store atomic +; CHECK: ret void +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %for.body ] + %arrayidx = getelementptr i32, i32* %P, i64 %indvar + store atomic i32 1, i32* %arrayidx unordered, align 4 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, 10000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} Index: test/Transforms/LoopIdiom/unordered-atomic-memset-noarch.ll =================================================================== --- /dev/null +++ test/Transforms/LoopIdiom/unordered-atomic-memset-noarch.ll @@ -0,0 +1,28 @@ +; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" + +;; memset.atomic formation (atomic store) -- element size 2 +;; Will not create call due to a max element size of 0 +;; Note: This is a noarch test, so a call to +;; TTI->getAtomicMemIntrinsicMaxElementSize() will return 0, +;; thus preventing the transform. 
+define void @test1(i64 %Size) nounwind ssp { +; CHECK-LABEL: @test1( +; CHECK-NOT: call void @llvm.memset.element.unordered.atomic +; CHECK: store +; CHECK: ret void +bb.nph: + %Dest = alloca i16, i32 10000 + br label %for.body + +for.body: ; preds = %bb.nph, %for.body + %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ] + %DestI = getelementptr i16, i16* %Dest, i64 %indvar + store atomic i16 0, i16* %DestI unordered, align 2 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, %Size + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +}