Index: llvm/include/llvm/InitializePasses.h
===================================================================
--- llvm/include/llvm/InitializePasses.h
+++ llvm/include/llvm/InitializePasses.h
@@ -267,6 +267,7 @@
 void initializeLoopVersioningLegacyPassPass(PassRegistry &);
 void initializeLowerAtomicLegacyPassPass(PassRegistry&);
 void initializeLowerConstantIntrinsicsPass(PassRegistry&);
 void initializeLowerEmuTLSPass(PassRegistry&);
 void initializeLowerExpectIntrinsicPass(PassRegistry&);
+void initializeLowerGCLeafIntrinsicsLegacyPassPass(PassRegistry &);
 void initializeLowerGuardIntrinsicLegacyPassPass(PassRegistry&);
Index: llvm/include/llvm/Transforms/Scalar.h
===================================================================
--- llvm/include/llvm/Transforms/Scalar.h
+++ llvm/include/llvm/Transforms/Scalar.h
@@ -373,6 +373,12 @@
 //
 Pass *createLowerAtomicPass();
 
+//===----------------------------------------------------------------------===//
+//
+// LowerGCLeafIntrinsics - Lower calls to GC-leaf intrinsics into plain IR.
+//
+FunctionPass *createLowerGCLeafIntrinsicsPass();
+
 //===----------------------------------------------------------------------===//
 //
 // LowerGuardIntrinsic - Lower guard intrinsics to normal control flow.
Index: llvm/include/llvm/Transforms/Scalar/LowerGCLeafIntrinsics.h
===================================================================
--- /dev/null
+++ llvm/include/llvm/Transforms/Scalar/LowerGCLeafIntrinsics.h
@@ -0,0 +1,31 @@
+//===- LowerGCLeafIntrinsics.h - Lower GC leaf intrinsic calls -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to inline gc-leaf versions of intrinsics that may also
+// have a non-gc-leaf implementation.
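+//
+// The pass is exposed as "lower-gc-leaf-intrinsics" in the new pass manager
+// and via createLowerGCLeafIntrinsicsPass() in the legacy one.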
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_LOWERGCLEAFINTRINSICS_H
+#define LLVM_TRANSFORMS_SCALAR_LOWERGCLEAFINTRINSICS_H
+
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+class LowerGCLeafIntrinsicsPass
+    : public PassInfoMixin<LowerGCLeafIntrinsicsPass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_LOWERGCLEAFINTRINSICS_H
Index: llvm/lib/Passes/PassBuilder.cpp
===================================================================
--- llvm/lib/Passes/PassBuilder.cpp
+++ llvm/lib/Passes/PassBuilder.cpp
@@ -182,6 +182,7 @@
 #include "llvm/Transforms/Scalar/LoopVersioningLICM.h"
 #include "llvm/Transforms/Scalar/LowerAtomic.h"
 #include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
 #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
+#include "llvm/Transforms/Scalar/LowerGCLeafIntrinsics.h"
 #include "llvm/Transforms/Scalar/LowerGuardIntrinsic.h"
 #include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"
Index: llvm/lib/Passes/PassRegistry.def
===================================================================
--- llvm/lib/Passes/PassRegistry.def
+++ llvm/lib/Passes/PassRegistry.def
@@ -239,6 +239,7 @@
 FUNCTION_PASS("inject-tli-mappings", InjectTLIMappings())
 FUNCTION_PASS("instnamer", InstructionNamerPass())
 FUNCTION_PASS("loweratomic", LowerAtomicPass())
+FUNCTION_PASS("lower-gc-leaf-intrinsics", LowerGCLeafIntrinsicsPass())
 FUNCTION_PASS("lower-expect", LowerExpectIntrinsicPass())
 FUNCTION_PASS("lower-guard-intrinsic", LowerGuardIntrinsicPass())
 FUNCTION_PASS("lower-constant-intrinsics", LowerConstantIntrinsicsPass())
Index: llvm/lib/Transforms/Scalar/CMakeLists.txt
===================================================================
--- llvm/lib/Transforms/Scalar/CMakeLists.txt
+++ llvm/lib/Transforms/Scalar/CMakeLists.txt
@@ -49,6 +49,7 @@
   LoopVersioningLICM.cpp
   LowerAtomic.cpp
   LowerConstantIntrinsics.cpp
   LowerExpectIntrinsic.cpp
+  LowerGCLeafIntrinsics.cpp
   LowerGuardIntrinsic.cpp
   LowerMatrixIntrinsics.cpp
Index: llvm/lib/Transforms/Scalar/LowerGCLeafIntrinsics.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Transforms/Scalar/LowerGCLeafIntrinsics.cpp
@@ -0,0 +1,324 @@
+//===- LowerGCLeafIntrinsics.cpp - Lower GC leaf intrinsic calls ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to inline gc-leaf versions of intrinsics that may also
+// have a non-gc-leaf implementation.
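+//
+// For example (roughly; the exact output shape depends on the target's
+// vector register width), a call such as
+//
+//   call void @llvm.memcpy.element.unordered.atomic.p1i8.p1i8.i32(
+//       i8 addrspace(1)* %dest, i8 addrspace(1)* %src, i32 %len, i32 4)
+//
+// that carries the "gc-leaf-function" attribute is expanded into an explicit
+// vectorized copy loop plus a power-of-two tail, so the copy can no longer be
+// turned into a runtime call that might poll for a safepoint.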
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerGCLeafIntrinsics.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lower-gc-leaf-intrinsics"
+
+static cl::opt<bool>
+    UsePrefetching("lower-gc-leaf-intrinsics-use-prefetching", cl::init(true),
+                   cl::Hidden,
+                   cl::desc("Use software prefetching when lowering intrinsics."));
+
+STATISTIC(NumAtomicMemCpyLowered,
+          "Number of atomic memcpy instructions lowered");
+
+static bool lowerAtomicMemCpy(AtomicMemCpyInst *MemCpy,
+                              const TargetTransformInfo *TTI,
+                              DominatorTree *DT) {
+  Value *Src = MemCpy->getRawSource();
+  PointerType *SrcType = cast<PointerType>(Src->getType());
+
+  Value *Dest = MemCpy->getRawDest();
+  PointerType *DestType = cast<PointerType>(Dest->getType());
+
+  // Pick a vector width suitable for both address spaces.
+  uint64_t ElementSizeInBytes = MemCpy->getElementSizeInBytes();
+  uint64_t MaxVectorSizeInBits =
+      std::min(TTI->getLoadStoreVecRegBitWidth(SrcType->getAddressSpace()),
+               TTI->getLoadStoreVecRegBitWidth(DestType->getAddressSpace()));
+  assert(((MaxVectorSizeInBits & 7) == 0) && "Fractional number of bytes?");
+  uint64_t MaxVectorSizeInBytes = MaxVectorSizeInBits / 8;
+  if (MaxVectorSizeInBytes == 0 ||
+      MaxVectorSizeInBytes % ElementSizeInBytes != 0)
+    return false;
+
+  uint64_t ElementsPerIteration = MaxVectorSizeInBytes / ElementSizeInBytes;
+  assert(ElementsPerIteration != 0 && "Zero vector length is impossible!");
+
+  IRBuilder<> Builder(MemCpy);
+  LLVMContext &C = MemCpy->getContext();
+  auto *LenInBytes = MemCpy->getLength();
+  Type *LenType = LenInBytes->getType();
+  if (!LenType->isIntegerTy(64)) {
+    assert(LenInBytes->getType()->isIntegerTy(32) &&
+           "Only 32 and 64-bit lengths are allowed!");
+    LenInBytes = Builder.CreateZExt(LenInBytes, Type::getInt64Ty(C),
+                                    LenInBytes->getName() + ".wide");
+    LenType = Type::getInt64Ty(C);
+  }
+  auto *Len = Builder.CreateUDiv(LenInBytes,
+                                 ConstantInt::get(LenType, ElementSizeInBytes),
+                                 "elements.len");
+  Value *ElementsPerVectorizedLoopIter =
+      ConstantInt::get(LenType, ElementsPerIteration);
+  Value *LoopCond =
+      Builder.CreateICmpSLT(Len, ElementsPerVectorizedLoopIter, "loop.cond");
+  BasicBlock *BB = Builder.GetInsertBlock();
+  BasicBlock *Preheader = SplitBlock(BB, BB->getTerminator(), DT);
+  Builder.SetInsertPoint(Preheader->getTerminator());
+  Preheader->setName("memcpy.loop.preheader");
+  Value *NumIter =
+      Builder.CreateUDiv(Len, ElementsPerVectorizedLoopIter, "loop.iters");
+
+  // Create a vectorized loop that will copy the majority of the data using
+  // the widest registers available.
+  BasicBlock *Loop = SplitBlock(Preheader, Preheader->getTerminator(), DT);
+  Loop->setName("memcpy.loop");
+
+  BasicBlock *Tail = SplitBlock(Loop, Loop->getTerminator(), DT);
+
+  Builder.SetInsertPoint(Loop);
+  Loop->getTerminator()->eraseFromParent();
+
+  PHINode *Idx = Builder.CreatePHI(Type::getInt64Ty(C), 2, "idx");
+  auto *SrcIdx = Builder.CreatePHI(Src->getType(), 2, "src.idx");
+  auto *DestIdx = Builder.CreatePHI(Dest->getType(), 2, "dst.idx");
+
+  // Prefetch data.
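+  // (Operands of llvm.prefetch: address, rw (0 = read, 1 = write), locality
+  // (0 = no temporal locality ... 3 = extremely local), and cache type
+  // (0 = instruction, 1 = data).)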
+  if (UsePrefetching) {
+    auto *Int32Ty = Type::getInt32Ty(C);
+    auto *ReadPrefetch = ConstantInt::get(Int32Ty, 0);
+    auto *WritePrefetch = ConstantInt::get(Int32Ty, 1);
+    auto *PrefetchLocality = ConstantInt::get(Int32Ty, 0);
+    auto *DataCacheType = ConstantInt::get(Int32Ty, 1);
+
+    Builder.CreateIntrinsic(
+        Intrinsic::prefetch, {SrcIdx->getType()},
+        {SrcIdx, ReadPrefetch, PrefetchLocality, DataCacheType});
+    Builder.CreateIntrinsic(
+        Intrinsic::prefetch, {DestIdx->getType()},
+        {DestIdx, WritePrefetch, PrefetchLocality, DataCacheType});
+  }
+
+  // Copy ElementsPerIteration elements from Src to Dest and return pointers
+  // one past the end of the copied regions.
+  auto GenerateCopy = [&](unsigned ElementsPerIteration, Value *Src,
+                          Value *Dest) {
+    Type *ValueType =
+        VectorType::get(Type::getIntNTy(C, 8 * ElementSizeInBytes),
+                        ElementsPerIteration, false);
+    Type *SrcValuePtrType =
+        PointerType::get(ValueType, SrcType->getAddressSpace());
+    Type *DestValuePtrType =
+        PointerType::get(ValueType, DestType->getAddressSpace());
+    Value *SrcBC = Builder.CreateBitCast(Src, SrcValuePtrType, "src.vec");
+    LoadInst *Val = Builder.CreateLoad(ValueType, SrcBC, "vec");
+    Val->setAlignment(Align(ElementSizeInBytes));
+    Value *DestBC = Builder.CreateBitCast(Dest, DestValuePtrType, "dst.vec");
+    StoreInst *Store = Builder.CreateStore(Val, DestBC);
+    Store->setAlignment(Align(ElementSizeInBytes));
+    auto *Int8Ty = Type::getInt8Ty(C);
+    auto *SrcNext = cast<Instruction>(Builder.CreateGEP(
+        Int8Ty, Src,
+        ConstantInt::get(LenType, ElementsPerIteration * ElementSizeInBytes),
+        "src.next"));
+    auto *DestNext = cast<Instruction>(Builder.CreateGEP(
+        Int8Ty, Dest,
+        ConstantInt::get(LenType, ElementsPerIteration * ElementSizeInBytes),
+        "dest.next"));
+    return std::make_pair(SrcNext, DestNext);
+  };
+
+  auto Next = GenerateCopy(ElementsPerIteration, SrcIdx, DestIdx);
+  Instruction *SrcNext = Next.first;
+  Instruction *DestNext = Next.second;
+
+  Value *IdxNext =
+      Builder.CreateSub(Idx, ConstantInt::get(LenType, 1), "idx.next");
+  Value *MemcpyLoopCond = Builder.CreateICmpNE(
+      IdxNext, ConstantInt::getNullValue(LenType), "memcpy-loop.cond");
+  Builder.CreateCondBr(MemcpyLoopCond, Loop, Tail);
+  SrcIdx->addIncoming(Src, Preheader);
+  SrcIdx->addIncoming(SrcNext, Loop);
+  DestIdx->addIncoming(Dest, Preheader);
+  DestIdx->addIncoming(DestNext, Loop);
+  Idx->addIncoming(NumIter, Preheader);
+  Idx->addIncoming(IdxNext, Loop);
+
+  // Then construct a tail that handles the remainder with progressively
+  // narrower registers.
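+  // Each tail step tests one bit of the element count: after the main loop
+  // the remainder is Len % ElementsPerIteration, so copying a block whenever
+  // (Len & N) != 0, for each power-of-two block size N below the main vector
+  // width, covers exactly the leftover elements.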
+  Builder.SetInsertPoint(BB->getTerminator());
+  BranchInst *NewTerminator = BranchInst::Create(Tail, Preheader, LoopCond);
+  ReplaceInstWithInst(BB->getTerminator(), NewTerminator);
+  if (DT)
+    DT->insertEdge(BB, Tail);
+
+  Builder.SetInsertPoint(Tail->getTerminator());
+  Instruction *Term = Tail->getTerminator();
+
+  SrcIdx = Builder.CreatePHI(SrcIdx->getType(), 2, "src.idx");
+  SrcIdx->addIncoming(Src, BB);
+  SrcIdx->addIncoming(SrcNext, Loop);
+
+  DestIdx = Builder.CreatePHI(DestIdx->getType(), 2, "dest.idx");
+  DestIdx->addIncoming(Dest, BB);
+  DestIdx->addIncoming(DestNext, Loop);
+
+  for (uint64_t VF = MaxVectorSizeInBytes / 2; VF >= ElementSizeInBytes;
+       VF /= 2) {
+    ElementsPerIteration = VF / ElementSizeInBytes;
+
+    Value *TestBit = Builder.CreateAnd(Len, ElementsPerIteration);
+    Value *Cond =
+        Builder.CreateICmpNE(TestBit, ConstantInt::getNullValue(LenType));
+    Term->getParent()->setName(StringRef("need_check_") +
+                               std::to_string(ElementsPerIteration) + "_");
+    Builder.SetInsertPoint(SplitBlockAndInsertIfThen(
+        Cond, Term, false, /*BranchWeights*/ nullptr, DT));
+    Builder.GetInsertPoint()->getParent()->setName(
+        StringRef("tail_") + std::to_string(ElementsPerIteration) + "_");
+
+    auto Next = GenerateCopy(ElementsPerIteration, SrcIdx, DestIdx);
+    Instruction *SrcNext = Next.first;
+    Instruction *DestNext = Next.second;
+    if (VF > ElementSizeInBytes) {
+      Builder.SetInsertPoint(Term);
+
+      auto *NewSrcIdx = Builder.CreatePHI(SrcIdx->getType(), 2, "src.idx");
+      NewSrcIdx->addIncoming(SrcIdx, SrcIdx->getParent());
+      NewSrcIdx->addIncoming(SrcNext, SrcNext->getParent());
+      SrcIdx = NewSrcIdx;
+
+      auto *NewDestIdx = Builder.CreatePHI(DestIdx->getType(), 2, "dest.idx");
+      NewDestIdx->addIncoming(DestIdx, DestIdx->getParent());
+      NewDestIdx->addIncoming(DestNext, DestNext->getParent());
+      DestIdx = NewDestIdx;
+    } else {
+      SrcNext->eraseFromParent();
+      DestNext->eraseFromParent();
+    }
+  }
+
+  ++NumAtomicMemCpyLowered;
+  MemCpy->eraseFromParent();
+  return true;
+}
+
+static bool lowerCall(CallInst *CI, const TargetTransformInfo *TTI,
+                      DominatorTree *DT) {
+  if (auto *MemCpy = dyn_cast<AtomicMemCpyInst>(CI))
+    return lowerAtomicMemCpy(MemCpy, TTI, DT);
+  return false;
+}
+
+static bool lowerGCLeafIntrinsics(Function &F, const TargetTransformInfo *TTI,
+                                  DominatorTree *DT) {
+  // Intrinsic expansion blows up code size, so skip it when the function is
+  // optimized for size.
+  if (F.hasOptSize())
+    return false;
+
+  SmallVector<CallInst *, 8> Candidates;
+  // Collect all GC-leaf calls as potential candidates.
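+  // ("gc-leaf-function" on a call site asserts that the callee cannot trigger
+  // a GC, so the call needs no statepoint and is safe to expand inline.)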
+  for (Instruction &I : instructions(F))
+    if (auto *CI = dyn_cast<CallInst>(&I))
+      if (CI->hasFnAttr("gc-leaf-function"))
+        Candidates.push_back(CI);
+
+  bool Changed = false;
+  for (auto *Candidate : Candidates)
+    Changed |= lowerCall(Candidate, TTI, DT);
+
+#ifndef NDEBUG
+  if (DT)
+    assert(DT->verify(DominatorTree::VerificationLevel::Fast));
+#endif
+  return Changed;
+}
+
+PreservedAnalyses LowerGCLeafIntrinsicsPass::run(Function &F,
+                                                 FunctionAnalysisManager &AM) {
+  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+  auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+  if (!lowerGCLeafIntrinsics(F, &TTI, DT))
+    return PreservedAnalyses::all();
+  PreservedAnalyses PA;
+  PA.preserve<DominatorTreeAnalysis>();
+  return PA;
+}
+
+namespace {
+class LowerGCLeafIntrinsicsLegacyPass : public FunctionPass {
+public:
+  static char ID;
+
+  LowerGCLeafIntrinsicsLegacyPass() : FunctionPass(ID) {
+    initializeLowerGCLeafIntrinsicsLegacyPassPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+
+    const TargetTransformInfo *TTI =
+        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    DominatorTree *DT = nullptr;
+    if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
+      DT = &DTWP->getDomTree();
+    return lowerGCLeafIntrinsics(F, TTI, DT);
+  }
+};
+} // namespace
+
+char LowerGCLeafIntrinsicsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LowerGCLeafIntrinsicsLegacyPass,
+                      "lower-gc-leaf-intrinsics",
+                      "Lower GC leaf intrinsic calls", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(LowerGCLeafIntrinsicsLegacyPass, "lower-gc-leaf-intrinsics",
+                    "Lower GC leaf intrinsic calls", false, false)
+
+FunctionPass *llvm::createLowerGCLeafIntrinsicsPass() {
+  return new LowerGCLeafIntrinsicsLegacyPass();
+}
Index: llvm/lib/Transforms/Scalar/Scalar.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/Scalar.cpp
+++ llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -82,6 +82,7 @@
   initializeLoopIdiomRecognizeLegacyPassPass(Registry);
   initializeLowerAtomicLegacyPassPass(Registry);
   initializeLowerConstantIntrinsicsPass(Registry);
   initializeLowerExpectIntrinsicPass(Registry);
+  initializeLowerGCLeafIntrinsicsLegacyPassPass(Registry);
   initializeLowerGuardIntrinsicLegacyPassPass(Registry);
   initializeLowerMatrixIntrinsicsLegacyPassPass(Registry);
Index: llvm/test/Transforms/LowerGCLeafIntrinsics/memcpy.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LowerGCLeafIntrinsics/memcpy.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -lower-gc-leaf-intrinsics -mtriple=x86_64-unknown-linux-gnu -mattr=+avx < %s | FileCheck %s
+; RUN: opt -S -passes=lower-gc-leaf-intrinsics -mtriple=x86_64-unknown-linux-gnu -mattr=+avx < %s | FileCheck %s
+; RUN: opt -S -domtree -lower-gc-leaf-intrinsics -mtriple=x86_64-unknown-linux-gnu -mattr=+avx < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare void @llvm.memcpy.element.unordered.atomic.p1i8.p1i8.i32(i8 addrspace(1)* nocapture writeonly, i8 addrspace(1)* nocapture readonly, i32, i32) nounwind argmemonly
+
+; GC-leaf memcpy can be lowered into a vector loop.
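+; (With -mattr=+avx the load/store vector register width is 256 bits, so the
+; main loop below copies <8 x i32> per iteration and the tail handles 4/2/1
+; leftover elements.)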
+define void @test_memcpy_gc_leaf(i8 addrspace(1)* align 16 %src, i8 addrspace(1)* align 16 %dest, i32 %len) gc "statepoint-example" {
+; CHECK-LABEL: @test_memcpy_gc_leaf(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LEN_WIDE:%.*]] = zext i32 [[LEN:%.*]] to i64
+; CHECK-NEXT:    [[ELEMENTS_LEN:%.*]] = udiv i64 [[LEN_WIDE]], 4
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i64 [[ELEMENTS_LEN]], 8
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[NEED_CHECK_4_:%.*]], label [[MEMCPY_LOOP_PREHEADER:%.*]]
+; CHECK:       memcpy.loop.preheader:
+; CHECK-NEXT:    [[LOOP_ITERS:%.*]] = udiv i64 [[ELEMENTS_LEN]], 8
+; CHECK-NEXT:    br label [[MEMCPY_LOOP:%.*]]
+; CHECK:       memcpy.loop:
+; CHECK-NEXT:    [[IDX:%.*]] = phi i64 [ [[LOOP_ITERS]], [[MEMCPY_LOOP_PREHEADER]] ], [ [[IDX_NEXT:%.*]], [[MEMCPY_LOOP]] ]
+; CHECK-NEXT:    [[SRC_IDX:%.*]] = phi i8 addrspace(1)* [ [[SRC:%.*]], [[MEMCPY_LOOP_PREHEADER]] ], [ [[SRC_NEXT:%.*]], [[MEMCPY_LOOP]] ]
+; CHECK-NEXT:    [[DST_IDX:%.*]] = phi i8 addrspace(1)* [ [[DEST:%.*]], [[MEMCPY_LOOP_PREHEADER]] ], [ [[DEST_NEXT:%.*]], [[MEMCPY_LOOP]] ]
+; CHECK-NEXT:    call void @llvm.prefetch.p1i8(i8 addrspace(1)* [[SRC_IDX]], i32 0, i32 0, i32 1)
+; CHECK-NEXT:    call void @llvm.prefetch.p1i8(i8 addrspace(1)* [[DST_IDX]], i32 1, i32 0, i32 1)
+; CHECK-NEXT:    [[SRC_VEC:%.*]] = bitcast i8 addrspace(1)* [[SRC_IDX]] to <8 x i32> addrspace(1)*
+; CHECK-NEXT:    [[VEC:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[SRC_VEC]], align 4
+; CHECK-NEXT:    [[DST_VEC:%.*]] = bitcast i8 addrspace(1)* [[DST_IDX]] to <8 x i32> addrspace(1)*
+; CHECK-NEXT:    store <8 x i32> [[VEC]], <8 x i32> addrspace(1)* [[DST_VEC]], align 4
+; CHECK-NEXT:    [[SRC_NEXT]] = getelementptr i8, i8 addrspace(1)* [[SRC_IDX]], i64 32
+; CHECK-NEXT:    [[DEST_NEXT]] = getelementptr i8, i8 addrspace(1)* [[DST_IDX]], i64 32
+; CHECK-NEXT:    [[IDX_NEXT]] = sub i64 [[IDX]], 1
+; CHECK-NEXT:    [[MEMCPY_LOOP_COND:%.*]] = icmp ne i64 [[IDX_NEXT]], 0
+; CHECK-NEXT:    br i1 [[MEMCPY_LOOP_COND]], label [[MEMCPY_LOOP]], label [[NEED_CHECK_4_]]
+; CHECK:       need_check_4_:
+; CHECK-NEXT:    [[SRC_IDX1:%.*]] = phi i8 addrspace(1)* [ [[SRC]], [[ENTRY:%.*]] ], [ [[SRC_NEXT]], [[MEMCPY_LOOP]] ]
+; CHECK-NEXT:    [[DEST_IDX:%.*]] = phi i8 addrspace(1)* [ [[DEST]], [[ENTRY]] ], [ [[DEST_NEXT]], [[MEMCPY_LOOP]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = and i64 [[ELEMENTS_LEN]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[TAIL_4_:%.*]], label [[NEED_CHECK_2_:%.*]]
+; CHECK:       tail_4_:
+; CHECK-NEXT:    [[SRC_VEC2:%.*]] = bitcast i8 addrspace(1)* [[SRC_IDX1]] to <4 x i32> addrspace(1)*
+; CHECK-NEXT:    [[VEC3:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[SRC_VEC2]], align 4
+; CHECK-NEXT:    [[DST_VEC4:%.*]] = bitcast i8 addrspace(1)* [[DEST_IDX]] to <4 x i32> addrspace(1)*
+; CHECK-NEXT:    store <4 x i32> [[VEC3]], <4 x i32> addrspace(1)* [[DST_VEC4]], align 4
+; CHECK-NEXT:    [[SRC_NEXT5:%.*]] = getelementptr i8, i8 addrspace(1)* [[SRC_IDX1]], i64 16
+; CHECK-NEXT:    [[DEST_NEXT6:%.*]] = getelementptr i8, i8 addrspace(1)* [[DEST_IDX]], i64 16
+; CHECK-NEXT:    br label [[NEED_CHECK_2_]]
+; CHECK:       need_check_2_:
+; CHECK-NEXT:    [[SRC_IDX7:%.*]] = phi i8 addrspace(1)* [ [[SRC_IDX1]], [[NEED_CHECK_4_]] ], [ [[SRC_NEXT5]], [[TAIL_4_]] ]
+; CHECK-NEXT:    [[DEST_IDX8:%.*]] = phi i8 addrspace(1)* [ [[DEST_IDX]], [[NEED_CHECK_4_]] ], [ [[DEST_NEXT6]], [[TAIL_4_]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = and i64 [[ELEMENTS_LEN]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[TAIL_2_:%.*]], label [[NEED_CHECK_1_:%.*]]
+; CHECK:       tail_2_:
+; CHECK-NEXT:    [[SRC_VEC9:%.*]] = bitcast i8 addrspace(1)* [[SRC_IDX7]] to <2 x i32> addrspace(1)*
+; CHECK-NEXT:    [[VEC10:%.*]] = load <2 x i32>, <2 x i32> addrspace(1)* [[SRC_VEC9]], align 4
+; CHECK-NEXT:    [[DST_VEC11:%.*]] = bitcast i8 addrspace(1)* [[DEST_IDX8]] to <2 x i32> addrspace(1)*
+; CHECK-NEXT:    store <2 x i32> [[VEC10]], <2 x i32> addrspace(1)* [[DST_VEC11]], align 4
+; CHECK-NEXT:    [[SRC_NEXT12:%.*]] = getelementptr i8, i8 addrspace(1)* [[SRC_IDX7]], i64 8
+; CHECK-NEXT:    [[DEST_NEXT13:%.*]] = getelementptr i8, i8 addrspace(1)* [[DEST_IDX8]], i64 8
+; CHECK-NEXT:    br label [[NEED_CHECK_1_]]
+; CHECK:       need_check_1_:
+; CHECK-NEXT:    [[SRC_IDX14:%.*]] = phi i8 addrspace(1)* [ [[SRC_IDX7]], [[NEED_CHECK_2_]] ], [ [[SRC_NEXT12]], [[TAIL_2_]] ]
+; CHECK-NEXT:    [[DEST_IDX15:%.*]] = phi i8 addrspace(1)* [ [[DEST_IDX8]], [[NEED_CHECK_2_]] ], [ [[DEST_NEXT13]], [[TAIL_2_]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = and i64 [[ELEMENTS_LEN]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label [[TAIL_1_:%.*]], label [[TMP6:%.*]]
+; CHECK:       tail_1_:
+; CHECK-NEXT:    [[SRC_VEC16:%.*]] = bitcast i8 addrspace(1)* [[SRC_IDX14]] to <1 x i32> addrspace(1)*
+; CHECK-NEXT:    [[VEC17:%.*]] = load <1 x i32>, <1 x i32> addrspace(1)* [[SRC_VEC16]], align 4
+; CHECK-NEXT:    [[DST_VEC18:%.*]] = bitcast i8 addrspace(1)* [[DEST_IDX15]] to <1 x i32> addrspace(1)*
+; CHECK-NEXT:    store <1 x i32> [[VEC17]], <1 x i32> addrspace(1)* [[DST_VEC18]], align 4
+; CHECK-NEXT:    br label [[TMP6]]
+; CHECK:       6:
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @llvm.memcpy.element.unordered.atomic.p1i8.p1i8.i32(i8 addrspace(1)* align 16 %dest, i8 addrspace(1)* align 16 %src, i32 %len, i32 4) #0
+  ret void
+}
+
+; This may trigger GC, so we should not lower it.
+define void @test_memcpy_gc(i8 addrspace(1)* align 16 %src, i8 addrspace(1)* align 16 %dest, i32 %len) gc "statepoint-example" {
+; CHECK-LABEL: @test_memcpy_gc(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p1i8.p1i8.i32(i8 addrspace(1)* align 16 [[DEST:%.*]], i8 addrspace(1)* align 16 [[SRC:%.*]], i32 [[LEN:%.*]], i32 4) [ "deopt"() ]
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @llvm.memcpy.element.unordered.atomic.p1i8.p1i8.i32(i8 addrspace(1)* align 16 %dest, i8 addrspace(1)* align 16 %src, i32 %len, i32 4) [ "deopt"() ]
+  ret void
+}
+
+attributes #0 = { "gc-leaf-function" }