Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -23,6 +23,7 @@
 #define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
 
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/DataTypes.h"
 
@@ -35,6 +36,20 @@
 class User;
 class Value;
 
+/// \brief Information about a load/store intrinsic defined by the target.
+struct MemIntrinsicInfo {
+  MemIntrinsicInfo()
+      : ReadMem(false), WriteMem(false), Vol(false), MatchingId(0),
+        NumMemRefs(0), PtrVal(nullptr) {}
+  bool ReadMem;
+  bool WriteMem;
+  bool Vol;
+  // The target sets the same Id for corresponding load/store intrinsics.
+  unsigned short MatchingId;
+  int NumMemRefs;
+  Value *PtrVal;
+};
+
 /// TargetTransformInfo - This pass provides access to the codegen
 /// interfaces that are needed for IR-level transformations.
 class TargetTransformInfo {
@@ -438,6 +453,20 @@
   /// any callee-saved registers, so would require a spill and fill.
   virtual unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const;
 
+  /// \returns True if the intrinsic is a supported memory intrinsic.  If so,
+  /// Info is filled in with additional information: whether the intrinsic may
+  /// read from or write to memory, whether it is volatile, and the pointer it
+  /// accesses.  Info is undefined if false is returned.
+  virtual bool getTgtMemIntrinsic(IntrinsicInst *Inst,
+                                  MemIntrinsicInfo &Info) const;
+
+  /// \returns A value which is the result of the given memory intrinsic.  New
+  /// instructions may be created to extract the result from the intrinsic
+  /// memory operation.  Returns nullptr if the target cannot create a result
+  /// from the given intrinsic.
+  virtual Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+                                                   Type *ExpectedType) const;
+
   /// @}
 
   /// Analysis group identification.
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -247,6 +247,16 @@
   return PrevTTI->getCostOfKeepingLiveOverCall(Tys);
 }
 
+Value *TargetTransformInfo::getOrCreateResultFromMemIntrinsic(
+    IntrinsicInst *Inst, Type *ExpectedType) const {
+  return PrevTTI->getOrCreateResultFromMemIntrinsic(Inst, ExpectedType);
+}
+
+bool TargetTransformInfo::getTgtMemIntrinsic(IntrinsicInst *Inst,
+                                             MemIntrinsicInfo &Info) const {
+  return PrevTTI->getTgtMemIntrinsic(Inst, Info);
+}
+
 namespace {
 
 struct NoTTI final : ImmutablePass, TargetTransformInfo {
@@ -644,6 +654,15 @@
     return 0;
   }
 
+  bool getTgtMemIntrinsic(IntrinsicInst *Inst,
+                          MemIntrinsicInfo &Info) const override {
+    return false;
+  }
+
+  Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+                                           Type *ExpectedType) const override {
+    return nullptr;
+  }
 };
 
 } // end anonymous namespace
Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -44,6 +44,12 @@
   /// are set if the result needs to be inserted and/or extracted from vectors.
   unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
 
+  enum MemIntrinsicType {
+    VECTOR_LDST_TWO_ELEMENTS,
+    VECTOR_LDST_THREE_ELEMENTS,
+    VECTOR_LDST_FOUR_ELEMENTS
+  };
+
 public:
   AArch64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
     llvm_unreachable("This pass cannot be directly constructed");
   }
@@ -131,6 +137,11 @@
   void getUnrollingPreferences(const Function *F, Loop *L,
                                UnrollingPreferences &UP) const override;
 
+  Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+                                           Type *ExpectedType) const override;
+
+  bool getTgtMemIntrinsic(IntrinsicInst *Inst,
+                          MemIntrinsicInfo &Info) const override;
   /// @}
 };
 
@@ -554,3 +565,83 @@
   // Disable partial & runtime unrolling on -Os.
   UP.PartialOptSizeThreshold = 0;
 }
+
+Value *AArch64TTI::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+                                                     Type *ExpectedType) const {
+  switch (Inst->getIntrinsicID()) {
+  default:
+    return nullptr;
+  case Intrinsic::aarch64_neon_st2:
+  case Intrinsic::aarch64_neon_st3:
+  case Intrinsic::aarch64_neon_st4: {
+    // Build a struct from the store's operands if it matches the expected type.
+    StructType *ST = dyn_cast<StructType>(ExpectedType);
+    if (!ST)
+      return nullptr;
+    unsigned NumElts = Inst->getNumArgOperands() - 1;
+    if (ST->getNumElements() != NumElts)
+      return nullptr;
+    for (unsigned i = 0, e = NumElts; i != e; ++i) {
+      if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
+        return nullptr;
+    }
+    Value *Res = UndefValue::get(ExpectedType);
+    IRBuilder<> Builder(Inst);
+    for (unsigned i = 0, e = NumElts; i != e; ++i) {
+      Value *L = Inst->getArgOperand(i);
+      Res = Builder.CreateInsertValue(Res, L, i);
+    }
+    return Res;
+  }
+  case Intrinsic::aarch64_neon_ld2:
+  case Intrinsic::aarch64_neon_ld3:
+  case Intrinsic::aarch64_neon_ld4:
+    if (Inst->getType() == ExpectedType)
+      return Inst;
+    return nullptr;
+  }
+}
+
+bool AArch64TTI::getTgtMemIntrinsic(IntrinsicInst *Inst,
+                                    MemIntrinsicInfo &Info) const {
+  switch (Inst->getIntrinsicID()) {
+  default:
+    break;
+  case Intrinsic::aarch64_neon_ld2:
+  case Intrinsic::aarch64_neon_ld3:
+  case Intrinsic::aarch64_neon_ld4:
+    Info.ReadMem = true;
+    Info.WriteMem = false;
+    Info.Vol = false;
+    Info.NumMemRefs = 1;
+    Info.PtrVal = Inst->getArgOperand(0);
+    break;
+  case Intrinsic::aarch64_neon_st2:
+  case Intrinsic::aarch64_neon_st3:
+  case Intrinsic::aarch64_neon_st4:
+    Info.ReadMem = false;
+    Info.WriteMem = true;
+    Info.Vol = false;
+    Info.NumMemRefs = 1;
+    Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
+    break;
+  }
+
+  switch (Inst->getIntrinsicID()) {
+  default:
+    return false;
+  case Intrinsic::aarch64_neon_ld2:
+  case Intrinsic::aarch64_neon_st2:
+    Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
+    break;
+  case Intrinsic::aarch64_neon_ld3:
+  case Intrinsic::aarch64_neon_st3:
+    Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
+    break;
+  case Intrinsic::aarch64_neon_ld4:
+  case Intrinsic::aarch64_neon_st4:
+    Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
+    break;
+  }
+  return true;
+}
Index: lib/Transforms/Scalar/EarlyCSE.cpp
===================================================================
--- lib/Transforms/Scalar/EarlyCSE.cpp
+++ lib/Transforms/Scalar/EarlyCSE.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
@@ -273,6 +274,7 @@
 public:
   const DataLayout *DL;
   const TargetLibraryInfo *TLI;
+  const TargetTransformInfo *TTI;
   DominatorTree *DT;
   AssumptionCache *AC;
   typedef RecyclingAllocator<
@@ -383,14 +385,83 @@
     bool Processed;
   };
 
+  /// \brief Wrapper class to handle memory instructions, including regular
+  /// loads and stores as well as load/store intrinsics defined by the target.
+  class ParseMemoryInst {
+  public:
+    ParseMemoryInst(Instruction *Inst, const TargetTransformInfo *TTI)
+        : Load(false), Store(false), Vol(false), MayReadFromMemory(false),
+          MayWriteToMemory(false), MatchingId(-1), Ptr(nullptr) {
+      MayReadFromMemory = Inst->mayReadFromMemory();
+      MayWriteToMemory = Inst->mayWriteToMemory();
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+        MemIntrinsicInfo Info;
+        if (!TTI->getTgtMemIntrinsic(II, Info))
+          return;
+        if (Info.NumMemRefs == 1) {
+          Store = Info.WriteMem;
+          Load = Info.ReadMem;
+          MatchingId = Info.MatchingId;
+          MayReadFromMemory = Info.ReadMem;
+          MayWriteToMemory = Info.WriteMem;
+          Vol = Info.Vol;
+          Ptr = Info.PtrVal;
+        }
+      } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+        Load = true;
+        Vol = !LI->isSimple();
+        Ptr = LI->getPointerOperand();
+      } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+        Store = true;
+        Vol = !SI->isSimple();
+        Ptr = SI->getPointerOperand();
+      }
+    }
+    bool isLoad() { return Load; }
+    bool isStore() { return Store; }
+    bool isVolatile() { return Vol; }
+    bool isMatchingMemLoc(const ParseMemoryInst &Inst) {
+      return Ptr == Inst.Ptr && MatchingId == Inst.MatchingId;
+    }
+    bool isValid() { return Ptr != nullptr; }
+    int getMatchingId() { return MatchingId; }
+    Value *getPtr() { return Ptr; }
+    bool mayReadFromMemory() { return MayReadFromMemory; }
+    bool mayWriteToMemory() { return MayWriteToMemory; }
+
+  private:
+    bool Load;
+    bool Store;
+    bool Vol;
+    bool MayReadFromMemory;
+    bool MayWriteToMemory;
+    // For regular (non-intrinsic) loads/stores, this is set to -1.  For
+    // intrinsic loads/stores, the id is retrieved from the corresponding
+    // field in the MemIntrinsicInfo structure.  That field contains
+    // non-negative values only.
+    int MatchingId;
+    Value *Ptr;
+  };
+
   bool processNode(DomTreeNode *Node);
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AssumptionCacheTracker>();
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<TargetTransformInfo>();
     AU.setPreservesCFG();
   }
+
+  Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
+    if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+      return LI;
+    else if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+      return SI->getValueOperand();
+    assert(isa<IntrinsicInst>(Inst) && "Instruction not supported");
+    return TTI->getOrCreateResultFromMemIntrinsic(cast<IntrinsicInst>(Inst),
+                                                  ExpectedType);
+  }
 };
 }
 
@@ -420,7 +491,7 @@
   /// as long as there in no instruction that reads memory.  If we see a store
   /// to the same location, we delete the dead store.  This zaps trivial dead
   /// stores which can occur in bitfield code among other things.
-  StoreInst *LastStore = nullptr;
+  Instruction *LastStore = nullptr;
 
   bool Changed = false;
 
@@ -475,10 +546,11 @@
       continue;
     }
 
+    ParseMemoryInst MemInst(Inst, TTI);
     // If this is a non-volatile load, process it.
-    if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+    if (MemInst.isValid() && MemInst.isLoad()) {
       // Ignore volatile loads.
-      if (!LI->isSimple()) {
+      if (MemInst.isVolatile()) {
         LastStore = nullptr;
         continue;
       }
 
@@ -486,27 +558,35 @@
       // If we have an available version of this load, and if it is the right
      // generation, replace this instruction.
       std::pair<Value *, unsigned> InVal =
-          AvailableLoads->lookup(Inst->getOperand(0));
+          AvailableLoads->lookup(MemInst.getPtr());
       if (InVal.first != nullptr && InVal.second == CurrentGeneration) {
-        DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
-                     << " to: " << *InVal.first << '\n');
-        if (!Inst->use_empty())
-          Inst->replaceAllUsesWith(InVal.first);
-        Inst->eraseFromParent();
-        Changed = true;
-        ++NumCSELoad;
-        continue;
+        Value *Op = getOrCreateResult(InVal.first, Inst->getType());
+        if (Op != nullptr) {
+          DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
+                       << " to: " << *InVal.first << '\n');
+          if (!Inst->use_empty())
+            Inst->replaceAllUsesWith(Op);
+          Inst->eraseFromParent();
+          Changed = true;
+          ++NumCSELoad;
+          continue;
+        }
       }
 
       // Otherwise, remember that we have this instruction.
-      AvailableLoads->insert(Inst->getOperand(0), std::pair<Value *, unsigned>(
-                                                      Inst, CurrentGeneration));
+      AvailableLoads->insert(MemInst.getPtr(), std::pair<Value *, unsigned>(
+                                                   Inst, CurrentGeneration));
       LastStore = nullptr;
       continue;
     }
 
     // If this instruction may read from memory, forget LastStore.
-    if (Inst->mayReadFromMemory())
+    // Load/store intrinsics will indicate both a read and a write to
+    // memory.  The target may override this (e.g. so that a store intrinsic
+    // does not read from memory, and thus will be treated the same as a
+    // regular store for commoning purposes).
+    if (Inst->mayReadFromMemory() &&
+        !(MemInst.isValid() && !MemInst.mayReadFromMemory()))
       LastStore = nullptr;
 
     // If this is a read-only call, process it.
@@ -537,17 +617,19 @@
     if (Inst->mayWriteToMemory()) {
       ++CurrentGeneration;
 
-      if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+      if (MemInst.isValid() && MemInst.isStore()) {
         // We do a trivial form of DSE if there are two stores to the same
         // location with no intervening loads.  Delete the earlier store.
-        if (LastStore &&
-            LastStore->getPointerOperand() == SI->getPointerOperand()) {
-          DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
-                       << " due to: " << *Inst << '\n');
-          LastStore->eraseFromParent();
-          Changed = true;
-          ++NumDSE;
-          LastStore = nullptr;
+        if (LastStore) {
+          ParseMemoryInst LastStoreMemInst(LastStore, TTI);
+          if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
+            DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
+                         << " due to: " << *Inst << '\n');
+            LastStore->eraseFromParent();
+            Changed = true;
+            ++NumDSE;
+            LastStore = nullptr;
+          }
           // fallthrough - we can exploit information about this store
         }
 
@@ -556,13 +638,12 @@
         // version of the pointer.  It is safe to forward from volatile stores
        // to non-volatile loads, so we don't have to check for volatility of
        // the store.
-        AvailableLoads->insert(SI->getPointerOperand(),
-                               std::pair<Value *, unsigned>(
-                                   SI->getValueOperand(), CurrentGeneration));
+        AvailableLoads->insert(MemInst.getPtr(), std::pair<Value *, unsigned>(
+                                                     Inst, CurrentGeneration));
 
         // Remember that this was the last store we saw for DSE.
-        if (SI->isSimple())
-          LastStore = SI;
+        if (!MemInst.isVolatile())
+          LastStore = Inst;
       }
     }
   }
@@ -584,6 +665,7 @@
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
   TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  TTI = &getAnalysis<TargetTransformInfo>();
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
Index: test/Transforms/EarlyCSE/AArch64/intrinsics.ll
===================================================================
--- /dev/null
+++ test/Transforms/EarlyCSE/AArch64/intrinsics.ll
@@ -0,0 +1,231 @@
+; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -early-cse | FileCheck %s
+
+define <4 x i32> @test_cse(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
+entry:
+; Check that @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
+; CHECK-LABEL: @test_cse
+; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
+  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
+  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = bitcast i32* %a to i8*
+  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
+  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
+  %3 = bitcast <16 x i8> %1 to <4 x i32>
+  %4 = bitcast <16 x i8> %2 to <4 x i32>
+  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
+  %5 = bitcast i32* %a to i8*
+  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
+  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
+  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret <4 x i32> %res.0
+}
+
+define <4 x i32> @test_cse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
+entry:
+; Check that the first @llvm.aarch64.neon.st2 is optimized away by Early CSE.
+; CHECK-LABEL: @test_cse2
+; CHECK-NOT: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
+; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
+  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
+  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = bitcast i32* %a to i8*
+  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
+  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
+  %3 = bitcast <16 x i8> %1 to <4 x i32>
+  %4 = bitcast <16 x i8> %2 to <4 x i32>
+  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
+  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
+  %5 = bitcast i32* %a to i8*
+  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
+  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
+  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret <4 x i32> %res.0
+}
+
+define <4 x i32> @test_cse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) #0 {
+entry:
+; Check that the first @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
+; CHECK-LABEL: @test_cse3
+; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
+; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
+  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
+  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = bitcast i32* %a to i8*
+  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %0)
+  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
+  %1 = bitcast i32* %a to i8*
+  %vld22 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %1)
+  %vld22.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 0
+  %vld22.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 1
+  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld22.fca.0.extract)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret <4 x i32> %res.0
+}
+
+
+define <4 x i32> @test_nocse(i32* %a, i32* %b, [2 x <4 x i32>] %s.coerce, i32 %n) {
+entry:
+; Check that the store prevents @llvm.aarch64.neon.ld2 from being optimized
+; away by Early CSE.
+; CHECK-LABEL: @test_nocse
+; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
+  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
+  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = bitcast i32* %a to i8*
+  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
+  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
+  %3 = bitcast <16 x i8> %1 to <4 x i32>
+  %4 = bitcast <16 x i8> %2 to <4 x i32>
+  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
+  store i32 0, i32* %b, align 4
+  %5 = bitcast i32* %a to i8*
+  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
+  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
+  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret <4 x i32> %res.0
+}
+
+define <4 x i32> @test_nocse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
+entry:
+; Check that @llvm.aarch64.neon.ld3 is not optimized away by Early CSE due
+; to mismatch between st2 and ld3.
+; CHECK-LABEL: @test_nocse2
+; CHECK: call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8
+  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
+  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = bitcast i32* %a to i8*
+  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
+  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
+  %3 = bitcast <16 x i8> %1 to <4 x i32>
+  %4 = bitcast <16 x i8> %2 to <4 x i32>
+  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
+  %5 = bitcast i32* %a to i8*
+  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
+  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
+  %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
+  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.2.extract)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret <4 x i32> %res.0
+}
+
+define <4 x i32> @test_nocse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
+entry:
+; Check that @llvm.aarch64.neon.st3 is not optimized away by Early CSE due to
+; mismatch between st2 and st3.
+; CHECK-LABEL: @test_nocse3
+; CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8
+; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8
+  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
+  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = bitcast i32* %a to i8*
+  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
+  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
+  %3 = bitcast <16 x i8> %1 to <4 x i32>
+  %4 = bitcast <16 x i8> %2 to <4 x i32>
+  call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> %4, <4 x i32> %3, <4 x i32> %3, i8* %0)
+  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
+  %5 = bitcast i32* %a to i8*
+  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
+  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
+  %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
+  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.0.extract)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret <4 x i32> %res.0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32>, <4 x i32>, i8* nocapture)
+
+; Function Attrs: nounwind
+declare void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32>, <4 x i32>, <4 x i32>, i8* nocapture)
+
+; Function Attrs: nounwind readonly
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8*)
+
+; Function Attrs: nounwind readonly
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8*)
+
+define internal fastcc <4 x i32> @vaddq_s32(<4 x i32> %__p0, <4 x i32> %__p1) {
+entry:
+  %add = add <4 x i32> %__p0, %__p1
+  ret <4 x i32> %add
+}
Index: test/Transforms/EarlyCSE/AArch64/lit.local.cfg
===================================================================
--- /dev/null
+++ test/Transforms/EarlyCSE/AArch64/lit.local.cfg
@@ -0,0 +1,5 @@
+config.suffixes = ['.ll']
+
+targets = set(config.root.targets_to_build.split())
+if not 'AArch64' in targets:
+    config.unsupported = True
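
Note for reviewers (illustrative, not part of the patch): the sketch below shows how a target other than AArch64 might wire up the two new hooks for a single pair of matching load/store intrinsics. The class MyTTI, the intrinsics Intrinsic::my_vload / Intrinsic::my_vstore, and the SIMPLE_VLDST id are hypothetical placeholders; only the MemIntrinsicInfo fields and the hook signatures are taken from the patch above.

// Hypothetical target implementation of the new TTI hooks (sketch only).
// MyTTI is assumed to be a target TTI pass analogous to AArch64TTI above.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

// Placeholder id shared by the corresponding load and store intrinsics.
enum HypotheticalMemIntrinsicType { SIMPLE_VLDST };

bool MyTTI::getTgtMemIntrinsic(IntrinsicInst *Inst,
                               MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  default:
    return false;
  case Intrinsic::my_vload:   // hypothetical: %v = @llvm.my.vload(i8* %p)
    Info.ReadMem = true;
    Info.WriteMem = false;
    Info.Vol = false;
    Info.NumMemRefs = 1;
    Info.PtrVal = Inst->getArgOperand(0);
    Info.MatchingId = SIMPLE_VLDST;
    return true;
  case Intrinsic::my_vstore:  // hypothetical: @llvm.my.vstore(%v, i8* %p)
    Info.ReadMem = false;
    Info.WriteMem = true;
    Info.Vol = false;
    Info.NumMemRefs = 1;
    Info.PtrVal = Inst->getArgOperand(1);
    Info.MatchingId = SIMPLE_VLDST;
    return true;
  }
}

Value *MyTTI::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
                                                Type *ExpectedType) const {
  switch (Inst->getIntrinsicID()) {
  default:
    return nullptr;
  case Intrinsic::my_vload:
    // The load intrinsic already produces the loaded value; EarlyCSE can
    // reuse it directly if the type matches.
    return Inst->getType() == ExpectedType ? Inst : nullptr;
  case Intrinsic::my_vstore: {
    // Forward the stored operand to a later matching load from the pointer.
    Value *StoredVal = Inst->getArgOperand(0);
    return StoredVal->getType() == ExpectedType ? StoredVal : nullptr;
  }
  }
}

With hooks like these, EarlyCSE's ParseMemoryInst treats the pair like an ordinary store followed by a load of the same pointer: the stored value is forwarded through getOrCreateResultFromMemIntrinsic, and matching pointer plus MatchingId values also enable the trivial dead-store elimination of back-to-back stores.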