Index: llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp +++ llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp @@ -11,6 +11,7 @@ /// Armv6 introduced instructions to perform 32-bit SIMD operations. The /// purpose of this pass is do some IR pattern matching to create ACLE /// DSP intrinsics, which map on these 32-bit SIMD operations. +/// This pass runs only when unaligned accesses are supported/enabled. // //===----------------------------------------------------------------------===// @@ -64,7 +65,16 @@ MemInstList VecLd; // List of all load instructions of this Mul MemLocList MemLocs; // All memory locations read by this Mul - ParallelMAC(Instruction *I, ValueList &V) : Mul(I), VL(V) {}; + // The MAC-chains we currently recognise are simple chains that accumulate + // their results with a reducing integer add statement, and consist of + // a chain of adds and muls, which have only sext and load instructions as + // operands. Thus, these chains don't write memory. We check that this is + // true when we collect the operands, and use this in alias analysis checks + // that different parallel MACs don't interfere with each other. + bool ReadOnly; + + ParallelMAC(Instruction *I, ValueList &V, bool RdOnly) + : Mul(I), VL(V), ReadOnly(RdOnly) {}; }; struct Reduction { @@ -73,6 +83,8 @@ Instruction *AccIntAdd; // The accumulating integer add statement, // i.e, the reduction statement. + ParallelMACList MACCandidates; // The MAC candidates associated with + // this reduction statement. Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { }; }; @@ -380,8 +392,10 @@ const BasicBlock *Latch = TheLoop->getLoopLatch(); // We need a preheader as getIncomingValueForBlock assumes there is one. 
- if (!TheLoop->getLoopPreheader()) + if (!TheLoop->getLoopPreheader()) { + LLVM_DEBUG(dbgs() << "No preheader found, bailing out\n"); return Reductions; + } for (PHINode &Phi : Header->phis()) { const auto *Ty = Phi.getType(); @@ -412,7 +426,7 @@ return Reductions; } -static void AddCandidateMAC(ParallelMACList &Candidates, const Instruction *Acc, +static void AddMACCandidate(ParallelMACList &Candidates, const Instruction *Acc, Value *MulOp0, Value *MulOp1, int MulOpNum) { Instruction *Mul = dyn_cast(Acc->getOperand(MulOpNum)); LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump()); @@ -420,7 +434,15 @@ if (IsNarrowSequence<16>(MulOp0, VL) && IsNarrowSequence<16>(MulOp1, VL)) { LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump()); - Candidates.push_back(ParallelMAC(Mul, VL)); + + bool MayWriteMem = false; + for (auto &V : VL) { + if (dyn_cast(V)->mayWriteToMemory()) { + MayWriteMem = true; + break; + } + } + Candidates.push_back(ParallelMAC(Mul, VL, !MayWriteMem)); } } @@ -433,20 +455,20 @@ // Pattern 1: the accumulator is the RHS of the mul. while(match(Acc, m_Add(m_Mul(m_Value(MulOp0), m_Value(MulOp1)), m_Value(A)))){ - AddCandidateMAC(Candidates, Acc, MulOp0, MulOp1, 0); + AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 0); Acc = dyn_cast(A); } // Pattern 2: the accumulator is the LHS of the mul. while(match(Acc, m_Add(m_Value(A), m_Mul(m_Value(MulOp0), m_Value(MulOp1))))) { - AddCandidateMAC(Candidates, Acc, MulOp0, MulOp1, 1); + AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 1); Acc = dyn_cast(A); } // The last mul in the chain has a slightly different pattern: // the mul is the first operand if (match(Acc, m_Add(m_Mul(m_Value(MulOp0), m_Value(MulOp1)), m_Value(A)))) - AddCandidateMAC(Candidates, Acc, MulOp0, MulOp1, 0); + AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 0); // Because we start at the bottom of the chain, and we work our way up, // the muls are added in reverse program order to the list. 
@@ -456,35 +478,35 @@ // Collects all instructions that are not part of the MAC chains, which is the // set of instructions that can potentially alias with the MAC operands. -static Instructions AliasCandidates(BasicBlock *Header, - ParallelMACList &MACCandidates) { - Instructions Aliases; - auto IsMACCandidate = [] (Instruction *I, ParallelMACList &MACCandidates) { - for (auto &MAC : MACCandidates) - for (auto *Val : MAC.VL) - if (I == MAC.Mul || Val == I) - return true; - return false; - }; - - std::for_each(Header->begin(), Header->end(), - [&Aliases, &MACCandidates, &IsMACCandidate] (Instruction &I) { - if (I.mayReadOrWriteMemory() && - !IsMACCandidate(&I, MACCandidates)) - Aliases.push_back(&I); }); - return Aliases; +static void AliasCandidates(BasicBlock *Header, Instructions &Reads, + Instructions &Writes) { + for (auto &I : *Header) { + if (I.mayReadFromMemory()) + Reads.push_back(&I); + if (I.mayWriteToMemory()) + Writes.push_back(&I); + } } -// This compares all instructions from the "alias candidates" set, i.e., all -// instructions that are not part of the MAC-chain, with all instructions in -// the MAC candidate set, to see if instructions are aliased. -static bool AreAliased(AliasAnalysis *AA, Instructions AliasCandidates, - ParallelMACList &MACCandidates) { +// Check whether statements in the basic block that write to memory alias with +// the memory locations accessed by the MAC-chains. +// TODO: we need the read statements when we accept more complicated chains. 
+static bool AreAliased(AliasAnalysis *AA, Instructions &Reads, + Instructions &Writes, ParallelMACList &MACCandidates) { LLVM_DEBUG(dbgs() << "Alias checks:\n"); - for (auto *I : AliasCandidates) { - LLVM_DEBUG(dbgs() << "- "; I->dump()); - for (auto &MAC : MACCandidates) { - LLVM_DEBUG(dbgs() << "mul: "; MAC.Mul->dump()); + for (auto &MAC : MACCandidates) { + LLVM_DEBUG(dbgs() << "mul: "; MAC.Mul->dump()); + + // At the moment, we allow only simple chains that only consist of reads, + // accumulate their result with an integer add, and thus that don't write + // memory, and simply bail if they do. + if (!MAC.ReadOnly) + return true; + + // Now for all writes in the basic block, check that they don't alias with + // the memory locations accessed by our MAC-chain: + for (auto *I : Writes) { + LLVM_DEBUG(dbgs() << "- "; I->dump()); assert(MAC.MemLocs.size() >= 2 && "expecting at least 2 memlocs"); for (auto &MemLoc : MAC.MemLocs) { if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc), @@ -495,6 +517,7 @@ } } } + LLVM_DEBUG(dbgs() << "OK: no aliases found!\n"); return false; } @@ -554,8 +577,6 @@ // If loop invariants are used instead of loads, these need to be packed // before the loop begins. // -// Can only be enabled for cores which support unaligned accesses. 
-// bool ARMParallelDSP::MatchSMLAD(Function &F) { BasicBlock *Header = L->getHeader(); LLVM_DEBUG(dbgs() << "= Matching SMLAD =\n"; @@ -569,11 +590,25 @@ ParallelMACList MACCandidates = MatchParallelMACs(R); if (!SetMemoryLocations(MACCandidates)) continue; - Instructions Aliases = AliasCandidates(Header, MACCandidates); - if (AreAliased(AA, Aliases, MACCandidates)) - continue; - PMACPairList PMACPairs = CreateParallelMACPairs(MACCandidates); - Changed = InsertParallelMACs(R, PMACPairs) || Changed; + R.MACCandidates = MACCandidates; + + LLVM_DEBUG(dbgs() << "MAC candidates:\n"; + for (auto &M : R.MACCandidates) + M.Mul->dump(); + dbgs() << "\n";); + } + + // Collect all instructions that may read or write memory. Our alias + // analysis checks bail out if any of these instructions aliases with an + // instruction from the MAC-chain. + Instructions Reads, Writes; + AliasCandidates(Header, Reads, Writes); + + for (auto &R : Reductions) { + if (AreAliased(AA, Reads, Writes, R.MACCandidates)) + return false; + PMACPairList PMACPairs = CreateParallelMACPairs(R.MACCandidates); + Changed |= InsertParallelMACs(R, PMACPairs); } LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump();); Index: llvm/trunk/test/CodeGen/ARM/smlad0.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/smlad0.ll +++ llvm/trunk/test/CodeGen/ARM/smlad0.ll @@ -5,17 +5,20 @@ ; ; Check DSP extension: ; RUN: opt -mtriple=arm-arm-eabi -mcpu=cortex-m33 -mattr=-dsp < %s -arm-parallel-dsp -S | FileCheck %s --check-prefix=CHECK-UNSUPPORTED + +define dso_local i32 @OneReduction(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { ; +; CHECK-LABEL: @OneReduction ; CHECK: %mac1{{\.}}026 = phi i32 [ [[V8:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ] ; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32* ; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2 ; CHECK: [[V6:%[0-9]+]] = 
bitcast i16* %arrayidx to i32* ; CHECK: [[V7:%[0-9]+]] = load i32, i32* [[V6]], align 2 ; CHECK: [[V8]] = call i32 @llvm.arm.smlad(i32 [[V5]], i32 [[V7]], i32 %mac1{{\.}}026) +; CHECK-NOT: call i32 @llvm.arm.smlad ; ; CHECK-UNSUPPORTED-NOT: call i32 @llvm.arm.smlad ; -define dso_local i32 @test(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { entry: %cmp24 = icmp sgt i32 %arg, 0 br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup @@ -30,7 +33,9 @@ ret i32 %mac1.0.lcssa for.body: +; One reduction statement here: %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ] + %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025 %0 = load i16, i16* %arrayidx, align 2 @@ -55,3 +60,73 @@ %exitcond = icmp ne i32 %add, %arg br i1 %exitcond, label %for.body, label %for.cond.cleanup } + +define dso_local arm_aapcs_vfpcc i32 @TwoReductions(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { +; +; CHECK-LABEL: @TwoReductions +; +; CHECK: %mac1{{\.}}058 = phi i32 [ [[V10:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ] +; CHECK: %mac2{{\.}}057 = phi i32 [ [[V17:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ] +; CHECK: [[V10]] = call i32 @llvm.arm.smlad(i32 %{{.*}}, i32 %{{.*}}, i32 %mac1{{\.}}058) +; CHECK: [[V17]] = call i32 @llvm.arm.smlad(i32 %{{.*}}, i32 %{{.*}}, i32 %mac2{{\.}}057) +; CHECK-NOT: call i32 @llvm.arm.smlad +; +entry: + %cmp55 = icmp sgt i32 %arg, 0 + br i1 %cmp55, label %for.body.preheader, label %for.cond.cleanup + +for.cond.cleanup: + %mac2.0.lcssa = phi i32 [ 0, %entry ], [ %add28, %for.body ] + %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add16, %for.body ] + %add30 = add nsw i32 %mac1.0.lcssa, %mac2.0.lcssa + ret i32 %add30 + +for.body.preheader: + br label %for.body + +for.body: +; And two reduction statements here: + %mac1.058 = phi i32 [ %add16, 
%for.body ], [ 0, %for.body.preheader ] + %mac2.057 = phi i32 [ %add28, %for.body ], [ 0, %for.body.preheader ] + + %i.056 = phi i32 [ %add29, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.056 + %0 = load i16, i16* %arrayidx, align 2 + %add1 = or i32 %i.056, 1 + %arrayidx2 = getelementptr inbounds i16, i16* %arg3, i32 %add1 + %1 = load i16, i16* %arrayidx2, align 2 + %add3 = or i32 %i.056, 2 + %arrayidx4 = getelementptr inbounds i16, i16* %arg3, i32 %add3 + %2 = load i16, i16* %arrayidx4, align 2 + + %add5 = or i32 %i.056, 3 + %arrayidx6 = getelementptr inbounds i16, i16* %arg3, i32 %add5 + %3 = load i16, i16* %arrayidx6, align 2 + %arrayidx8 = getelementptr inbounds i16, i16* %arg2, i32 %i.056 + %4 = load i16, i16* %arrayidx8, align 2 + %conv = sext i16 %4 to i32 + %conv9 = sext i16 %0 to i32 + %mul = mul nsw i32 %conv, %conv9 + %arrayidx11 = getelementptr inbounds i16, i16* %arg2, i32 %add1 + %5 = load i16, i16* %arrayidx11, align 2 + %conv12 = sext i16 %5 to i32 + %conv13 = sext i16 %1 to i32 + %mul14 = mul nsw i32 %conv12, %conv13 + %add15 = add i32 %mul, %mac1.058 + %add16 = add i32 %add15, %mul14 + %arrayidx18 = getelementptr inbounds i16, i16* %arg2, i32 %add3 + %6 = load i16, i16* %arrayidx18, align 2 + %conv19 = sext i16 %6 to i32 + %conv20 = sext i16 %2 to i32 + %mul21 = mul nsw i32 %conv19, %conv20 + %arrayidx23 = getelementptr inbounds i16, i16* %arg2, i32 %add5 + %7 = load i16, i16* %arrayidx23, align 2 + %conv24 = sext i16 %7 to i32 + %conv25 = sext i16 %3 to i32 + %mul26 = mul nsw i32 %conv24, %conv25 + %add27 = add i32 %mul21, %mac2.057 + %add28 = add i32 %add27, %mul26 + %add29 = add nuw nsw i32 %i.056, 4 + %cmp = icmp slt i32 %add29, %arg + br i1 %cmp, label %for.body, label %for.cond.cleanup +}