Index: lib/Target/ARM/ARMParallelDSP.cpp =================================================================== --- lib/Target/ARM/ARMParallelDSP.cpp +++ lib/Target/ARM/ARMParallelDSP.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/OrderedBasicBlock.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/NoFolder.h" #include "llvm/Transforms/Scalar.h" @@ -346,6 +347,7 @@ SmallVector Writes; LoadPairs.clear(); WideLoads.clear(); + OrderedBasicBlock OrderedBB(BB); // Collect loads and instruction that may write to memory. For now we only // record loads which are simple, sign-extended and have a single user. @@ -374,7 +376,7 @@ if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc), ModRefInfo::ModRef))) continue; - if (DT->dominates(Write, Read)) + if (OrderedBB.dominates(Write, Read)) RAWDeps[Read].insert(Write); } } @@ -382,8 +384,8 @@ // Check whether there's not a write between the two loads which would // prevent them from being safely merged. auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) { - LoadInst *Dominator = DT->dominates(Base, Offset) ? Base : Offset; - LoadInst *Dominated = DT->dominates(Base, Offset) ? Offset : Base; + LoadInst *Dominator = OrderedBB.dominates(Base, Offset) ? Base : Offset; + LoadInst *Dominated = OrderedBB.dominates(Base, Offset) ? Offset : Base; if (RAWDeps.count(Dominated)) { InstSet &WritesBefore = RAWDeps[Dominated]; @@ -391,7 +393,7 @@ for (auto Before : WritesBefore) { // We can't move the second load backward, past a write, to merge // with the first load. 
- if (DT->dominates(Dominator, Before)) + if (OrderedBB.dominates(Dominator, Before)) return false; } } @@ -613,7 +615,6 @@ return !R.getMulPairs().empty(); } - void ARMParallelDSP::InsertParallelMACs(Reduction &R) { auto CreateSMLAD = [&](LoadInst* WideLd0, LoadInst *WideLd1, @@ -633,30 +634,45 @@ Intrinsic::getDeclaration(M, Intrinsic::arm_smlald); IRBuilder<NoFolder> Builder(InsertAfter->getParent(), - ++BasicBlock::iterator(InsertAfter)); + BasicBlock::iterator(InsertAfter)); Instruction *Call = Builder.CreateCall(SMLAD, Args); NumSMLAD++; return Call; }; - Instruction *InsertAfter = R.getRoot(); + // Return the instruction after the dominated instruction. + auto GetInsertPoint = [this](Value *A, Value *B) { + assert((isa<Instruction>(A) || isa<Instruction>(B)) && + "expected at least one instruction"); + + Value *V = nullptr; + if (!isa<Instruction>(A)) + V = B; + else if (!isa<Instruction>(B)) + V = A; + else + V = DT->dominates(cast<Instruction>(A), cast<Instruction>(B)) ? B : A; + + return &*++BasicBlock::iterator(cast<Instruction>(V)); + }; + Value *Acc = R.getAccumulator(); // For any muls that were discovered but not paired, accumulate their values // as before. 
- IRBuilder Builder(InsertAfter->getParent(), - ++BasicBlock::iterator(InsertAfter)); + IRBuilder Builder(R.getRoot()->getParent()); MulCandList &MulCands = R.getMuls(); for (auto &MulCand : MulCands) { if (MulCand->Paired) continue; - Value *Mul = MulCand->Root; + Instruction *Mul = cast(MulCand->Root); LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *Mul << "\n"); if (R.getType() != Mul->getType()) { assert(R.is64Bit() && "expected 64-bit result"); - Mul = Builder.CreateSExt(Mul, R.getType()); + Builder.SetInsertPoint(&*++BasicBlock::iterator(Mul)); + Mul = cast(Builder.CreateSExt(Mul, R.getRoot()->getType())); } if (!Acc) { @@ -664,8 +680,8 @@ continue; } + Builder.SetInsertPoint(GetInsertPoint(Mul, Acc)); Acc = Builder.CreateAdd(Mul, Acc); - InsertAfter = cast(Acc); } if (!Acc) { @@ -677,6 +693,14 @@ Acc = Builder.CreateSExt(Acc, R.getType()); } + // Roughly sort the mul pairs in their program order. + OrderedBasicBlock OrderedBB(R.getRoot()->getParent()); + llvm::sort(R.getMulPairs(), [&OrderedBB](auto &PairA, auto &PairB) { + const Instruction *A = PairA.first->Root; + const Instruction *B = PairB.first->Root; + return OrderedBB.dominates(A, B); + }); + IntegerType *Ty = IntegerType::get(M->getContext(), 32); for (auto &Pair : R.getMulPairs()) { MulCandidate *LHSMul = Pair.first; @@ -688,8 +712,9 @@ LoadInst *WideRHS = WideLoads.count(BaseRHS) ? 
WideLoads[BaseRHS]->getLoad() : CreateWideLoad(RHSMul->VecLd, Ty); + Instruction *InsertAfter = GetInsertPoint(WideLHS, WideRHS); + InsertAfter = GetInsertPoint(InsertAfter, Acc); Acc = CreateSMLAD(WideLHS, WideRHS, Acc, RHSMul->Exchange, InsertAfter); - InsertAfter = cast<Instruction>(Acc); } R.UpdateRoot(cast<Instruction>(Acc)); } Index: test/CodeGen/ARM/ParallelDSP/complex_dot_prod.ll =================================================================== --- test/CodeGen/ARM/ParallelDSP/complex_dot_prod.ll +++ test/CodeGen/ARM/ParallelDSP/complex_dot_prod.ll @@ -1,21 +1,50 @@ -; RUN: llc -mtriple=thumbv7em -mcpu=cortex-m4 -O3 %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv7em -mcpu=cortex-m4 -O3 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LLC +; RUN: opt -S -mtriple=armv7-a -arm-parallel-dsp -dce %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-OPT ; TODO: Think we should be able to use smlsdx/smlsldx here. ; CHECK-LABEL: complex_dot_prod -; CHECK: smulbb -; CHECK: smultt -; CHECK: smlalbb -; CHECK: smultt -; CHECK: smlaldx -; CHECK: smlalbb -; CHECK: smlaldx -; CHECK: smultt -; CHECK: smlalbb -; CHECK: smlaldx -; CHECK: smultt -; CHECK: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +; CHECK-LLC: smlaldx +; CHECK-LLC: smulbb +; CHECK-LLC: smultt +; CHECK-LLC: smlalbb +; CHECK-LLC: smlaldx +; CHECK-LLC: smultt +; CHECK-LLC: smlalbb +; CHECK-LLC: smultt +; CHECK-LLC: smlalbb +; CHECK-LLC: smlaldx +; CHECK-LLC: smultt +; CHECK-LLC: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} + +; CHECK-OPT: [[ADDR_A:%[^ ]+]] = bitcast i16* %pSrcA to i32* +; CHECK-OPT: [[A:%[^ ]+]] = load i32, i32* [[ADDR_A]], align 2 +; CHECK-OPT: [[ADDR_A_2:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcA, i32 2 +; CHECK-OPT: [[ADDR_B:%[^ ]+]] = bitcast i16* %pSrcB to i32* +; CHECK-OPT: [[B:%[^ ]+]] = load i32, i32* [[ADDR_B]], align 2 +; CHECK-OPT: [[ACC0:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[A]], i32 [[B]], i64 0) +; CHECK-OPT: [[ADDR_B_2:%[^ ]+]] = getelementptr inbounds i16, i16*
%pSrcB, i32 2 +; CHECK-OPT: [[CAST_ADDR_A_2:%[^ ]+]] = bitcast i16* [[ADDR_A_2]] to i32* +; CHECK-OPT: [[A_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_A_2]], align 2 +; CHECK-OPT: [[ADDR_A_4:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcA, i32 4 +; CHECK-OPT: [[CAST_ADDR_B_2:%[^ ]+]] = bitcast i16* [[ADDR_B_2]] to i32* +; CHECK-OPT: [[B_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_2]], align 2 +; CHECK-OPT: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[A_2]], i32 [[B_2]], i64 [[ACC0]]) +; CHECK-OPT: [[ADDR_B_4:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcB, i32 4 +; CHECK-OPT: [[CAST_ADDR_A_4:%[^ ]+]] = bitcast i16* [[ADDR_A_4]] to i32* +; CHECK-OPT: [[A_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_A_4]], align 2 +; CHECK-OPT: [[ADDR_A_6:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcA, i32 6 +; CHECK-OPT: [[CAST_ADDR_B_4:%[^ ]+]] = bitcast i16* [[ADDR_B_4]] to i32* +; CHECK-OPT: [[B_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_4]], align 2 +; CHECK-OPT: [[ACC2:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[A_4]], i32 [[B_4]], i64 [[ACC1]]) +; CHECK-OPT: [[ADDR_B_6:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcB, i32 6 +; CHECK-OPT: [[CAST_ADDR_A_6:%[^ ]+]] = bitcast i16* [[ADDR_A_6]] to i32* +; CHECK-OPT: [[A_6:%[^ ]+]] = load i32, i32* [[CAST_ADDR_A_6]], align 2 +; CHECK-OPT: [[CAST_ADDR_B_6:%[^ ]+]] = bitcast i16* [[ADDR_B_6]] to i32* +; CHECK-OPT: [[B_6:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_6]], align 2 +; CHECK-OPT: call i64 @llvm.arm.smlaldx(i32 [[A_6]], i32 [[B_6]], i64 [[ACC2]]) + define dso_local arm_aapcscc void @complex_dot_prod(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i32* nocapture %realResult, i32* nocapture %imagResult) { entry: %incdec.ptr = getelementptr inbounds i16, i16* %pSrcA, i32 1 @@ -107,7 +136,7 @@ %conv85 = sext i32 %mul84 to i64 %sub86 = sub nsw i64 %add76, %conv85 %mul89 = mul nsw i32 %conv73, %conv82 - %conv90 = sext i32 %mul89 to i64 + %conv90 = sext i32 %mul89 to i64 %add81 = add nsw i64 %add67, %conv90 %add91 
= add nsw i64 %add81, %conv80 %16 = lshr i64 %sub86, 6 Index: test/CodeGen/ARM/ParallelDSP/exchange.ll =================================================================== --- test/CodeGen/ARM/ParallelDSP/exchange.ll +++ test/CodeGen/ARM/ParallelDSP/exchange.ll @@ -105,10 +105,10 @@ ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]], i32 %acc ; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]], i32 %acc ; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B]], i32 [[X]]) define i32 @exchange_multi_use_1(i16* %a, i16* %b, i32 %acc) { entry: @@ -144,10 +144,10 @@ ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc ; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc ; CHECK: call i64 @llvm.arm.smlald(i32 [[LD_A_2]], i32 [[LD_B]], i64 [[X]]) define i64 @exchange_multi_use_64_1(i16* %a, i16* %b, i64 %acc) { entry: @@ -184,10 +184,10 @@ ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc ; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* 
[[GEP]] to i32* ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc ; CHECK: call i64 @llvm.arm.smlald(i32 [[LD_A_2]], i32 [[LD_B]], i64 [[X]]) define i64 @exchange_multi_use_64_2(i16* %a, i16* %b, i64 %acc) { entry: @@ -225,10 +225,10 @@ ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc ; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc ; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 [[X]]) define i32 @exchange_multi_use_2(i16* %a, i16* %b, i32 %acc) { entry: @@ -306,8 +306,8 @@ ; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 0) -; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_B]], i32 [[LD_A_2]], i64 [[ACC]]) +; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_B]], i32 [[LD_A_2]], i64 0) +; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 [[ACC]]) define i64 @exchange_multi_use_64_3(i16* %a, i16* %b, i64 %acc) { entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 Index: test/CodeGen/ARM/ParallelDSP/inner-full-unroll.ll =================================================================== --- test/CodeGen/ARM/ParallelDSP/inner-full-unroll.ll +++ test/CodeGen/ARM/ParallelDSP/inner-full-unroll.ll @@ -11,14 +11,14 @@ ; CHECK: [[BIJ_LD:%[^ ]+]] = load i32, i32* [[BIJ_CAST]], align 2 ; CHECK: 
[[CIJ_CAST:%[^ ]+]] = bitcast i16* [[CIJ]] to i32* ; CHECK: [[CIJ_LD:%[^ ]+]] = load i32, i32* [[CIJ_CAST]], align 2 +; CHECK: [[SMLAD0:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_LD]], i32 [[BIJ_LD]], i32 0) ; CHECK: [[BIJ_2:%[^ ]+]] = getelementptr inbounds i16, i16* [[BIJ]], i32 2 ; CHECK: [[BIJ_2_CAST:%[^ ]+]] = bitcast i16* [[BIJ_2]] to i32* ; CHECK: [[BIJ_2_LD:%[^ ]+]] = load i32, i32* [[BIJ_2_CAST]], align 2 ; CHECK: [[CIJ_2:%[^ ]+]] = getelementptr inbounds i16, i16* [[CIJ]], i32 2 ; CHECK: [[CIJ_2_CAST:%[^ ]+]] = bitcast i16* [[CIJ_2]] to i32* ; CHECK: [[CIJ_2_LD:%[^ ]+]] = load i32, i32* [[CIJ_2_CAST]], align 2 -; CHECK: [[SMLAD0:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_2_LD]], i32 [[BIJ_2_LD]], i32 0) -; CHECK: [[SMLAD1:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_LD]], i32 [[BIJ_LD]], i32 [[SMLAD0]]) +; CHECK: [[SMLAD1:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_2_LD]], i32 [[BIJ_2_LD]], i32 [[SMLAD0]]) ; CHECK: store i32 [[SMLAD1]], i32* %arrayidx, align 4 define void @full_unroll(i32* noalias nocapture %a, i16** noalias nocapture readonly %b, i16** noalias nocapture readonly %c, i32 %N) { Index: test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll =================================================================== --- test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll +++ test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll @@ -20,8 +20,8 @@ ; CHECK-LE-NEXT: ldr lr, [r3, #2]! ; CHECK-LE-NEXT: ldr r4, [r2, #2]! 
; CHECK-LE-NEXT: subs r0, #1 -; CHECK-LE-NEXT: sxtah r1, r1, lr ; CHECK-LE-NEXT: smlad r12, r4, lr, r12 +; CHECK-LE-NEXT: sxtah r1, r1, lr ; CHECK-LE-NEXT: bne .LBB0_2 ; CHECK-LE-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-LE-NEXT: add.w r0, r12, r1 @@ -210,8 +210,8 @@ define i32 @mul_top_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { ; CHECK-LE-LABEL: mul_top_user: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .save {r4, r5, r7, lr} -; CHECK-LE-NEXT: push {r4, r5, r7, lr} +; CHECK-LE-NEXT: .save {r4, lr} +; CHECK-LE-NEXT: push {r4, lr} ; CHECK-LE-NEXT: cmp r0, #1 ; CHECK-LE-NEXT: blt .LBB2_4 ; CHECK-LE-NEXT: @ %bb.1: @ %for.body.preheader @@ -222,21 +222,21 @@ ; CHECK-LE-NEXT: .p2align 2 ; CHECK-LE-NEXT: .LBB2_2: @ %for.body ; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-LE-NEXT: ldr r4, [r2, #2]! ; CHECK-LE-NEXT: ldr lr, [r3, #2]! -; CHECK-LE-NEXT: asrs r5, r4, #16 -; CHECK-LE-NEXT: smlad r12, r4, lr, r12 +; CHECK-LE-NEXT: ldr r4, [r2, #2]! 
; CHECK-LE-NEXT: subs r0, #1 -; CHECK-LE-NEXT: mul r1, r5, r1 +; CHECK-LE-NEXT: smlad r12, r4, lr, r12 +; CHECK-LE-NEXT: asr.w r4, r4, #16 +; CHECK-LE-NEXT: mul r1, r4, r1 ; CHECK-LE-NEXT: bne .LBB2_2 ; CHECK-LE-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-LE-NEXT: add.w r0, r12, r1 -; CHECK-LE-NEXT: pop {r4, r5, r7, pc} +; CHECK-LE-NEXT: pop {r4, pc} ; CHECK-LE-NEXT: .LBB2_4: ; CHECK-LE-NEXT: mov.w r12, #0 ; CHECK-LE-NEXT: movs r1, #0 ; CHECK-LE-NEXT: add.w r0, r12, r1 -; CHECK-LE-NEXT: pop {r4, r5, r7, pc} +; CHECK-LE-NEXT: pop {r4, pc} ; ; CHECK-BE-LABEL: mul_top_user: ; CHECK-BE: @ %bb.0: @ %entry @@ -313,8 +313,8 @@ define i32 @and_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { ; CHECK-LE-LABEL: and_user: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .save {r4, r5, r7, lr} -; CHECK-LE-NEXT: push {r4, r5, r7, lr} +; CHECK-LE-NEXT: .save {r4, lr} +; CHECK-LE-NEXT: push {r4, lr} ; CHECK-LE-NEXT: cmp r0, #1 ; CHECK-LE-NEXT: blt .LBB3_4 ; CHECK-LE-NEXT: @ %bb.1: @ %for.body.preheader @@ -327,19 +327,19 @@ ; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-LE-NEXT: ldr r2, [r3, #2]! ; CHECK-LE-NEXT: ldr r4, [lr, #2]! 
-; CHECK-LE-NEXT: uxth r5, r2 -; CHECK-LE-NEXT: smlad r12, r4, r2, r12 ; CHECK-LE-NEXT: subs r0, #1 -; CHECK-LE-NEXT: mul r1, r5, r1 +; CHECK-LE-NEXT: smlad r12, r4, r2, r12 +; CHECK-LE-NEXT: uxth r2, r2 +; CHECK-LE-NEXT: mul r1, r2, r1 ; CHECK-LE-NEXT: bne .LBB3_2 ; CHECK-LE-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-LE-NEXT: add.w r0, r12, r1 -; CHECK-LE-NEXT: pop {r4, r5, r7, pc} +; CHECK-LE-NEXT: pop {r4, pc} ; CHECK-LE-NEXT: .LBB3_4: ; CHECK-LE-NEXT: mov.w r12, #0 ; CHECK-LE-NEXT: movs r1, #0 ; CHECK-LE-NEXT: add.w r0, r12, r1 -; CHECK-LE-NEXT: pop {r4, r5, r7, pc} +; CHECK-LE-NEXT: pop {r4, pc} ; ; CHECK-BE-LABEL: and_user: ; CHECK-BE: @ %bb.0: @ %entry Index: test/CodeGen/ARM/ParallelDSP/overlapping.ll =================================================================== --- test/CodeGen/ARM/ParallelDSP/overlapping.ll +++ test/CodeGen/ARM/ParallelDSP/overlapping.ll @@ -7,12 +7,12 @@ ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc) ; CHECK: [[CAST_A_1:%[^ ]+]] = bitcast i16* [[ADDR_A_1]] to i32* ; CHECK: [[LD_A_1:%[^ ]+]] = load i32, i32* [[CAST_A_1]] ; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[ADDR_B_1]] to i32* ; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] -; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_1]], i32 [[LD_B_1]], i32 %acc) -; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[ACC]]) +; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_1]], i32 [[LD_B_1]], i32 [[ACC]]) ; CHECK: ret i32 [[RES]] define i32 @overlap_1(i16* %a, i16* %b, i32 %acc) { entry: @@ -51,12 +51,12 @@ ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 
[[LD_A]], i32 [[LD_B]], i64 %acc) ; CHECK: [[CAST_A_1:%[^ ]+]] = bitcast i16* [[ADDR_A_1]] to i32* ; CHECK: [[LD_A_1:%[^ ]+]] = load i32, i32* [[CAST_A_1]] ; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[ADDR_B_1]] to i32* ; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] -; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A_1]], i32 [[LD_B_1]], i64 %acc) -; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 [[ACC]]) +; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A_1]], i32 [[LD_B_1]], i64 [[ACC]]) ; CHECK: ret i64 [[RES]] define i64 @overlap_64_1(i16* %a, i16* %b, i64 %acc) { entry: @@ -133,13 +133,14 @@ ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc) ; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32* ; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] ; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32* ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc) -; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]]) +; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 [[SMLAD]]) +; CHECK: ret i32 [[RES]] define i32 @overlap_3(i16* %a, i16* %b, i32 %acc) { entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 @@ -178,13 +179,14 @@ ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc) ; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32* ; CHECK: [[LD_B_1:%[^ ]+]] = load 
i32, i32* [[CAST_B_1]] ; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32* ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc) -; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]]) +; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 [[SMLAD]]) +; CHECK: ret i32 [[RES]] define i32 @overlap_4(i16* %a, i16* %b, i32 %acc) { entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 Index: test/CodeGen/ARM/ParallelDSP/pr43073.ll =================================================================== --- test/CodeGen/ARM/ParallelDSP/pr43073.ll +++ test/CodeGen/ARM/ParallelDSP/pr43073.ll @@ -15,14 +15,14 @@ ; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2 ; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32* ; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2 +; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[ADD0]]) ; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5 ; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32* ; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2 ; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4 ; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32* ; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2 -; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[ADD0]]) -; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[ACC]]) +; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[ACC]]) ; CHECK: ret i32 [[RES]] 
define i32 @first_mul_invalid(i16* nocapture readonly %in, i16* nocapture readonly %b) { entry: @@ -88,14 +88,14 @@ ; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2 ; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32* ; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2 +; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[MUL0]]) ; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5 ; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32* ; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2 ; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4 ; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32* ; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2 -; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[MUL0]]) -; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[ACC]]) +; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[ACC]]) ; CHECK: ret i32 [[RES]] define i32 @with_no_acc_input(i16* nocapture readonly %in, i16* nocapture readonly %b) { entry: @@ -157,14 +157,14 @@ ; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2 ; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32* ; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2 +; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i64 [[ADD0]]) ; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5 ; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32* ; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2 ; 
CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4 ; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32* ; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2 -; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i64 [[ADD0]]) -; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i64 [[ACC]]) +; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i64 [[ACC]]) ; CHECK: ret i64 [[RES]] define i64 @with_64bit_acc(i16* nocapture readonly %in, i16* nocapture readonly %b) { entry: @@ -238,13 +238,13 @@ ; CHECK: [[Y_1:%[^ ]+]] = load i16, i16* [[ADDR_Y_MINUS_1]], align 2 ; CHECK: [[SEXT_Y_1:%[^ ]+]] = sext i16 [[Y_1]] to i32 ; CHECK: [[UNPAIRED:%[^ ]+]] = mul nsw i32 [[SEXT_Y_1]], [[SEXT_X_1]] +; CHECK: [[SEXT:%[^ ]+]] = sext i32 [[UNPAIRED]] to i64 +; CHECK: [[ACC:%[^ ]+]] = add i64 [[SEXT]], [[ADD_1]] ; CHECK: [[ADDR_X_PLUS_2:%[^ ]+]] = bitcast i16* [[X_PLUS_2]] to i32* ; CHECK: [[X_2:%[^ ]+]] = load i32, i32* [[ADDR_X_PLUS_2]], align 2 ; CHECK: [[Y_MINUS_3:%[^ ]+]] = getelementptr inbounds i16, i16* %py.8757.unr, i32 -3 ; CHECK: [[ADDR_Y_MINUS_3:%[^ ]+]] = bitcast i16* [[Y_MINUS_3]] to i32* ; CHECK: [[Y_3:%[^ ]+]] = load i32, i32* [[ADDR_Y_MINUS_3]], align 2 -; CHECK: [[SEXT:%[^ ]+]] = sext i32 [[UNPAIRED]] to i64 -; CHECK: [[ACC:%[^ ]+]] = add i64 [[SEXT]], [[ADD_1]] ; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[Y_3]], i32 [[X_2]], i64 [[ACC]]) ; CHECK: ret i64 [[RES]] define i64 @with_64bit_add_acc(i16* nocapture readonly %px.10756.unr, i16* nocapture readonly %py.8757.unr, i32 %acc) { Index: test/CodeGen/ARM/ParallelDSP/smlad11.ll =================================================================== --- test/CodeGen/ARM/ParallelDSP/smlad11.ll +++ test/CodeGen/ARM/ParallelDSP/smlad11.ll @@ -10,10 +10,10 @@ ; CHECK: [[V16:%[0-9]+]] = load i32, i32* [[V15]], align 2 ; 
CHECK: [[V8:%[0-9]+]] = bitcast i16* %arrayidx8 to i32* ; CHECK: [[V9:%[0-9]+]] = load i32, i32* [[V8]], align 2 +; CHECK: [[ACC:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 %mac1{{\.}}054) ; CHECK: [[V13:%[0-9]+]] = bitcast i16* %arrayidx17 to i32* ; CHECK: [[V14:%[0-9]+]] = load i32, i32* [[V13]], align 2 -; CHECK: [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V14]], i32 [[V16]], i32 %mac1{{\.}}054) -; CHECK: [[V17:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 [[V12]]) +; CHECK: [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V14]], i32 [[V16]], i32 [[ACC]]) ; ; And we don't want to see a 3rd smlad: ; CHECK-NOT: call i32 @llvm.arm.smlad Index: test/CodeGen/ARM/ParallelDSP/smladx-1.ll =================================================================== --- test/CodeGen/ARM/ParallelDSP/smladx-1.ll +++ test/CodeGen/ARM/ParallelDSP/smladx-1.ll @@ -12,12 +12,13 @@ ; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2 ; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32* ; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2 +; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN21]], i32 [[IN10]], i32 [[ACC0]]) + ; CHECK: [[PIN23:%[^ ]+]] = bitcast i16* %pIn2.3 to i32* ; CHECK: [[IN23:%[^ ]+]] = load i32, i32* [[PIN23]], align 2 ; CHECK: [[PIN12:%[^ ]+]] = bitcast i16* %pIn1.2 to i32* ; CHECK: [[IN12:%[^ ]+]] = load i32, i32* [[PIN12]], align 2 -; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN23]], i32 [[IN12]], i32 [[ACC0]]) -; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN21]], i32 [[IN10]], i32 [[ACC1]]) +; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN23]], i32 [[IN12]], i32 [[ACC1]]) ; CHECK-NOT: call i32 @llvm.arm.smlad ; CHECK-UNSUPPORTED-NOT: call i32 @llvm.arm.smlad @@ -130,6 +131,7 @@ ; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2 ; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32* ; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* 
[[PIN1_2_CAST]], align 2 +; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN2]], i32 [[IN1_2]], i32 [[ACC0]]) ; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2 ; CHECK: [[PIN2_2_CAST:%[^ ]+]] = bitcast i16* [[PIN2_2]] to i32* @@ -138,8 +140,7 @@ ; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32* ; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2 -; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN2_2]], i32 [[IN1]], i32 [[ACC0]]) -; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN2]], i32 [[IN1_2]], i32 [[ACC1]]) +; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN2_2]], i32 [[IN1]], i32 [[ACC1]]) ; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4 ; CHECK: [[PIN2_NEXT]] = getelementptr i16, i16* [[PIN2]], i32 -4 Index: test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll =================================================================== --- test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll +++ test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll @@ -11,12 +11,12 @@ ; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2 ; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32* ; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2 +; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC0]]) ; CHECK: [[PIN23:%[^ ]+]] = bitcast i16* %pIn2.3 to i32* ; CHECK: [[IN23:%[^ ]+]] = load i32, i32* [[PIN23]], align 2 ; CHECK: [[PIN12:%[^ ]+]] = bitcast i16* %pIn1.2 to i32* ; CHECK: [[IN12:%[^ ]+]] = load i32, i32* [[PIN12]], align 2 -; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC0]]) -; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC1]]) +; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC1]]) ; CHECK-NOT: call i64 @llvm.arm.smlad ; CHECK-UNSUPPORTED-NOT: call i64 @llvm.arm.smlad @@ -187,6 +187,7 @@ ; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], 
i32 -2 ; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32* ; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2 +; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC0]]) ; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2 ; CHECK: [[PIN2_2_CAST:%[^ ]+]] = bitcast i16* [[PIN2_2]] to i32* @@ -194,9 +195,7 @@ ; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32* ; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2 - -; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC0]]) -; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC1]]) +; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC1]]) ; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4 ; CHECK: [[PIN2_NEXT]] = getelementptr i16, i16* [[PIN2]], i32 -4 Index: test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll =================================================================== --- test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll +++ test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll @@ -11,12 +11,12 @@ ; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2 ; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32* ; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2 +; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC0]]) ; CHECK: [[PIN23:%[^ ]+]] = bitcast i16* %pIn2.3 to i32* ; CHECK: [[IN23:%[^ ]+]] = load i32, i32* [[PIN23]], align 2 ; CHECK: [[PIN12:%[^ ]+]] = bitcast i16* %pIn1.2 to i32* ; CHECK: [[IN12:%[^ ]+]] = load i32, i32* [[PIN12]], align 2 -; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC0]]) -; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC1]]) +; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC1]]) ; CHECK-NOT: call i64 @llvm.arm.smlad ; 
CHECK-UNSUPPORTED-NOT: call i64 @llvm.arm.smlad @@ -187,6 +187,7 @@ ; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2 ; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32* ; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2 +; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC0]]) ; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2 ; CHECK: [[PIN2_2_CAST:%[^ ]+]] = bitcast i16* [[PIN2_2]] to i32* @@ -194,9 +195,7 @@ ; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32* ; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2 - -; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC0]]) -; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC1]]) +; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC1]]) ; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4 ; CHECK: [[PIN2_NEXT]] = getelementptr i16, i16* [[PIN2]], i32 -4 Index: test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll =================================================================== --- test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll +++ test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll @@ -45,8 +45,7 @@ ; CHECK-REG-PRESSURE: ldr{{.*}}, [sp ; CHECK-REG-PRESSURE: ldr{{.*}}, [sp ; CHECK-REG-PRESSURE: ldr{{.*}}, [sp -; CHECK-REG-PRESSURE: ldr{{.*}}, [sp -; CHECK-REG-PRESSURE: ldr{{.*}}, [sp +; CHECK-REG-PRESSURE-NOT: ldr{{.*}}, [sp ; CHECK-REG-PRESSURE: bne .LBB0_1 for.body: