Index: lib/Target/ARM/ARMParallelDSP.cpp
===================================================================
--- lib/Target/ARM/ARMParallelDSP.cpp
+++ lib/Target/ARM/ARMParallelDSP.cpp
@@ -613,7 +613,6 @@
   return !R.getMulPairs().empty();
 }
 
-
 void ARMParallelDSP::InsertParallelMACs(Reduction &R) {
 
   auto CreateSMLAD = [&](LoadInst* WideLd0, LoadInst *WideLd1,
@@ -633,30 +632,45 @@
       Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);
 
     IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
-                                ++BasicBlock::iterator(InsertAfter));
+                                BasicBlock::iterator(InsertAfter));
     Instruction *Call = Builder.CreateCall(SMLAD, Args);
     NumSMLAD++;
     return Call;
   };
 
-  Instruction *InsertAfter = R.getRoot();
+  // Return the instruction after the dominated instruction.
+  auto GetInsertPoint = [this](Value *A, Value *B) {
+    assert((isa<Instruction>(A) || isa<Instruction>(B)) &&
+           "expected at least one instruction");
+
+    Value *V = nullptr;
+    if (!isa<Instruction>(A))
+      V = B;
+    else if (!isa<Instruction>(B))
+      V = A;
+    else
+      V = DT->dominates(cast<Instruction>(A), cast<Instruction>(B)) ? B : A;
+
+    return &*++BasicBlock::iterator(cast<Instruction>(V));
+  };
+
   Value *Acc = R.getAccumulator();
 
   // For any muls that were discovered but not paired, accumulate their values
   // as before.
-  IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
-                              ++BasicBlock::iterator(InsertAfter));
+  IRBuilder<NoFolder> Builder(R.getRoot()->getParent());
   MulCandList &MulCands = R.getMuls();
   for (auto &MulCand : MulCands) {
     if (MulCand->Paired)
      continue;
 
-    Value *Mul = MulCand->Root;
+    Instruction *Mul = cast<Instruction>(MulCand->Root);
     LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *Mul << "\n");
 
     if (R.getType() != Mul->getType()) {
       assert(R.is64Bit() && "expected 64-bit result");
-      Mul = Builder.CreateSExt(Mul, R.getType());
+      Builder.SetInsertPoint(&*++BasicBlock::iterator(Mul));
+      Mul = cast<Instruction>(Builder.CreateSExt(Mul, R.getRoot()->getType()));
     }
 
     if (!Acc) {
@@ -664,8 +678,8 @@
       continue;
     }
 
+    Builder.SetInsertPoint(GetInsertPoint(Mul, Acc));
     Acc = Builder.CreateAdd(Mul, Acc);
-    InsertAfter = cast<Instruction>(Acc);
   }
 
   if (!Acc) {
@@ -677,6 +691,11 @@
     Acc = Builder.CreateSExt(Acc, R.getType());
   }
 
+  // Roughly sort the mul pairs in their program order.
+  llvm::sort(R.getMulPairs(), [this](auto &PairA, auto &PairB) {
+    return DT->dominates(PairA.first->Root, PairB.first->Root);
+  });
+
   IntegerType *Ty = IntegerType::get(M->getContext(), 32);
   for (auto &Pair : R.getMulPairs()) {
     MulCandidate *LHSMul = Pair.first;
@@ -688,8 +707,9 @@
     LoadInst *WideRHS = WideLoads.count(BaseRHS) ?
       WideLoads[BaseRHS]->getLoad() : CreateWideLoad(RHSMul->VecLd, Ty);
 
+    Instruction *InsertAfter = GetInsertPoint(WideLHS, WideRHS);
+    InsertAfter = GetInsertPoint(InsertAfter, Acc);
     Acc = CreateSMLAD(WideLHS, WideRHS, Acc, RHSMul->Exchange, InsertAfter);
-    InsertAfter = cast<Instruction>(Acc);
   }
   R.UpdateRoot(cast<Instruction>(Acc));
 }
Index: test/CodeGen/ARM/ParallelDSP/complex_dot_prod.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/ParallelDSP/complex_dot_prod.ll
@@ -0,0 +1,148 @@
+; RUN: opt -S -mtriple=armv7-a -arm-parallel-dsp -dce %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-OPT
+; RUN: llc -mtriple=thumbv7em -O3 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LLC
+
+; TODO: Think we should be able to use smlsdx/smlsldx here.
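+;
+; The scalar code below computes a four-element complex dot product:
+;   realResult = (sum of (a.re * b.re - a.im * b.im)) >> 6
+;   imagResult = (sum of (a.re * b.im + a.im * b.re)) >> 6
+; with the real parts at even i16 indices and the imaginary parts at odd
+; ones. The pass converts the additive (imaginary) reduction into a chain
+; of four smlaldx calls, which the CHECK-OPT lines below verify, while the
+; subtractive (real) reduction is currently left as scalar multiplies;
+; hence the smul/smlal lines in CHECK-LLC and the TODO above.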
+
+; CHECK-LABEL: complex_dot_prod
+
+; CHECK-OPT: [[ADDR_A:%[^ ]+]] = bitcast i16* %pSrcA to i32*
+; CHECK-OPT: [[A:%[^ ]+]] = load i32, i32* [[ADDR_A]], align 2
+; CHECK-OPT: [[ADDR_A_2:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcA, i32 2
+; CHECK-OPT: [[ADDR_B:%[^ ]+]] = bitcast i16* %pSrcB to i32*
+; CHECK-OPT: [[B:%[^ ]+]] = load i32, i32* [[ADDR_B]], align 2
+; CHECK-OPT: [[ACC0:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[A]], i32 [[B]], i64 0)
+; CHECK-OPT: [[ADDR_B_2:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcB, i32 2
+; CHECK-OPT: [[CAST_ADDR_A_2:%[^ ]+]] = bitcast i16* [[ADDR_A_2]] to i32*
+; CHECK-OPT: [[A_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_A_2]], align 2
+; CHECK-OPT: [[ADDR_A_4:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcA, i32 4
+; CHECK-OPT: [[CAST_ADDR_B_2:%[^ ]+]] = bitcast i16* [[ADDR_B_2]] to i32*
+; CHECK-OPT: [[B_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_2]], align 2
+; CHECK-OPT: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[A_2]], i32 [[B_2]], i64 [[ACC0]])
+; CHECK-OPT: [[ADDR_B_4:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcB, i32 4
+; CHECK-OPT: [[CAST_ADDR_A_4:%[^ ]+]] = bitcast i16* [[ADDR_A_4]] to i32*
+; CHECK-OPT: [[A_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_A_4]], align 2
+; CHECK-OPT: [[ADDR_A_6:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcA, i32 6
+; CHECK-OPT: [[CAST_ADDR_B_4:%[^ ]+]] = bitcast i16* [[ADDR_B_4]] to i32*
+; CHECK-OPT: [[B_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_4]], align 2
+; CHECK-OPT: [[ACC2:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[A_4]], i32 [[B_4]], i64 [[ACC1]])
+; CHECK-OPT: [[ADDR_B_6:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcB, i32 6
+; CHECK-OPT: [[CAST_ADDR_A_6:%[^ ]+]] = bitcast i16* [[ADDR_A_6]] to i32*
+; CHECK-OPT: [[A_6:%[^ ]+]] = load i32, i32* [[CAST_ADDR_A_6]], align 2
+; CHECK-OPT: [[CAST_ADDR_B_6:%[^ ]+]] = bitcast i16* [[ADDR_B_6]] to i32*
+; CHECK-OPT: [[B_6:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_6]], align 2
+; CHECK-OPT: call i64 @llvm.arm.smlaldx(i32 [[A_6]], i32 [[B_6]], i64 [[ACC2]])
+
+; CHECK-LLC: smul
+; CHECK-LLC: smul
+; CHECK-LLC: smul
+; CHECK-LLC: smlal
+; CHECK-LLC: smul
+; CHECK-LLC: smlal
+; CHECK-LLC: smul
+; CHECK-LLC: smlal
+; CHECK-LLC: smlaldx
+; CHECK-LLC: smlaldx
+; CHECK-LLC: smlaldx
+; CHECK-LLC: smlaldx
+
+define dso_local arm_aapcscc void @complex_dot_prod(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i32* nocapture %realResult, i32* nocapture %imagResult) {
+entry:
+  %incdec.ptr = getelementptr inbounds i16, i16* %pSrcA, i32 1
+  %0 = load i16, i16* %pSrcA, align 2
+  %incdec.ptr1 = getelementptr inbounds i16, i16* %pSrcA, i32 2
+  %1 = load i16, i16* %incdec.ptr, align 2
+  %incdec.ptr2 = getelementptr inbounds i16, i16* %pSrcB, i32 1
+  %2 = load i16, i16* %pSrcB, align 2
+  %incdec.ptr3 = getelementptr inbounds i16, i16* %pSrcB, i32 2
+  %3 = load i16, i16* %incdec.ptr2, align 2
+  %conv = sext i16 %0 to i32
+  %conv4 = sext i16 %2 to i32
+  %mul = mul nsw i32 %conv4, %conv
+  %conv5 = sext i32 %mul to i64
+  %conv7 = sext i16 %3 to i32
+  %mul8 = mul nsw i32 %conv7, %conv
+  %conv9 = sext i32 %mul8 to i64
+  %conv11 = sext i16 %1 to i32
+  %mul13 = mul nsw i32 %conv7, %conv11
+  %conv14 = sext i32 %mul13 to i64
+  %sub = sub nsw i64 %conv5, %conv14
+  %mul17 = mul nsw i32 %conv4, %conv11
+  %conv18 = sext i32 %mul17 to i64
+  %add19 = add nsw i64 %conv9, %conv18
+  %incdec.ptr20 = getelementptr inbounds i16, i16* %pSrcA, i32 3
+  %4 = load i16, i16* %incdec.ptr1, align 2
+  %incdec.ptr21 = getelementptr inbounds i16, i16* %pSrcA, i32 4
+  %5 = load i16, i16* %incdec.ptr20, align 2
+  %incdec.ptr22 = getelementptr inbounds i16, i16* %pSrcB, i32 3
+  %6 = load i16, i16* %incdec.ptr3, align 2
+  %incdec.ptr23 = getelementptr inbounds i16, i16* %pSrcB, i32 4
+  %7 = load i16, i16* %incdec.ptr22, align 2
+  %conv24 = sext i16 %4 to i32
+  %conv25 = sext i16 %6 to i32
+  %mul26 = mul nsw i32 %conv25, %conv24
+  %conv27 = sext i32 %mul26 to i64
+  %add28 = add nsw i64 %sub, %conv27
+  %conv30 = sext i16 %7 to i32
+  %mul31 = mul nsw i32 %conv30, %conv24
+  %conv32 = sext i32 %mul31 to i64
+  %conv34 = sext i16 %5 to i32
+  %mul36 = mul nsw i32 %conv30, %conv34
+  %conv37 = sext i32 %mul36 to i64
+  %sub38 = sub nsw i64 %add28, %conv37
+  %mul41 = mul nsw i32 %conv25, %conv34
+  %conv42 = sext i32 %mul41 to i64
+  %incdec.ptr44 = getelementptr inbounds i16, i16* %pSrcA, i32 5
+  %8 = load i16, i16* %incdec.ptr21, align 2
+  %incdec.ptr45 = getelementptr inbounds i16, i16* %pSrcA, i32 6
+  %9 = load i16, i16* %incdec.ptr44, align 2
+  %incdec.ptr46 = getelementptr inbounds i16, i16* %pSrcB, i32 5
+  %10 = load i16, i16* %incdec.ptr23, align 2
+  %incdec.ptr47 = getelementptr inbounds i16, i16* %pSrcB, i32 6
+  %11 = load i16, i16* %incdec.ptr46, align 2
+  %conv48 = sext i16 %8 to i32
+  %conv49 = sext i16 %10 to i32
+  %mul50 = mul nsw i32 %conv49, %conv48
+  %conv51 = sext i32 %mul50 to i64
+  %add52 = add nsw i64 %sub38, %conv51
+  %conv54 = sext i16 %11 to i32
+  %mul55 = mul nsw i32 %conv54, %conv48
+  %conv56 = sext i32 %mul55 to i64
+  %conv58 = sext i16 %9 to i32
+  %mul60 = mul nsw i32 %conv54, %conv58
+  %conv61 = sext i32 %mul60 to i64
+  %sub62 = sub nsw i64 %add52, %conv61
+  %mul65 = mul nsw i32 %conv49, %conv58
+  %conv66 = sext i32 %mul65 to i64
+  %incdec.ptr68 = getelementptr inbounds i16, i16* %pSrcA, i32 7
+  %12 = load i16, i16* %incdec.ptr45, align 2
+  %13 = load i16, i16* %incdec.ptr68, align 2
+  %incdec.ptr70 = getelementptr inbounds i16, i16* %pSrcB, i32 7
+  %14 = load i16, i16* %incdec.ptr47, align 2
+  %15 = load i16, i16* %incdec.ptr70, align 2
+  %conv72 = sext i16 %12 to i32
+  %conv73 = sext i16 %14 to i32
+  %mul74 = mul nsw i32 %conv73, %conv72
+  %conv75 = sext i32 %mul74 to i64
+  %add76 = add nsw i64 %sub62, %conv75
+  %conv78 = sext i16 %15 to i32
+  %mul79 = mul nsw i32 %conv78, %conv72
+  %conv80 = sext i32 %mul79 to i64
+  %conv82 = sext i16 %13 to i32
+  %mul84 = mul nsw i32 %conv78, %conv82
+  %conv85 = sext i32 %mul84 to i64
+  %sub86 = sub nsw i64 %add76, %conv85
+  %mul89 = mul nsw i32 %conv73, %conv82
+  %conv90 = sext i32 %mul89 to i64
+  %add33 = add nsw i64 %add19, %conv42
+  %add43 = add nsw i64 %add33, %conv32
+  %add57 = add nsw i64 %add43, %conv66
+  %add67 = add nsw i64 %add57, %conv56
+  %add81 = add nsw i64 %add67, %conv90
+  %add91 = add nsw i64 %add81, %conv80
+  %16 = lshr i64 %sub86, 6
+  %conv92 = trunc i64 %16 to i32
+  store i32 %conv92, i32* %realResult, align 4
+  %17 = lshr i64 %add91, 6
+  %conv94 = trunc i64 %17 to i32
+  store i32 %conv94, i32* %imagResult, align 4
+  ret void
+}
Index: test/CodeGen/ARM/ParallelDSP/exchange.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/exchange.ll
+++ test/CodeGen/ARM/ParallelDSP/exchange.ll
@@ -105,10 +105,10 @@
 ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
 ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
 ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
 ; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
 ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
 ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
-; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
 ; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B]], i32 [[X]])
 define i32 @exchange_multi_use_1(i16* %a, i16* %b, i32 %acc) {
 entry:
@@ -144,10 +144,10 @@
 ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
 ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
 ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc
 ; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
 ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
 ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
-; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc
 ; CHECK: call i64 @llvm.arm.smlald(i32 [[LD_A_2]], i32 [[LD_B]], i64 [[X]])
 define i64 @exchange_multi_use_64_1(i16* %a, i16* %b, i64 %acc) {
 entry:
@@ -184,10 +184,10 @@
 ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
 ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
 ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc
 ; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
 ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
 ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
-; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc
 ; CHECK: call i64 @llvm.arm.smlald(i32 [[LD_A_2]], i32 [[LD_B]], i64 [[X]])
 define i64 @exchange_multi_use_64_2(i16* %a, i16* %b, i64 %acc) {
 entry:
@@ -225,10 +225,10 @@
 ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
 ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
 ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
 ; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
 ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
 ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
-; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
 ; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 [[X]])
 define i32 @exchange_multi_use_2(i16* %a, i16* %b, i32 %acc) {
 entry:
@@ -306,8 +306,8 @@
 ; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
 ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
 ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
-; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 0)
-; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_B]], i32 [[LD_A_2]], i64 [[ACC]])
+; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_B]], i32 [[LD_A_2]], i64 0)
+; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 [[ACC]])
 define i64 @exchange_multi_use_64_3(i16* %a, i16* %b, i64 %acc) {
 entry:
   %addr.a.1 = getelementptr i16, i16* %a, i32 1
Index: test/CodeGen/ARM/ParallelDSP/inner-full-unroll.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/inner-full-unroll.ll
+++ test/CodeGen/ARM/ParallelDSP/inner-full-unroll.ll
@@ -11,14 +11,14 @@
 ; CHECK: [[BIJ_LD:%[^ ]+]] = load i32, i32* [[BIJ_CAST]], align 2
 ; CHECK: [[CIJ_CAST:%[^ ]+]] = bitcast i16* [[CIJ]] to i32*
 ; CHECK: [[CIJ_LD:%[^ ]+]] = load i32, i32* [[CIJ_CAST]], align 2
+; CHECK: [[SMLAD0:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_LD]], i32 [[BIJ_LD]], i32 0)
 ; CHECK: [[BIJ_2:%[^ ]+]] = getelementptr inbounds i16, i16* [[BIJ]], i32 2
 ; CHECK: [[BIJ_2_CAST:%[^ ]+]] = bitcast i16* [[BIJ_2]] to i32*
 ; CHECK: [[BIJ_2_LD:%[^ ]+]] = load i32, i32* [[BIJ_2_CAST]], align 2
 ; CHECK: [[CIJ_2:%[^ ]+]] = getelementptr inbounds i16, i16* [[CIJ]], i32 2
 ; CHECK: [[CIJ_2_CAST:%[^ ]+]] = bitcast i16* [[CIJ_2]] to i32*
 ; CHECK: [[CIJ_2_LD:%[^ ]+]] = load i32, i32* [[CIJ_2_CAST]], align 2
-; CHECK: [[SMLAD0:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_2_LD]], i32 [[BIJ_2_LD]], i32 0)
-; CHECK: [[SMLAD1:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_LD]], i32 [[BIJ_LD]], i32 [[SMLAD0]])
+; CHECK: [[SMLAD1:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_2_LD]], i32 [[BIJ_2_LD]], i32 [[SMLAD0]])
 ; CHECK: store i32 [[SMLAD1]], i32* %arrayidx, align 4
 
 define void @full_unroll(i32* noalias nocapture %a, i16** noalias nocapture readonly %b, i16** noalias nocapture readonly %c, i32 %N) {
Index: test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll
+++ test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll
@@ -20,8 +20,8 @@
 ; CHECK-LE-NEXT:    ldr lr, [r3, #2]!
 ; CHECK-LE-NEXT:    ldr r4, [r2, #2]!
 ; CHECK-LE-NEXT:    subs r0, #1
-; CHECK-LE-NEXT:    sxtah r1, r1, lr
 ; CHECK-LE-NEXT:    smlad r12, r4, lr, r12
+; CHECK-LE-NEXT:    sxtah r1, r1, lr
 ; CHECK-LE-NEXT:    bne .LBB0_2
 ; CHECK-LE-NEXT:  @ %bb.3: @ %for.cond.cleanup
 ; CHECK-LE-NEXT:    add.w r0, r12, r1
@@ -210,8 +210,8 @@
 define i32 @mul_top_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
 ; CHECK-LE-LABEL: mul_top_user:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-LE-NEXT:    push {r4, r5, r7, lr}
+; CHECK-LE-NEXT:    .save {r4, lr}
+; CHECK-LE-NEXT:    push {r4, lr}
 ; CHECK-LE-NEXT:    cmp r0, #1
 ; CHECK-LE-NEXT:    blt .LBB2_4
 ; CHECK-LE-NEXT:  @ %bb.1: @ %for.body.preheader
@@ -222,21 +222,21 @@
 ; CHECK-LE-NEXT:    .p2align 2
 ; CHECK-LE-NEXT:  .LBB2_2: @ %for.body
 ; CHECK-LE-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-LE-NEXT:    ldr r4, [r2, #2]!
 ; CHECK-LE-NEXT:    ldr lr, [r3, #2]!
-; CHECK-LE-NEXT:    asrs r5, r4, #16
-; CHECK-LE-NEXT:    smlad r12, r4, lr, r12
+; CHECK-LE-NEXT:    ldr r4, [r2, #2]!
 ; CHECK-LE-NEXT:    subs r0, #1
-; CHECK-LE-NEXT:    mul r1, r5, r1
+; CHECK-LE-NEXT:    smlad r12, r4, lr, r12
+; CHECK-LE-NEXT:    asr.w r4, r4, #16
+; CHECK-LE-NEXT:    mul r1, r4, r1
 ; CHECK-LE-NEXT:    bne .LBB2_2
 ; CHECK-LE-NEXT:  @ %bb.3: @ %for.cond.cleanup
 ; CHECK-LE-NEXT:    add.w r0, r12, r1
-; CHECK-LE-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-LE-NEXT:    pop {r4, pc}
 ; CHECK-LE-NEXT:  .LBB2_4:
 ; CHECK-LE-NEXT:    mov.w r12, #0
 ; CHECK-LE-NEXT:    movs r1, #0
 ; CHECK-LE-NEXT:    add.w r0, r12, r1
-; CHECK-LE-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-LE-NEXT:    pop {r4, pc}
 ;
 ; CHECK-BE-LABEL: mul_top_user:
 ; CHECK-BE:       @ %bb.0: @ %entry
@@ -313,8 +313,8 @@
 define i32 @and_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
 ; CHECK-LE-LABEL: and_user:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-LE-NEXT:    push {r4, r5, r7, lr}
+; CHECK-LE-NEXT:    .save {r4, lr}
+; CHECK-LE-NEXT:    push {r4, lr}
 ; CHECK-LE-NEXT:    cmp r0, #1
 ; CHECK-LE-NEXT:    blt .LBB3_4
 ; CHECK-LE-NEXT:  @ %bb.1: @ %for.body.preheader
@@ -327,19 +327,19 @@
 ; CHECK-LE-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-LE-NEXT:    ldr r2, [r3, #2]!
 ; CHECK-LE-NEXT:    ldr r4, [lr, #2]!
-; CHECK-LE-NEXT:    uxth r5, r2
-; CHECK-LE-NEXT:    smlad r12, r4, r2, r12
 ; CHECK-LE-NEXT:    subs r0, #1
-; CHECK-LE-NEXT:    mul r1, r5, r1
+; CHECK-LE-NEXT:    smlad r12, r4, r2, r12
+; CHECK-LE-NEXT:    uxth r2, r2
+; CHECK-LE-NEXT:    mul r1, r2, r1
 ; CHECK-LE-NEXT:    bne .LBB3_2
 ; CHECK-LE-NEXT:  @ %bb.3: @ %for.cond.cleanup
 ; CHECK-LE-NEXT:    add.w r0, r12, r1
-; CHECK-LE-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-LE-NEXT:    pop {r4, pc}
 ; CHECK-LE-NEXT:  .LBB3_4:
 ; CHECK-LE-NEXT:    mov.w r12, #0
 ; CHECK-LE-NEXT:    movs r1, #0
 ; CHECK-LE-NEXT:    add.w r0, r12, r1
-; CHECK-LE-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-LE-NEXT:    pop {r4, pc}
 ;
 ; CHECK-BE-LABEL: and_user:
 ; CHECK-BE:       @ %bb.0: @ %entry
Index: test/CodeGen/ARM/ParallelDSP/overlapping.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/overlapping.ll
+++ test/CodeGen/ARM/ParallelDSP/overlapping.ll
@@ -7,12 +7,12 @@
 ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
 ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
 ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc)
 ; CHECK: [[CAST_A_1:%[^ ]+]] = bitcast i16* [[ADDR_A_1]] to i32*
 ; CHECK: [[LD_A_1:%[^ ]+]] = load i32, i32* [[CAST_A_1]]
 ; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[ADDR_B_1]] to i32*
 ; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
-; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_1]], i32 [[LD_B_1]], i32 %acc)
-; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[ACC]])
+; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_1]], i32 [[LD_B_1]], i32 [[ACC]])
 ; CHECK: ret i32 [[RES]]
 define i32 @overlap_1(i16* %a, i16* %b, i32 %acc) {
 entry:
@@ -51,12 +51,12 @@
 ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
 ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
 ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 %acc)
 ; CHECK: [[CAST_A_1:%[^ ]+]] = bitcast i16* [[ADDR_A_1]] to i32*
 ; CHECK: [[LD_A_1:%[^ ]+]] = load i32, i32* [[CAST_A_1]]
 ; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[ADDR_B_1]] to i32*
 ; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
-; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A_1]], i32 [[LD_B_1]], i64 %acc)
-; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 [[ACC]])
+; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A_1]], i32 [[LD_B_1]], i64 [[ACC]])
 ; CHECK: ret i64 [[RES]]
 define i64 @overlap_64_1(i16* %a, i16* %b, i64 %acc) {
 entry:
@@ -133,13 +133,14 @@
 ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
 ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
 ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc)
 ; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32*
 ; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
 ; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
 ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32*
 ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
-; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc)
-; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]])
+; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 [[SMLAD]])
+; CHECK: ret i32 [[RES]]
 define i32 @overlap_3(i16* %a, i16* %b, i32 %acc) {
 entry:
   %addr.a.1 = getelementptr i16, i16* %a, i32 1
@@ -178,13 +179,14 @@
 ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
 ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
 ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc)
 ; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32*
 ; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
 ; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
 ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32*
 ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
-; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc)
-; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]])
+; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 [[SMLAD]])
+; CHECK: ret i32 [[RES]]
 define i32 @overlap_4(i16* %a, i16* %b, i32 %acc) {
 entry:
   %addr.a.1 = getelementptr i16, i16* %a, i32 1
Index: test/CodeGen/ARM/ParallelDSP/pr43073.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/pr43073.ll
+++ test/CodeGen/ARM/ParallelDSP/pr43073.ll
@@ -15,14 +15,14 @@
 ; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2
 ; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32*
 ; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2
+; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[ADD0]])
 ; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5
 ; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32*
 ; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2
 ; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4
 ; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32*
 ; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2
-; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[ADD0]])
-; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[ACC]])
+; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[ACC]])
 ; CHECK: ret i32 [[RES]]
 define i32 @first_mul_invalid(i16* nocapture readonly %in, i16* nocapture readonly %b) {
 entry:
@@ -88,14 +88,14 @@
 ; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2
 ; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32*
 ; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2
+; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[MUL0]])
 ; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5
 ; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32*
 ; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2
 ; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4
 ; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32*
 ; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2
-; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[MUL0]])
-; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[ACC]])
+; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[ACC]])
 ; CHECK: ret i32 [[RES]]
 define i32 @with_no_acc_input(i16* nocapture readonly %in, i16* nocapture readonly %b) {
 entry:
@@ -157,14 +157,14 @@
 ; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2
 ; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32*
 ; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2
+; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i64 [[ADD0]])
 ; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5
 ; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32*
 ; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2
 ; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4
 ; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32*
 ; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2
-; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i64 [[ADD0]])
-; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i64 [[ACC]])
+; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i64 [[ACC]])
 ; CHECK: ret i64 [[RES]]
 define i64 @with_64bit_acc(i16* nocapture readonly %in, i16* nocapture readonly %b) {
 entry:
@@ -238,13 +238,13 @@
 ; CHECK: [[Y_1:%[^ ]+]] = load i16, i16* [[ADDR_Y_MINUS_1]], align 2
 ; CHECK: [[SEXT_Y_1:%[^ ]+]] = sext i16 [[Y_1]] to i32
 ; CHECK: [[UNPAIRED:%[^ ]+]] = mul nsw i32 [[SEXT_Y_1]], [[SEXT_X_1]]
+; CHECK: [[SEXT:%[^ ]+]] = sext i32 [[UNPAIRED]] to i64
+; CHECK: [[ACC:%[^ ]+]] = add i64 [[SEXT]], [[ADD_1]]
 ; CHECK: [[ADDR_X_PLUS_2:%[^ ]+]] = bitcast i16* [[X_PLUS_2]] to i32*
 ; CHECK: [[X_2:%[^ ]+]] = load i32, i32* [[ADDR_X_PLUS_2]], align 2
 ; CHECK: [[Y_MINUS_3:%[^ ]+]] = getelementptr inbounds i16, i16* %py.8757.unr, i32 -3
 ; CHECK: [[ADDR_Y_MINUS_3:%[^ ]+]] = bitcast i16* [[Y_MINUS_3]] to i32*
 ; CHECK: [[Y_3:%[^ ]+]] = load i32, i32* [[ADDR_Y_MINUS_3]], align 2
-; CHECK: [[SEXT:%[^ ]+]] = sext i32 [[UNPAIRED]] to i64
-; CHECK: [[ACC:%[^ ]+]] = add i64 [[SEXT]], [[ADD_1]]
 ; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[Y_3]], i32 [[X_2]], i64 [[ACC]])
 ; CHECK: ret i64 [[RES]]
 define i64 @with_64bit_add_acc(i16* nocapture readonly %px.10756.unr, i16* nocapture readonly %py.8757.unr, i32 %acc) {
Index: test/CodeGen/ARM/ParallelDSP/smlad11.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlad11.ll
+++ test/CodeGen/ARM/ParallelDSP/smlad11.ll
@@ -10,10 +10,10 @@
 ; CHECK: [[V16:%[0-9]+]] = load i32, i32* [[V15]], align 2
 ; CHECK: [[V8:%[0-9]+]] = bitcast i16* %arrayidx8 to i32*
 ; CHECK: [[V9:%[0-9]+]] = load i32, i32* [[V8]], align 2
+; CHECK: [[ACC:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 %mac1{{\.}}054)
 ; CHECK: [[V13:%[0-9]+]] = bitcast i16* %arrayidx17 to i32*
 ; CHECK: [[V14:%[0-9]+]] = load i32, i32* [[V13]], align 2
-; CHECK: [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V14]], i32 [[V16]], i32 %mac1{{\.}}054)
-; CHECK: [[V17:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 [[V12]])
+; CHECK: [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V14]], i32 [[V16]], i32 [[ACC]])
 ;
 ; And we don't want to see a 3rd smlad:
 ; CHECK-NOT: call i32 @llvm.arm.smlad
Index: test/CodeGen/ARM/ParallelDSP/smladx-1.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smladx-1.ll
+++ test/CodeGen/ARM/ParallelDSP/smladx-1.ll
@@ -12,12 +12,13 @@
 ; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2
 ; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*
 ; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2
+; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN21]], i32 [[IN10]], i32 [[ACC0]])
+
 ; CHECK: [[PIN23:%[^ ]+]] = bitcast i16* %pIn2.3 to i32*
 ; CHECK: [[IN23:%[^ ]+]] = load i32, i32* [[PIN23]], align 2
 ; CHECK: [[PIN12:%[^ ]+]] = bitcast i16* %pIn1.2 to i32*
 ; CHECK: [[IN12:%[^ ]+]] = load i32, i32* [[PIN12]], align 2
-; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN23]], i32 [[IN12]], i32 [[ACC0]])
-; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN21]], i32 [[IN10]], i32 [[ACC1]])
+; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN23]], i32 [[IN12]], i32 [[ACC1]])
 
 ; CHECK-NOT: call i32 @llvm.arm.smlad
 ; CHECK-UNSUPPORTED-NOT: call i32 @llvm.arm.smlad
@@ -130,6 +131,7 @@
 ; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2
 ; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32*
 ; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2
+; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN2]], i32 [[IN1_2]], i32 [[ACC0]])
 
 ; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2
 ; CHECK: [[PIN2_2_CAST:%[^ ]+]] = bitcast i16* [[PIN2_2]] to i32*
@@ -138,8 +140,7 @@
 ; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32*
 ; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2
 
-; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN2_2]], i32 [[IN1]], i32 [[ACC0]])
-; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN2]], i32 [[IN1_2]], i32 [[ACC1]])
+; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN2_2]], i32 [[IN1]], i32 [[ACC1]])
 
 ; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4
 ; CHECK: [[PIN2_NEXT]] = getelementptr i16, i16* [[PIN2]], i32 -4
Index: test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll
+++ test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll
@@ -11,12 +11,12 @@
 ; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2
 ; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*
 ; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2
+; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC0]])
 ; CHECK: [[PIN23:%[^ ]+]] = bitcast i16* %pIn2.3 to i32*
 ; CHECK: [[IN23:%[^ ]+]] = load i32, i32* [[PIN23]], align 2
 ; CHECK: [[PIN12:%[^ ]+]] = bitcast i16* %pIn1.2 to i32*
 ; CHECK: [[IN12:%[^ ]+]] = load i32, i32* [[PIN12]], align 2
-; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC0]])
-; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC1]])
+; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC1]])
 
 ; CHECK-NOT: call i64 @llvm.arm.smlad
 ; CHECK-UNSUPPORTED-NOT: call i64 @llvm.arm.smlad
@@ -187,6 +187,7 @@
 ; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2
 ; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32*
 ; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2
+; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC0]])
 
 ; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2
 ; CHECK: [[PIN2_2_CAST:%[^ ]+]] = bitcast i16* [[PIN2_2]] to i32*
@@ -194,9 +195,7 @@
 ; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32*
 ; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2
-
-; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC0]])
-; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC1]])
+; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC1]])
 
 ; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4
 ; CHECK: [[PIN2_NEXT]] = getelementptr i16, i16* [[PIN2]], i32 -4
Index: test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll
+++ test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll
@@ -11,12 +11,12 @@
 ; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2
 ; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*
 ; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2
+; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC0]])
 ; CHECK: [[PIN23:%[^ ]+]] = bitcast i16* %pIn2.3 to i32*
 ; CHECK: [[IN23:%[^ ]+]] = load i32, i32* [[PIN23]], align 2
 ; CHECK: [[PIN12:%[^ ]+]] = bitcast i16* %pIn1.2 to i32*
 ; CHECK: [[IN12:%[^ ]+]] = load i32, i32* [[PIN12]], align 2
-; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC0]])
-; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC1]])
+; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC1]])
 
 ; CHECK-NOT: call i64 @llvm.arm.smlad
 ; CHECK-UNSUPPORTED-NOT: call i64 @llvm.arm.smlad
@@ -187,6 +187,7 @@
 ; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2
 ; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32*
 ; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2
+; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC0]])
 
 ; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2
 ; CHECK: [[PIN2_2_CAST:%[^ ]+]] = bitcast i16* [[PIN2_2]] to i32*
@@ -194,9 +195,7 @@
 ; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32*
 ; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2
-
-; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC0]])
-; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC1]])
+; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC1]])
 
 ; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4
 ; CHECK: [[PIN2_NEXT]] = getelementptr i16, i16* [[PIN2]], i32 -4
Index: test/MC/AsmParser/preserve-comments-crlf.s
===================================================================
--- test/MC/AsmParser/preserve-comments-crlf.s
+++ test/MC/AsmParser/preserve-comments-crlf.s
@@ -1,13 +1,13 @@
-	#RUN: llvm-mc -preserve-comments -n -triple i386-linux-gnu < %s > %t
-	#RUN: diff -b %s %t
-	.text
-
-foo:	#Comment here
-	#comment here
-	nop
-	#if DIRECTIVE COMMENT
-	## WHOLE LINE COMMENT
-	cmpl	$196, %eax	## EOL COMMENT
-	#endif
-	.ident	"clang version 3.9.0"
-	.section	".note.GNU-stack","",@progbits
+	#RUN: llvm-mc -preserve-comments -n -triple i386-linux-gnu < %s > %t
+	#RUN: diff -b %s %t
+	.text
+
+foo:	#Comment here
+	#comment here
+	nop
+	#if DIRECTIVE COMMENT
+	## WHOLE LINE COMMENT
+	cmpl	$196, %eax	## EOL COMMENT
+	#endif
+	.ident	"clang version 3.9.0"
+	.section	".note.GNU-stack","",@progbits