Index: lib/Target/ARM/ARMParallelDSP.cpp
===================================================================
--- lib/Target/ARM/ARMParallelDSP.cpp
+++ lib/Target/ARM/ARMParallelDSP.cpp
@@ -36,7 +36,10 @@
 
 #define DEBUG_TYPE "arm-parallel-dsp"
 
-STATISTIC(NumSMLAD , "Number of smlad instructions generated");
+STATISTIC(NumSMLAD , "Number of smla*d instructions generated");
+STATISTIC(NumSMLADX, "Number of smla*dx instructions generated");
+STATISTIC(NumSMUAD, "Number of smuad instructions generated");
+STATISTIC(NumSMUADX, "Number of smuadx instructions generated");
 
 static cl::opt<bool>
 DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false),
@@ -152,6 +155,7 @@
     /// Return the add instruction which is the root of the reduction.
     Instruction *getRoot() { return Root; }
 
+    /// Return whether this reduction is accumulating into a 64-bit result.
     bool is64Bit() const { return Root->getType()->isIntegerTy(64); }
 
     /// Return the incoming value to be accumulated. This maybe null.
@@ -621,22 +625,41 @@
    Value* Args[] = { WideLd0, WideLd1, Acc };
    Function *SMLAD = nullptr;
-    if (Exchange)
+    if (Exchange) {
      SMLAD = Acc->getType()->isIntegerTy(32) ?
              Intrinsic::getDeclaration(M, Intrinsic::arm_smladx) :
              Intrinsic::getDeclaration(M, Intrinsic::arm_smlaldx);
-    else
+      ++NumSMLADX;
+    } else {
      SMLAD = Acc->getType()->isIntegerTy(32) ?
              Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) :
              Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);
+      ++NumSMLAD;
+    }
    IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
                                ++BasicBlock::iterator(InsertAfter));
    Instruction *Call = Builder.CreateCall(SMLAD, Args);
-    NumSMLAD++;
    return Call;
  };
 
+  auto CreateSMUAD = [&](LoadInst *LHS, LoadInst *RHS, bool Exchange,
+                         Instruction *InsertAfter) {
+    IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
+                                ++BasicBlock::iterator(InsertAfter));
+    Value* Args[] = { LHS, RHS };
+    Function *SMUAD = Exchange ?
+      Intrinsic::getDeclaration(M, Intrinsic::arm_smuadx) :
+      Intrinsic::getDeclaration(M, Intrinsic::arm_smuad);
+
+    if (Exchange)
+      ++NumSMUADX;
+    else
+      ++NumSMUAD;
+
+    return Builder.CreateCall(SMUAD, Args);
+  };
+
  Instruction *InsertAfter = R.getRoot();
  Value *Acc = R.getAccumulator();
@@ -666,10 +689,10 @@
      InsertAfter = cast<Instruction>(Acc);
    }
 
-    if (!Acc)
-      Acc = R.is64Bit() ?
-        ConstantInt::get(IntegerType::get(M->getContext(), 64), 0) :
-        ConstantInt::get(IntegerType::get(M->getContext(), 32), 0);
+    // With no accumulator input, we can generate a 32-bit smuad; otherwise
+    // we need a 64-bit zero input for smlald.
+    if (!Acc && R.is64Bit())
+      Acc = ConstantInt::get(IntegerType::get(M->getContext(), 64), 0);
 
  IntegerType *Ty = IntegerType::get(M->getContext(), 32);
  for (auto &Pair : R.getMulPairs()) {
@@ -682,7 +705,10 @@
      LoadInst *WideRHS = WideLoads.count(BaseRHS) ?
        WideLoads[BaseRHS]->getLoad() : CreateWideLoad(RHSMul->VecLd, Ty);
 
-      Acc = CreateSMLAD(WideLHS, WideRHS, Acc, RHSMul->Exchange, InsertAfter);
+      if (!Acc)
+        Acc = CreateSMUAD(WideLHS, WideRHS, RHSMul->Exchange, InsertAfter);
+      else
+        Acc = CreateSMLAD(WideLHS, WideRHS, Acc, RHSMul->Exchange, InsertAfter);
      InsertAfter = cast<Instruction>(Acc);
    }
    R.UpdateRoot(cast<Instruction>(Acc));
Index: test/CodeGen/ARM/ParallelDSP/blocks.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/blocks.ll
+++ test/CodeGen/ARM/ParallelDSP/blocks.ll
@@ -57,7 +57,7 @@
 ; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
 ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
 ; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
-; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 0)
+; CHECK: call i32 @llvm.arm.smuad(i32 [[A]], i32 [[B]])
 define i32 @multi_block(i16* %a, i16* %b, i32 %acc) {
 entry:
   %ld.a.0 = load i16, i16* %a
Index: test/CodeGen/ARM/ParallelDSP/exchange.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/exchange.ll
+++ test/CodeGen/ARM/ParallelDSP/exchange.ll
@@ -267,7 +267,7 @@
 ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
 ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
 ; CHECK-NOT: call i32 @llvm.arm.smlad
-; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 0
+; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smuadx(i32 [[LD_B]], i32 [[LD_A_2]])
 define i32 @exchange_multi_use_3(i16* %a, i16* %b, i32 %acc) {
 entry:
   %addr.a.1 = getelementptr i16, i16* %a, i32 1
@@ -345,7 +345,7 @@
 ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
 ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
 ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
-; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 0
+; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smuad(i32 [[LD_A]], i32 [[LD_B]])
 ; CHECK-NOT: call i32 @llvm.arm.smlad
 define i32 @exchange_multi_use_4(i16* %a, i16* %b, i32 %acc) {
 entry:
Index: test/CodeGen/ARM/ParallelDSP/inner-full-unroll.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/inner-full-unroll.ll
+++ test/CodeGen/ARM/ParallelDSP/inner-full-unroll.ll
@@ -17,9 +17,9 @@
 ; CHECK: [[CIJ_2:%[^ ]+]] = getelementptr inbounds i16, i16* [[CIJ]], i32 2
 ; CHECK: [[CIJ_2_CAST:%[^ ]+]] = bitcast i16* [[CIJ_2]] to i32*
 ; CHECK: [[CIJ_2_LD:%[^ ]+]] = load i32, i32* [[CIJ_2_CAST]], align 2
-; CHECK: [[SMLAD0:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_2_LD]], i32 [[BIJ_2_LD]], i32 0)
-; CHECK: [[SMLAD1:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_LD]], i32 [[BIJ_LD]], i32 [[SMLAD0]])
-; CHECK: store i32 [[SMLAD1]], i32* %arrayidx, align 4
+; CHECK: [[SMUAD:%[^ ]+]] = call i32 @llvm.arm.smuad(i32 [[CIJ_2_LD]], i32 [[BIJ_2_LD]])
+; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_LD]], i32 [[BIJ_LD]], i32 [[SMUAD]])
+; CHECK: store i32 [[SMLAD]], i32* %arrayidx, align 4
 
 define void @full_unroll(i32* noalias nocapture %a, i16** noalias nocapture readonly %b, i16** noalias nocapture readonly %c, i32 %N) {
 entry:
Index: test/CodeGen/ARM/ParallelDSP/smlad11.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlad11.ll
+++ test/CodeGen/ARM/ParallelDSP/smlad11.ll
@@ -18,7 +18,7 @@
 ; And we don't want to see a 3rd smlad:
 ; CHECK-NOT: call i32 @llvm.arm.smlad
 ;
-; CHECK: 2 arm-parallel-dsp - Number of smlad instructions generated
+; CHECK: 2 arm-parallel-dsp - Number of smla*d instructions generated
 ;
 define dso_local i32 @test(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
 entry:
Index: test/CodeGen/ARM/ParallelDSP/smuad.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/ParallelDSP/smuad.ll
@@ -0,0 +1,142 @@
+; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s
+
+; TODO: We should generate a smuad as the first intrinsic, but we don't
+; because the reduction searches until no more adds are found and we only
+; query the type of the final result.
+; CHECK-LABEL: smuad_smlald
+; CHECK: call i64 @llvm.arm.smlald
+; CHECK: call i64 @llvm.arm.smlald
+define i64 @smuad_smlald(i16* %a, i16* %b) {
+entry:
+  %ld.a.0 = load i16, i16* %a
+  %sext.a.0 = sext i16 %ld.a.0 to i32
+  %ld.b.0 = load i16, i16* %b
+  %sext.b.0 = sext i16 %ld.b.0 to i32
+  %mul.0 = mul i32 %sext.a.0, %sext.b.0
+  %addr.a.1 = getelementptr i16, i16* %a, i32 1
+  %addr.b.1 = getelementptr i16, i16* %b, i32 1
+  %ld.a.1 = load i16, i16* %addr.a.1
+  %sext.a.1 = sext i16 %ld.a.1 to i32
+  %ld.b.1 = load i16, i16* %addr.b.1
+  %sext.b.1 = sext i16 %ld.b.1 to i32
+  %mul.1 = mul i32 %sext.a.1, %sext.b.1
+  %add = add i32 %mul.0, %mul.1
+  %acc = sext i32 %add to i64
+  %addr.a.2 = getelementptr i16, i16* %a, i32 2
+  %addr.b.2 = getelementptr i16, i16* %b, i32 2
+  %ld.a.2 = load i16, i16* %addr.a.2
+  %sext.a.2 = sext i16 %ld.a.2 to i64
+  %ld.b.2 = load i16, i16* %addr.b.2
+  %sext.b.2 = sext i16 %ld.b.2 to i64
+  %mul.2 = mul i64 %sext.a.2, %sext.b.2
+  %addr.a.3 = getelementptr i16, i16* %a, i32 3
+  %addr.b.3 = getelementptr i16, i16* %b, i32 3
+  %ld.a.3 = load i16, i16* %addr.a.3
+  %sext.a.3 = sext i16 %ld.a.3 to i64
+  %ld.b.3 = load i16, i16* %addr.b.3
+  %sext.b.3 = sext i16 %ld.b.3 to i64
+  %mul.3 = mul i64 %sext.a.3, %sext.b.3
+  %add.1 = add i64 %mul.2, %mul.3
+  %res = add i64 %add.1, %acc
+  ret i64 %res
+}
+
+; CHECK-LABEL: single_block_smuad
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: call i32 @llvm.arm.smuad(i32 [[A]], i32 [[B]])
+define i32 @single_block_smuad(i16* %a, i16* %b) {
+entry:
+  %ld.a.0 = load i16, i16* %a
+  %sext.a.0 = sext i16 %ld.a.0 to i32
+  %ld.b.0 = load i16, i16* %b
+  %sext.b.0 = sext i16 %ld.b.0 to i32
+  %addr.a.1 = getelementptr i16, i16* %a, i32 1
+  %addr.b.1 = getelementptr i16, i16* %b, i32 1
+  %ld.a.1 = load i16, i16* %addr.a.1
+  %sext.a.1 = sext i16 %ld.a.1 to i32
+  %ld.b.1 = load i16, i16* %addr.b.1
+  %sext.b.1 = sext i16 %ld.b.1 to i32
+  %mul.0 = mul i32 %sext.a.0, %sext.b.0
+  %mul.1 = mul i32 %sext.a.1, %sext.b.1
+  %add = add i32 %mul.0, %mul.1
+  ret i32 %add
+}
+
+; CHECK-LABEL: single_block_smuadx
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: call i32 @llvm.arm.smuadx(i32 [[A]], i32 [[B]])
+define i32 @single_block_smuadx(i16* %a, i16* %b) {
+entry:
+  %ld.a.0 = load i16, i16* %a
+  %sext.a.0 = sext i16 %ld.a.0 to i32
+  %ld.b.0 = load i16, i16* %b
+  %sext.b.0 = sext i16 %ld.b.0 to i32
+  %addr.a.1 = getelementptr i16, i16* %a, i32 1
+  %addr.b.1 = getelementptr i16, i16* %b, i32 1
+  %ld.a.1 = load i16, i16* %addr.a.1
+  %sext.a.1 = sext i16 %ld.a.1 to i32
+  %ld.b.1 = load i16, i16* %addr.b.1
+  %sext.b.1 = sext i16 %ld.b.1 to i32
+  %mul.0 = mul i32 %sext.a.0, %sext.b.1
+  %mul.1 = mul i32 %sext.a.1, %sext.b.0
+  %add = add i32 %mul.0, %mul.1
+  ret i32 %add
+}
+
+; CHECK-LABEL: not_smuad
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: call i64 @llvm.arm.smlald(i32 [[B]], i32 [[A]], i64 0)
+define i64 @not_smuad(i16* %a, i16* %b) {
+entry:
+  %ld.a.0 = load i16, i16* %a
+  %sext.a.0 = sext i16 %ld.a.0 to i32
+  %ld.b.0 = load i16, i16* %b
+  %sext.b.0 = sext i16 %ld.b.0 to i32
+  %addr.a.1 = getelementptr i16, i16* %a, i32 1
+  %ld.a.1 = load i16, i16* %addr.a.1
+  %sext.a.1 = sext i16 %ld.a.1 to i32
+  %addr.b.1 = getelementptr i16, i16* %b, i32 1
+  %ld.b.1 = load i16, i16* %addr.b.1
+  %sext.b.1 = sext i16 %ld.b.1 to i32
+  %mul.0 = mul i32 %sext.a.1, %sext.b.1
+  %mul.1 = mul i32 %sext.a.0, %sext.b.0
+  %sext.mul.0 = sext i32 %mul.0 to i64
+  %sext.mul.1 = sext i32 %mul.1 to i64
+  %add = add i64 %sext.mul.0, %sext.mul.1
+  ret i64 %add
+}
+
+; CHECK-LABEL: not_smuadx
+; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
+; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
+; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
+; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
+; CHECK: call i64 @llvm.arm.smlaldx(i32 [[B]], i32 [[A]], i64 0)
+define i64 @not_smuadx(i16* %a, i16* %b) {
+entry:
+  %ld.a.0 = load i16, i16* %a
+  %sext.a.0 = sext i16 %ld.a.0 to i32
+  %ld.b.0 = load i16, i16* %b
+  %sext.b.0 = sext i16 %ld.b.0 to i32
+  %addr.a.1 = getelementptr i16, i16* %a, i32 1
+  %ld.a.1 = load i16, i16* %addr.a.1
+  %sext.a.1 = sext i16 %ld.a.1 to i32
+  %addr.b.1 = getelementptr i16, i16* %b, i32 1
+  %ld.b.1 = load i16, i16* %addr.b.1
+  %sext.b.1 = sext i16 %ld.b.1 to i32
+  %mul.0 = mul i32 %sext.a.1, %sext.b.0
+  %mul.1 = mul i32 %sext.a.0, %sext.b.1
+  %sext.mul.0 = sext i32 %mul.0 to i64
+  %sext.mul.1 = sext i32 %mul.1 to i64
+  %add = add i64 %sext.mul.0, %sext.mul.1
+  ret i64 %add
+}