Index: llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp +++ llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp @@ -68,7 +68,7 @@ } LoadInst *getBaseLoad() const { - return cast(LHS); + return VecLd.front(); } }; @@ -696,13 +696,15 @@ // Loads[0] needs trunc while Loads[1] needs a lshr and trunc. // TODO: Support big-endian as well. Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType()); - BaseSExt->setOperand(0, Bottom); + Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType()); + BaseSExt->replaceAllUsesWith(NewBaseSExt); IntegerType *OffsetTy = cast(Offset->getType()); Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth()); Value *Top = IRB.CreateLShr(WideLoad, ShiftVal); Value *Trunc = IRB.CreateTrunc(Top, OffsetTy); - OffsetSExt->setOperand(0, Trunc); + Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType()); + OffsetSExt->replaceAllUsesWith(NewOffsetSExt); WideLoads.emplace(std::make_pair(Base, make_unique(Loads, WideLoad))); Index: llvm/trunk/test/CodeGen/ARM/ParallelDSP/blocks.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/ParallelDSP/blocks.ll +++ llvm/trunk/test/CodeGen/ARM/ParallelDSP/blocks.ll @@ -5,7 +5,7 @@ ; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]] ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* ; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 %acc) +; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 %acc) define i32 @single_block(i16* %a, i16* %b, i32 %acc) { entry: %ld.a.0 = load i16, i16* %a @@ -30,7 +30,7 @@ ; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]] ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* ; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 0) +; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 0) define i32 @multi_block(i16* %a, i16* %b, i32 %acc) { entry: %ld.a.0 = load i16, i16* %a Index: llvm/trunk/test/CodeGen/ARM/ParallelDSP/exchange.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/ParallelDSP/exchange.ll +++ llvm/trunk/test/CodeGen/ARM/ParallelDSP/exchange.ll @@ -0,0 +1,329 @@ +; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s + +; CHECK-LABEL: exchange_1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]] +define i32 @exchange_1(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.1 + %mul.1 = mul i32 %sext.a.1, %sext.b.0 + %add = add i32 %mul.0, %mul.1 + %res = add i32 %add, %acc + ret i32 %res +} + +; CHECK-LABEL: exchange_2 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]] +define i32 @exchange_2(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.b.1, %sext.a.0 + %mul.1 = mul i32 %sext.b.0, %sext.a.1 + %add = add i32 %mul.0, %mul.1 + %res = add i32 %add, %acc + ret i32 %res +} + +; CHECK-LABEL: exchange_3 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]] +define i32 @exchange_3(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.1 + %mul.1 = mul i32 %sext.a.1, %sext.b.0 + %add = add i32 %mul.1, %mul.0 + %res = add i32 %add, %acc + ret i32 %res +} + +; CHECK-LABEL: exchange_4 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]] +define i32 @exchange_4(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.b.1, %sext.a.0 + %mul.1 = mul i32 %sext.b.0, %sext.a.1 + %add = add i32 %mul.1, %mul.0 + %res = add i32 %add, %acc + ret i32 %res +} + +; CHECK-LABEL: exchange_multi_use_1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]], i32 %acc +; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B]], i32 [[X]]) +define i32 @exchange_multi_use_1(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.1 + %mul.1 = mul i32 %sext.a.1, %sext.b.0 + %add = add i32 %mul.0, %mul.1 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.a.3 = load i16, i16* %addr.a.3 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %mul.2 = mul i32 %sext.a.3, %sext.b.1 + %mul.3 = mul i32 %sext.a.2, %sext.b.0 + %add.1 = add i32 %mul.2, %mul.3 + %add.2 = add i32 %add, %add.1 + %res = add i32 %add.2, %acc + ret i32 %res +} + +; CHECK-LABEL: exchange_multi_use_2 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 [[X]]) +define i32 @exchange_multi_use_2(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %add = add i32 %mul.0, %mul.1 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.a.3 = load i16, i16* %addr.a.3 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %mul.2 = mul i32 %sext.b.0, %sext.a.3 + %mul.3 = mul i32 %sext.b.1, %sext.a.2 + %add.1 = add i32 %mul.2, %mul.3 + %add.2 = add i32 %add, %add.1 + %res = add i32 %add.2, %acc + ret i32 %res +} + +; TODO: Why aren't two intrinsics generated? +; CHECK-LABEL: exchange_multi_use_3 +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; CHECK-NOT: call i32 @llvm.arm.smlad +; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 0 +define i32 @exchange_multi_use_3(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.a.3 = load i16, i16* %addr.a.3 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %mul.2 = mul i32 %sext.b.0, %sext.a.3 + %mul.3 = mul i32 %sext.b.1, %sext.a.2 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %add = add i32 %mul.0, %mul.1 + %add.1 = add i32 %mul.2, %mul.3 + %sub = sub i32 %add, %add.1 + %res = add i32 %acc, %sub + ret i32 %res +} + +; TODO: Why isn't smladx generated too? +; CHECK-LABEL: exchange_multi_use_4 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 0 +; CHECK-NOT: call i32 @llvm.arm.smlad +define i32 @exchange_multi_use_4(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.a.3 = load i16, i16* %addr.a.3 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %mul.2 = mul i32 %sext.b.0, %sext.a.3 + %mul.3 = mul i32 %sext.b.1, %sext.a.2 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %add.1 = add i32 %mul.2, %mul.3 + %add = add i32 %mul.0, %mul.1 + %sub = sub i32 %add, %add.1 + %res = add i32 %acc, %sub + ret i32 %res +} + +; CHECK-LABEL: exchange_swap +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]] +define i32 @exchange_swap(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.1, %sext.b.0 + %mul.1 = mul i32 %sext.a.0, %sext.b.1 + %add = add i32 %mul.0, %mul.1 + %res = add i32 %add, %acc + ret i32 %res +} + +; CHECK-LABEL: exchange_swap_2 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]] +define i32 @exchange_swap_2(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.1, %sext.b.0 + %mul.1 = mul i32 %sext.a.0, %sext.b.1 + %add = add i32 %mul.1, %mul.0 + %res = add i32 %add, %acc + ret i32 %res +} + +; CHECK-LABEL: exchange_swap_3 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]] +define i32 @exchange_swap_3(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.b.0, %sext.a.1 + %mul.1 = mul i32 %sext.b.1, %sext.a.0 + %add = add i32 %mul.1, %mul.0 + %res = add i32 %add, %acc + ret i32 %res +} Index: llvm/trunk/test/CodeGen/ARM/ParallelDSP/overlapping.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/ParallelDSP/overlapping.ll +++ llvm/trunk/test/CodeGen/ARM/ParallelDSP/overlapping.ll @@ -0,0 +1,161 @@ +; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s + +; CHECK-LABEL: overlap_1 +; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 1 +; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc +define i32 @overlap_1(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.b.2 = getelementptr i16, i16* %b, i32 2 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.b.2 = load i16, i16* %addr.b.2 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.b.2 = sext i16 %ld.b.2 to i32 + %mul.2 = mul i32 %sext.a.2, %sext.b.2 + %add = add i32 %mul.0, %mul.1 + %add.1 = add i32 %mul.1, %mul.2 + %add.2 = add i32 %add.1, %add + %res = add i32 %add.2, %acc + ret i32 %res +} + +; CHECK-LABEL: overlap_2 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc +define i32 @overlap_2(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.b.2 = getelementptr i16, i16* %b, i32 2 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.b.2 = load i16, i16* %addr.b.2 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.b.2 = sext i16 %ld.b.2 to i32 + %mul.2 = mul i32 %sext.b.2, %sext.a.2 + %add = add i32 %mul.0, %mul.1 + %add.1 = add i32 %mul.1, %mul.2 + %add.2 = add i32 %add, %add.1 + %res = add i32 %add.2, %acc + ret i32 %res +} + +; CHECK-LABEL: overlap_3 +; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32* +; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] +; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc) +; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]]) +define i32 @overlap_3(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.b.2 = getelementptr i16, i16* %b, i32 2 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.b.2 = load i16, i16* %addr.b.2 + %ld.a.3 = load i16, i16* %addr.a.3 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.b.2 = sext i16 %ld.b.2 to i32 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %mul.2 = mul i32 %sext.a.2, %sext.b.1 + %mul.3 = mul i32 %sext.a.3, %sext.b.2 + %add = add i32 %mul.0, %mul.1 + %add.1 = add i32 %mul.2, %mul.3 + %add.2 = add i32 %add.1, %add + %res = add i32 %add.2, %acc + ret i32 %res +} + +; CHECK-LABEL: overlap_4 +; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32* +; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] +; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc) +; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]]) +define i32 @overlap_4(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.b.2 = getelementptr i16, i16* %b, i32 2 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.b.2 = load i16, i16* %addr.b.2 + %ld.a.3 = load i16, i16* %addr.a.3 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.b.2 = sext i16 %ld.b.2 to i32 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %mul.2 = mul i32 %sext.b.2, %sext.a.2 + %mul.3 = mul i32 %sext.b.1, %sext.a.3 + %add = add i32 %mul.0, %mul.1 + %add.1 = add i32 %mul.2, %mul.3 + %add.2 = add i32 %add.1, %add + %res = add i32 %add.2, %acc + ret i32 %res +}