diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3296,6 +3296,56 @@
     }
     break;
   }
+  case Intrinsic::arm_mve_minv_u:
+  case Intrinsic::arm_mve_minv_s: {
+    // If the scalar argument is of a type narrower than i32, only its low
+    // bits take part in the minimum, so only those bits of operand 0 are
+    // demanded.
+    unsigned ScalarWidth = II->getArgOperand(1)
+                               ->getType()
+                               ->getVectorElementType()
+                               ->getScalarSizeInBits();
+
+    bool Modified = false;
+
+    KnownBits ScalarKnown(32);
+    if (SimplifyDemandedBits(II, 0, APInt::getLowBitsSet(32, ScalarWidth),
+                             ScalarKnown, 0))
+      Modified = true;
+    // The result always fits in the element type, so annotate the call with
+    // !range metadata: [0, 2^w) for the unsigned minimum, and the
+    // corresponding signed interval [-2^(w-1), 2^(w-1)) for the signed one.
+    if (ScalarWidth < 32 && !II->getMetadata(LLVMContext::MD_range)) {
+      uint32_t Lo = 0, Hi = (uint32_t)1 << ScalarWidth;
+      if (IID == Intrinsic::arm_mve_minv_s) {
+        uint32_t Offset = Hi >> 1;
+        Lo -= Offset;
+        Hi -= Offset;
+      }
+      Type *IntTy32 = Type::getInt32Ty(II->getContext());
+      Metadata *M[] = {ConstantAsMetadata::get(ConstantInt::get(IntTy32, Lo)),
+                       ConstantAsMetadata::get(ConstantInt::get(IntTy32, Hi))};
+      II->setMetadata(LLVMContext::MD_range, MDNode::get(II->getContext(), M));
+      Modified = true;
+    }
+    if (Modified)
+      return II;
+    break;
+  }
+  case Intrinsic::arm_mve_vadc:
+  case Intrinsic::arm_mve_vadc_predicated: {
+    // Only bit 29 of the carry-in argument is read (the position of the
+    // carry flag within FPSCR), so no other bits are demanded.
+    unsigned CarryOp =
+        (II->getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
+    Value *CarryArg = II->getArgOperand(CarryOp);
+    unsigned CarryWidth = CarryArg->getType()->getScalarSizeInBits();
+
+    KnownBits CarryKnown(32);
+    if (SimplifyDemandedBits(II, CarryOp, APInt::getOneBitSet(CarryWidth, 29),
+                             CarryKnown, 0))
+      return II;
+    break;
+  }
   case Intrinsic::amdgcn_rcp: {
     Value *Src = II->getArgOperand(0);
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc-multiple.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc-multiple.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc-multiple.ll
@@ -0,0 +1,50 @@
+; RUN: opt -instcombine -S %s | FileCheck --check-prefix=IR %s
+; RUN: opt -instcombine %s | llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -O3 -o - | FileCheck --check-prefix=ASM %s
+
+%struct.foo = type { [2 x <4 x i32>] }
+
+; Function Attrs: noinline nounwind optnone
+define arm_aapcs_vfpcc i32 @test_vadciq_multiple(%struct.foo %a, %struct.foo %b, i32 %carry) {
+entry:
+  %a.0 = extractvalue %struct.foo %a, 0, 0
+  %a.1 = extractvalue %struct.foo %a, 0, 1
+  %b.0 = extractvalue %struct.foo %b, 0, 0
+  %b.1 = extractvalue %struct.foo %b, 0, 1
+
+  %fpscr.in.0 = shl i32 %carry, 29
+  %outpair.0 = call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a.0, <4 x i32> %b.0, i32 %fpscr.in.0)
+  %fpscr.out.0 = extractvalue { <4 x i32>, i32 } %outpair.0, 1
+  %shifted.out.0 = lshr i32 %fpscr.out.0, 29
+  %carry.out.0 = and i32 1, %shifted.out.0
+  %fpscr.in.1 = shl i32 %carry.out.0, 29
+  %outpair.1 = call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a.1, <4 x i32> %b.1, i32 %fpscr.in.1)
+  %fpscr.out.1 = extractvalue { <4 x i32>, i32 } %outpair.1, 1
+  %shifted.out.1 = lshr i32 %fpscr.out.1, 29
+  %carry.out.1 = and i32 1, %shifted.out.1
+  ret i32 %carry.out.1
+}
+
+declare { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32>, <4 x i32>, i32)
+
+; Expect the code between the two intrinsic calls, which converts the
+; FPSCR-formatted output value into just the carry bit at bit 0 and
+; then back again for the next call, to be optimized away completely
+; by InstCombine, so that the FPSCR output from one intrinsic is
+; passed straight on to the next:
+
+; IR: %outpair.0 = call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a.0, <4 x i32> %b.0, i32 %fpscr.in.0)
+; IR: %fpscr.out.0 = extractvalue { <4 x i32>, i32 } %outpair.0, 1
+; IR: %outpair.1 = call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a.1, <4 x i32> %b.1, i32 %fpscr.out.0)
+
+; And this is the assembly language we expect at the end of it, with
+; the two vadc.i32 instructions right next to each other, and the
+; second one implicitly reusing the FPSCR written by the first.
+
+; ASM: test_vadciq_multiple:
+; ASM: lsls r0, r0, #29
+; ASM-NEXT: vmsr fpscr_nzcvqc, r0
+; ASM-NEXT: vadc.i32 q0, q0, q2
+; ASM-NEXT: vadc.i32 q0, q1, q3
+; ASM-NEXT: vmrs r0, fpscr_nzcvqc
+; ASM-NEXT: ubfx r0, r0, #29, #1
+; ASM-NEXT: bx lr
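
Note: the minv half of the patch is not exercised by the test above. Here is a
minimal IR sketch of its expected effect, assuming the unsigned variant is
spelled @llvm.arm.mve.minv.u.v8i16 in IR; the exact intrinsic name and type
suffix are assumptions for illustration, not taken from this patch. For i16
elements, ScalarWidth is 16, so the combine narrows the demanded bits of the
scalar operand to its low 16 bits and attaches !range metadata bounding the
i32 result to [0, 65536):

  ; intrinsic name/suffix assumed for illustration
  %r = call i32 @llvm.arm.mve.minv.u.v8i16(i32 %prev, <8 x i16> %vec)

  ; after -instcombine:
  %r = call i32 @llvm.arm.mve.minv.u.v8i16(i32 %prev, <8 x i16> %vec), !range !0
  !0 = !{i32 0, i32 65536}

For the signed variant, Lo and Hi are both shifted down by Offset = Hi >> 1 =
32768, giving !{i32 -32768, i32 32768}, i.e. the signed interval
[-32768, 32768). The demanded-bits narrowing also lets InstCombine strip, for
example, a zext/trunc chain feeding the scalar operand, since only the low 16
bits of it are read.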