Index: llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -1285,6 +1285,7 @@
         // to be recomputed.
         LoLoop.BlockMasksToRecompute.insert(Block.getPredicateThen());
       } else if (Block.isVPST() && Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
+        MachineInstr *VCMP = nullptr;
         // The VPT block has a non-uniform predicate but it uses a vpst and its
         // entry is guarded only by a vctp, which means we:
         // - Need to remove the original vpst.
@@ -1295,8 +1296,13 @@
         // TODO: We could be producing more VPT blocks than necessary and could
         //       fold the newly created one into a proceeding one.
         for (auto I = ++MachineBasicBlock::iterator(Block.getPredicateThen()),
-                  E = ++MachineBasicBlock::iterator(Divergent->MI); I != E; ++I)
+                  E = ++MachineBasicBlock::iterator(Divergent->MI);
+             I != E; ++I) {
+          // Find a VCMP immediately preceding the insertion point of the new
+          // VPST, i.e. one that is the last instruction of the scanned range.
+          if (I->getOpcode() == ARM::MVE_VCMPs8 && ++I == E)
+            VCMP = &*(--I);
           RemovePredicate(&*I);
+        }

         unsigned Size = 0;
         auto E = MachineBasicBlock::reverse_iterator(Divergent->MI);
@@ -1307,13 +1313,32 @@
           ++Size;
           ++I;
         }
-        // Create a VPST (with a null mask for now, we'll recompute it later).
-        MachineInstrBuilder MIB = BuildMI(*InsertAt->getParent(), InsertAt,
-                                          InsertAt->getDebugLoc(),
-                                          TII->get(ARM::MVE_VPST));
-        MIB.addImm(0);
-        LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getPredicateThen());
-        LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
+        MachineInstrBuilder MIB;
+        LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: "
+                          << *Block.getPredicateThen());
+        if (VCMP) {
+          // Combine the VPST and VCMP into a VPT
+          MIB =
+              BuildMI(*InsertAt->getParent(), InsertAt, InsertAt->getDebugLoc(),
+                      TII->get(VCMPOpcodeToVPT(VCMP->getOpcode())));
+          MIB.addImm(ARMVCC::Then);
+          // Register one
+          MIB.add(VCMP->getOperand(1));
+          // Register two
+          MIB.add(VCMP->getOperand(2));
+          // The comparison code, e.g. ge, eq, lt
+          MIB.add(VCMP->getOperand(3));
+          LLVM_DEBUG(dbgs()
+                     << "ARM Loops: Combining with VCMP to VPT: " << *MIB);
+          LoLoop.ToRemove.insert(VCMP);
+        } else {
+          // Create a VPST (with a null mask for now, we'll recompute it later)
+          // (when a VCMP was found above, a VPT is created instead)
+          MIB = BuildMI(*InsertAt->getParent(), InsertAt,
+                        InsertAt->getDebugLoc(), TII->get(ARM::MVE_VPST));
+          MIB.addImm(0);
+          LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
+        }
         LoLoop.ToRemove.insert(Block.getPredicateThen());
         LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
       }
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
@@ -0,0 +1,49 @@
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -O3 -tail-predication=force-enabled-no-reductions -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc <16 x i8> @vcmp_vpst_combination(<16 x i8>* %pSrc, i16 zeroext %blockSize, i8* nocapture %pResult, i32* nocapture %pIndex) {
+; CHECK-LABEL: vcmp_vpst_combination:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    vmov.i8 q0, #0x7f
+; CHECK-NEXT:    dlstp.8 lr, r1
+; CHECK-NEXT:  .LBB0_1: @ %do.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrb.u8 q1, [r0]
+; CHECK-NEXT:    vpt.s8 ge, q0, q1
+; CHECK-NEXT:    vmovt q0, q1
+; CHECK-NEXT:    letp lr, .LBB0_1
+; CHECK-NEXT:  @ %bb.2: @ %do.end
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %conv = zext i16 %blockSize to i32
+  %0 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32 0, i32 1)
+  %1 = extractvalue { <16 x i8>, i32 } %0, 0
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %indexVec.0 = phi <16 x i8> [ %1, %entry ], [ %add, %do.body ]
+  %curExtremIdxVec.0 = phi <16 x i8> [ zeroinitializer, %entry ], [ %6, %do.body ]
+  %curExtremValVec.0 = phi <16 x i8> [ <i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127>, %entry ], [ %6, %do.body ]
+  %blkCnt.0 = phi i32 [ %conv, %entry ], [ %sub2, %do.body ]
+  %2 = tail call <16 x i1> @llvm.arm.mve.vctp8(i32 %blkCnt.0)
+  %3 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %pSrc, i32 1, <16 x i1> %2, <16 x i8> zeroinitializer)
+  %4 = icmp sle <16 x i8> %3, %curExtremValVec.0
+  %5 = and <16 x i1> %4, %2
+  %6 = tail call <16 x i8> @llvm.arm.mve.orr.predicated.v16i8.v16i1(<16 x i8> %3, <16 x i8> %3, <16 x i1> %5, <16 x i8> %curExtremValVec.0)
+  %add = add <16 x i8> %indexVec.0, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
+  %sub2 = add nsw i32 %blkCnt.0, -16
+  %cmp = icmp sgt i32 %blkCnt.0, 16
+  br i1 %cmp, label %do.body, label %do.end
+
+do.end:                                           ; preds = %do.body
+  ret <16 x i8> %6
+}
+
+declare { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32, i32)
+
+declare <16 x i1> @llvm.arm.mve.vctp8(i32)
+
+declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
+
+declare <16 x i8> @llvm.arm.mve.orr.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>)
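
Note on the intended effect: without this change, the tail-predicated loop body
would keep the compare and the block-introducing instruction separate, emitting
a VCMP that writes VPR followed by a VPST guarding the VMOV. Because the VPST's
block is predicated only on the VCTP, the compare can be folded into a single
VPT. A rough before/after sketch of the generated MVE assembly follows; the
"before" column is illustrative, while the "after" column matches the test's
CHECK lines above:

    @ before: separate VCMP + VPST        @ after: compare folded into VPT
        vldrb.u8 q1, [r0]                     vldrb.u8 q1, [r0]
        vcmp.s8  ge, q0, q1                   vpt.s8   ge, q0, q1
        vpst                                  vmovt    q0, q1
        vmovt    q0, q1

This saves one instruction per loop iteration and removes the standalone
VPR-producing VCMP from the predicated block.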