Index: llvm/lib/Target/ARM/ARMInstrMVE.td =================================================================== --- llvm/lib/Target/ARM/ARMInstrMVE.td +++ llvm/lib/Target/ARM/ARMInstrMVE.td @@ -5840,6 +5840,7 @@ let mayLoad = load; let mayStore = !eq(load,0); let hasSideEffects = 0; + let validForTailPredication = load; } // A parameter class used to encapsulate all the ways the writeback Index: llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp =================================================================== --- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -769,7 +769,7 @@ // the false lanes are zeroed and here we're trying to track that those false // lanes remain zero, or where they change, the differences are masked away // by their user(s). - // All MVE loads and stores have to be predicated, so we know that any load + // All MVE stores have to be predicated, so we know that any predicated load // operands, or stored results are equivalent already. Other explicitly // predicated instructions will perform the same operation in the original // loop and the tail-predicated form too. Because of this, we can insert @@ -1025,8 +1025,8 @@ } // If the instruction is already explicitly predicated, then the conversion - // will be fine, but ensure that all memory operations are predicated. - return !IsUse && MI->mayLoadOrStore() ? false : true; + // will be fine, but ensure that all store operations are predicated. + return !IsUse && MI->mayStore() ? 
false : true; } bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll @@ -6,26 +6,17 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: subs.w r12, r2, #8 -; CHECK-NEXT: mov.w r3, #-1 -; CHECK-NEXT: csinv r3, r3, r12, pl -; CHECK-NEXT: add.w r12, r3, r2 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #3 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.16 lr, r2 ; CHECK-NEXT: .LBB0_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vld20.16 {q0, q1}, [r0] -; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: subs r2, #8 ; CHECK-NEXT: vld21.16 {q0, q1}, [r0]! -; CHECK-NEXT: vpstttt -; CHECK-NEXT: vmulht.s16 q2, q1, q1 -; CHECK-NEXT: vmulht.s16 q0, q0, q0 -; CHECK-NEXT: vqaddt.s16 q0, q0, q2 -; CHECK-NEXT: vshrt.s16 q0, q0, #1 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q0, [r1], #16 -; CHECK-NEXT: le lr, .LBB0_1 +; CHECK-NEXT: vmulh.s16 q2, q1, q1 +; CHECK-NEXT: vmulh.s16 q0, q0, q0 +; CHECK-NEXT: vqadd.s16 q0, q0, q2 +; CHECK-NEXT: vshr.s16 q0, q0, #1 +; CHECK-NEXT: vstrh.16 q0, [r1], #16 +; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %do.end ; CHECK-NEXT: pop {r7, pc} entry: @@ -148,25 +139,14 @@ ; CHECK-LABEL: good2: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: cmp r2, #4 -; CHECK-NEXT: it ge -; CHECK-NEXT: movge r3, #4 -; CHECK-NEXT: subs r3, r2, r3 -; CHECK-NEXT: add.w r12, r3, #3 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB3_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r2 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: 
vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vpst -; CHECK-NEXT: vmlavat.s32 r12, q1, q0 -; CHECK-NEXT: le lr, .LBB3_1 +; CHECK-NEXT: vmlava.s32 r12, q1, q0 +; CHECK-NEXT: letp lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %do.end ; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: pop {r7, pc}