diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -5829,6 +5829,7 @@ let mayLoad = load; let mayStore = !eq(load,0); let hasSideEffects = 0; + let validForTailPredication = load; } // A parameter class used to encapsulate all the ways the writeback diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -782,7 +782,7 @@ // the false lanes are zeroed and here we're trying to track that those false // lanes remain zero, or where they change, the differences are masked away // by their user(s). - // All MVE loads and stores have to be predicated, so we know that any load + // All MVE stores have to be predicated, so we know that any predicated load // operands, or stored results are equivalent already. Other explicitly // predicated instructions will perform the same operation in the original // loop and the tail-predicated form too. Because of this, we can insert @@ -1038,8 +1038,8 @@ } // If the instruction is already explicitly predicated, then the conversion - // will be fine, but ensure that all memory operations are predicated. - return !IsUse && MI->mayLoadOrStore() ? false : true; + // will be fine, but ensure that all store operations are predicated. + return !IsUse && MI->mayStore() ? 
false : true; } bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll @@ -6,26 +6,17 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: subs.w r12, r2, #8 -; CHECK-NEXT: mov.w r3, #-1 -; CHECK-NEXT: csinv r3, r3, r12, pl -; CHECK-NEXT: add.w r12, r3, r2 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #3 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.16 lr, r2 ; CHECK-NEXT: .LBB0_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vld20.16 {q0, q1}, [r0] -; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: subs r2, #8 ; CHECK-NEXT: vld21.16 {q0, q1}, [r0]! -; CHECK-NEXT: vpstttt -; CHECK-NEXT: vmulht.s16 q2, q1, q1 -; CHECK-NEXT: vmulht.s16 q0, q0, q0 -; CHECK-NEXT: vqaddt.s16 q0, q0, q2 -; CHECK-NEXT: vshrt.s16 q0, q0, #1 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q0, [r1], #16 -; CHECK-NEXT: le lr, .LBB0_1 +; CHECK-NEXT: vmulh.s16 q2, q1, q1 +; CHECK-NEXT: vmulh.s16 q0, q0, q0 +; CHECK-NEXT: vqadd.s16 q0, q0, q2 +; CHECK-NEXT: vshr.s16 q0, q0, #1 +; CHECK-NEXT: vstrh.16 q0, [r1], #16 +; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %do.end ; CHECK-NEXT: pop {r7, pc} entry: @@ -148,25 +139,14 @@ ; CHECK-LABEL: good2: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: cmp r2, #4 -; CHECK-NEXT: it ge -; CHECK-NEXT: movge r3, #4 -; CHECK-NEXT: subs r3, r2, r3 -; CHECK-NEXT: add.w r12, r3, #3 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB3_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r2 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: 
vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vpst -; CHECK-NEXT: vmlavat.s32 r12, q1, q0 -; CHECK-NEXT: le lr, .LBB3_1 +; CHECK-NEXT: vmlava.s32 r12, q1, q0 +; CHECK-NEXT: letp lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %do.end ; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: pop {r7, pc} diff --git a/llvm/unittests/Target/ARM/MachineInstrTest.cpp b/llvm/unittests/Target/ARM/MachineInstrTest.cpp --- a/llvm/unittests/Target/ARM/MachineInstrTest.cpp +++ b/llvm/unittests/Target/ARM/MachineInstrTest.cpp @@ -382,7 +382,7 @@ return false; case MVE_ASRLi: case MVE_ASRLr: - case MVE_LSRL: + case MVE_LSRL: case MVE_SQRSHR: case MVE_SQSHL: case MVE_SRSHR: @@ -393,7 +393,7 @@ case MVE_VABDf32: case MVE_VABDs16: case MVE_VABDs32: - case MVE_VABDs8: + case MVE_VABDs8: case MVE_VABDu16: case MVE_VABDu32: case MVE_VABDu8: @@ -609,6 +609,42 @@ case MVE_VIWDUPu16: case MVE_VIWDUPu32: case MVE_VIWDUPu8: + case MVE_VLD20_8: + case MVE_VLD21_8: + case MVE_VLD20_16: + case MVE_VLD21_16: + case MVE_VLD20_32: + case MVE_VLD21_32: + case MVE_VLD20_8_wb: + case MVE_VLD21_8_wb: + case MVE_VLD20_16_wb: + case MVE_VLD21_16_wb: + case MVE_VLD20_32_wb: + case MVE_VLD21_32_wb: + case MVE_VLD40_8: + case MVE_VLD41_8: + case MVE_VLD42_8: + case MVE_VLD43_8: + case MVE_VLD40_16: + case MVE_VLD41_16: + case MVE_VLD42_16: + case MVE_VLD43_16: + case MVE_VLD40_32: + case MVE_VLD41_32: + case MVE_VLD42_32: + case MVE_VLD43_32: + case MVE_VLD40_8_wb: + case MVE_VLD41_8_wb: + case MVE_VLD42_8_wb: + case MVE_VLD43_8_wb: + case MVE_VLD40_16_wb: + case MVE_VLD41_16_wb: + case MVE_VLD42_16_wb: + case MVE_VLD43_16_wb: + case MVE_VLD40_32_wb: + case MVE_VLD41_32_wb: + case MVE_VLD42_32_wb: + case MVE_VLD43_32_wb: case MVE_VLDRBS16: case MVE_VLDRBS16_post: case MVE_VLDRBS16_pre: @@ -657,9 +693,9 @@ case MVE_VLDRWU32_rq_u: case MVE_VMOVimmf32: case MVE_VMOVimmi16: - case MVE_VMOVimmi32: + case MVE_VMOVimmi32: case MVE_VMOVimmi64: - case MVE_VMOVimmi8: + case MVE_VMOVimmi8: case MVE_VMOVNi16bh: case 
MVE_VMOVNi16th: case MVE_VMOVNi32bh: @@ -679,7 +715,7 @@ case MVE_VMULLTs8: case MVE_VMULLTu16: case MVE_VMULLTu32: - case MVE_VMULLTu8: + case MVE_VMULLTu8: case MVE_VMUL_qr_f16: case MVE_VMUL_qr_f32: case MVE_VMUL_qr_i16: @@ -702,7 +738,7 @@ case MVE_VORR: case MVE_VORRimmi16: case MVE_VORRimmi32: - case MVE_VPST: + case MVE_VPST: case MVE_VQABSs16: case MVE_VQABSs32: case MVE_VQABSs8: @@ -814,7 +850,7 @@ case MVE_VRHADDs32: case MVE_VRHADDs8: case MVE_VRHADDu16: - case MVE_VRHADDu32: + case MVE_VRHADDu32: case MVE_VRHADDu8: case MVE_VRINTf16A: case MVE_VRINTf16M: @@ -825,12 +861,12 @@ case MVE_VRINTf32A: case MVE_VRINTf32M: case MVE_VRINTf32N: - case MVE_VRINTf32P: - case MVE_VRINTf32X: + case MVE_VRINTf32P: + case MVE_VRINTf32X: case MVE_VRINTf32Z: case MVE_VRSHL_by_vecs16: case MVE_VRSHL_by_vecs32: - case MVE_VRSHL_by_vecs8: + case MVE_VRSHL_by_vecs8: case MVE_VRSHL_by_vecu16: case MVE_VRSHL_by_vecu32: case MVE_VRSHL_by_vecu8: @@ -887,7 +923,7 @@ case MVE_VSTRB16_rq: case MVE_VSTRB32: case MVE_VSTRB32_post: - case MVE_VSTRB32_pre: + case MVE_VSTRB32_pre: case MVE_VSTRB32_rq: case MVE_VSTRB8_rq: case MVE_VSTRBU8: @@ -957,7 +993,9 @@ for (auto &Op : Desc.operands()) { // Only check instructions that access the MQPR regs. if ((Op.OperandType & MCOI::OPERAND_REGISTER) == 0 || - Op.RegClass != ARM::MQPRRegClassID) + (Op.RegClass != ARM::MQPRRegClassID && + Op.RegClass != ARM::QQPRRegClassID && + Op.RegClass != ARM::QQQQPRRegClassID)) continue; uint64_t Flags = MII->get(i).TSFlags;