Index: llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp =================================================================== --- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -568,6 +568,28 @@ } } + // Could inserting the [W|D]LSTP cause some unintended affects? In a perfect + // world the [w|d]lstp instruction would be last instruction in the preheader + // and so it would only affect instructions within the loop body. But due to + // scheduling, and/or the logic in this pass (above), the insertion point can + // be moved earlier. So if the Loop Start isn't the last instruction in the + // preheader, and if the initial element count is smaller than the vector + // width, the Loop Start instruction will immediately generate one or more + // false lane mask which can, incorrectly, affect the proceeding MVE + // instructions in the preheader. + auto cannotInsertWDLSTPBetween = [](MachineInstr *Begin, + MachineInstr *End) { + auto I = MachineBasicBlock::iterator(Begin); + auto E = MachineBasicBlock::iterator(End); + for (; I != E; ++I) + if (shouldInspect(*I)) + return true; + return false; + }; + + if (cannotInsertWDLSTPBetween(StartInsertPt, &InsertBB->back())) + return false; + // Especially in the case of while loops, InsertBB may not be the // preheader, so we need to check that the register isn't redefined // before entering the loop. Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp.mir @@ -108,22 +108,33 @@ ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: liveins: $r0, $r1, $r2 + ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $q2 = MVE_VMOVimmi32 1, 0, $noreg, undef renamable $q2 + ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $q3 = MVE_VMOVimmi32 4, 0, $noreg, undef renamable $q3 - ; CHECK: $lr = MVE_DLSTP_32 renamable $r2 + ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + ; CHECK: renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg + ; CHECK: dead $lr = t2DLS renamable $r3 + ; CHECK: $r4 = tMOVr killed $r3, 14 /* CC::al */, $noreg ; CHECK: renamable $r3 = tLEApcrel %const.0, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from constant-pool) - ; CHECK: renamable $r3, dead $cpsr = tLSRri killed renamable $r2, 1, 14 /* CC::al */, $noreg + ; CHECK: renamable $r3, dead $cpsr = tLSRri renamable $r2, 1, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $q0, $q1, $q2, $q3, $r0, $r1 - ; CHECK: MVE_VPTv4u32 2, renamable $q1, renamable $q0, 8, implicit-def $vpr + ; CHECK: liveins: $q0, $q1, $q2, $q3, $r0, $r1, $r2, $r4 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + ; CHECK: $lr = tMOVr $r4, 14 /* CC::al */, $noreg + ; CHECK: renamable $r4, dead $cpsr = nsw tSUBi8 killed $r4, 1, 14 /* CC::al */, $noreg + ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg + ; CHECK: MVE_VPST 1, implicit $vpr + ; CHECK: renamable $vpr = MVE_VCMPu32 renamable $q1, renamable $q0, 8, 1, killed renamable $vpr ; CHECK: renamable $vpr = MVE_VCMPu32 renamable $q0, renamable $q2, 2, 1, killed renamable $vpr ; CHECK: renamable $r1, renamable $q4 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv35, align 4) ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q4, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv12, align 4) ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q0, renamable $q3, 0, $noreg, undef renamable $q0 - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 ; CHECK: bb.3.for.cond.cleanup: ; CHECK: $sp = frame-destroy VLDMDIA_UPD $sp, 14 /* CC::al */, $noreg, def $d8, def $d9 ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir @@ -153,17 +153,25 @@ ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg + ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3 = tLDRpci %const.0, 14 /* CC::al */, $noreg :: (load 4 from constant-pool) - ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 + ; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1 ; CHECK: $s4 = VMOVS killed $s0, 14 /* CC::al */, $noreg, implicit killed $q1, implicit-def $q1 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $q1, $r0, $r1 - ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.lsr.iv12, align 4) - ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv1315, align 4) - ; CHECK: renamable $q1 = MVE_VFMAf32 killed renamable $q1, killed renamable $q2, killed renamable $q0, 0, killed $noreg - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg + ; CHECK: MVE_VPST 2, implicit $vpr + ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv12, align 4) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1315, align 4) + ; CHECK: renamable $q1 = MVE_VFMAf32 killed renamable $q1, killed renamable $q2, killed renamable $q0, 1, killed renamable $vpr + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q1 ; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS renamable $s6, renamable $s7, 14 /* CC::al */, $noreg @@ -277,18 +285,27 @@ ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: renamable $r3, dead $cpsr = tMOVi8 3, 14 /* CC::al */, $noreg + ; CHECK: renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + ; CHECK: renamable $lr = t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3 = tLDRpci %const.0, 14 /* CC::al */, $noreg :: (load 4 from constant-pool) ; CHECK: renamable $r2, dead $cpsr = tLSRri killed renamable $r2, 2, 14 /* CC::al */, $noreg - ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 + ; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1 ; CHECK: $s4 = VMOVS killed $s0, 14 /* CC::al */, $noreg, implicit killed $q1, implicit-def $q1 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $q1, $r0, $r1 - ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.lsr.iv13, align 4) - ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv1416, align 4) - ; CHECK: renamable $q1 = MVE_VFMAf32 killed renamable $q1, killed renamable $q2, killed renamable $q0, 0, killed $noreg - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg + ; CHECK: MVE_VPST 2, implicit $vpr + ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4) + ; CHECK: renamable $q1 = MVE_VFMAf32 killed renamable $q1, killed renamable $q2, killed renamable $q0, 1, killed renamable $vpr + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q1 ; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS renamable $s6, renamable $s7, 14 /* CC::al */, $noreg Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir @@ -152,35 +152,47 @@ ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8 + ; CHECK: $r3 = tMOVr $r1, 14 /* CC::al */, $noreg + ; CHECK: tCMPi8 renamable $r1, 4, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 10, 8, implicit-def $itstate + ; CHECK: renamable $r3 = tMOVi8 $noreg, 4, 10 /* CC::ge */, killed $cpsr, implicit killed renamable $r3, implicit killed $itstate + ; CHECK: renamable $r12 = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tSUBrr renamable $r1, killed renamable $r3, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 3, 14 /* CC::al */, $noreg + ; CHECK: dead renamable $lr = nuw nsw t2ADDrs killed renamable $r12, killed renamable $r3, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: $r3 = tMOVr $r1, 14 /* CC::al */, $noreg ; CHECK: $r12 = tMOVr $r0, 14 /* CC::al */, $noreg ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r3 + ; CHECK: $r4 = tMOVr $lr, 14 /* CC::al */, $noreg ; CHECK: bb.1.do.body.i: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r12 + ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r4, $r12 ; CHECK: renamable $r12, renamable $q1 = MVE_VLDRWU32_post killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.pSrc.addr.0.i2, align 4) ; CHECK: renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VADDf32 killed renamable $q0, killed renamable $q1, 0, killed $noreg, killed renamable $q0 ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 ; CHECK: bb.2.arm_mean_f32_mve.exit: ; CHECK: successors: %bb.3(0x80000000) - ; CHECK: liveins: $q0, $r0, $r1, $r2 + ; CHECK: liveins: $q0, $r0, $r1, $r2, $r4 ; CHECK: $s4 = VMOVSR $r1, 14 /* CC::al */, $noreg - ; CHECK: $lr = MVE_DLSTP_32 $r1 + ; CHECK: $lr = t2DLS killed $r4 ; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS killed renamable $s3, killed renamable $s3, 14 /* CC::al */, $noreg, implicit killed $q0 ; CHECK: renamable $s4 = VUITOS killed renamable $s4, 14 /* CC::al */, $noreg ; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VDIVS killed renamable $s0, killed renamable $s4, 14 /* CC::al */, $noreg ; CHECK: renamable $r3 = VMOVRS killed renamable $s0, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1 - ; CHECK: dead $r3 = tMOVr $r1, 14 /* CC::al */, $noreg + ; CHECK: $r3 = tMOVr $r1, 14 /* CC::al */, $noreg ; CHECK: bb.3.do.body: ; CHECK: successors: %bb.3(0x7c000000), %bb.4(0x04000000) - ; CHECK: liveins: $lr, $q0, $q1, $r0, $r1, $r2 - ; CHECK: renamable $r0, renamable $q2 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.pSrc.addr.01, align 4) - ; CHECK: renamable $q2 = nnan ninf nsz arcp contract afn reassoc MVE_VSUBf32 killed renamable $q2, renamable $q1, 0, $noreg, undef renamable $q2 - ; CHECK: renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VFMAf32 killed renamable $q0, killed renamable $q2, killed renamable $q2, 0, killed $noreg - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.3 + ; CHECK: liveins: $lr, $q0, $q1, $r0, $r1, $r2, $r3 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg + ; CHECK: MVE_VPST 2, implicit $vpr + ; CHECK: renamable $r0, renamable $q2 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.pSrc.addr.01, align 4) + ; CHECK: renamable $q2 = nnan ninf nsz arcp contract afn reassoc MVE_VSUBf32 killed renamable $q2, renamable $q1, 1, renamable $vpr, undef renamable $q2 + ; CHECK: renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VFMAf32 killed renamable $q0, killed renamable $q2, killed renamable $q2, 1, killed renamable $vpr + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.3 ; CHECK: bb.4.do.end: ; CHECK: liveins: $q0, $r1, $r2 ; CHECK: renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 1, 14 /* CC::al */, $noreg Index: llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll @@ -231,12 +231,17 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: beq.w .LBB3_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph +; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: adr r7, .LCPI3_5 +; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: vmov.i32 q0, #0x8000 +; CHECK-NEXT: sub.w r12, r3, #4 +; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: adr r6, .LCPI3_4 ; CHECK-NEXT: adr r5, .LCPI3_3 +; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: adr r4, .LCPI3_2 -; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vstrw.32 q0, [sp, #160] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r7] ; CHECK-NEXT: adr.w r8, .LCPI3_1 @@ -272,14 +277,19 @@ ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [sp, #192] @ 16-byte Reload -; CHECK-NEXT: vldrb.u32 q4, [r0, q0] +; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u32 q4, [r0, q0] ; CHECK-NEXT: vldrw.u32 q0, [sp, #176] @ 16-byte Reload -; CHECK-NEXT: vldrb.u32 q7, [r0, q0] +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u32 q7, [r0, q0] ; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q5, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmul.i32 q6, q7, q0 ; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vldrb.u32 q1, [r0, q5] +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u32 q1, [r0, q5] ; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vmul.i32 q3, q4, q0 ; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload @@ -310,12 +320,14 @@ ; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: vldrw.u32 q0, [sp, #192] @ 16-byte Reload ; CHECK-NEXT: vshr.u32 q1, q1, #16 -; CHECK-NEXT: vstrb.32 q1, [r1, q0] +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrbt.32 q1, [r1, q0] ; CHECK-NEXT: vldrw.u32 q0, [sp, #176] @ 16-byte Reload -; CHECK-NEXT: vstrb.32 q2, [r1, q0] -; CHECK-NEXT: vstrb.32 q6, [r1, q5] +; CHECK-NEXT: vpstt +; CHECK-NEXT: vstrbt.32 q2, [r1, q0] +; CHECK-NEXT: vstrbt.32 q6, [r1, q5] ; CHECK-NEXT: adds r1, #12 -; CHECK-NEXT: letp lr, .LBB3_2 +; CHECK-NEXT: le lr, .LBB3_2 ; CHECK-NEXT: .LBB3_3: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #216 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}