Index: llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -809,9 +809,14 @@ switch (II->getIntrinsicID()) { case Intrinsic::memset: case Intrinsic::prefetch: + case Intrinsic::masked_load: if (II->getArgOperand(0) == OperandVal) isAddress = true; break; + case Intrinsic::masked_store: + if (II->getArgOperand(1) == OperandVal) + isAddress = true; + break; case Intrinsic::memmove: case Intrinsic::memcpy: if (II->getArgOperand(0) == OperandVal || @@ -861,6 +866,15 @@ AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace(); AccessTy.MemTy = OperandVal->getType(); break; + case Intrinsic::masked_load: + AccessTy.AddrSpace = + II->getArgOperand(0)->getType()->getPointerAddressSpace(); + break; + case Intrinsic::masked_store: + AccessTy.MemTy = II->getOperand(0)->getType(); + AccessTy.AddrSpace = + II->getArgOperand(1)->getType()->getPointerAddressSpace(); + break; default: { MemIntrinsicInfo IntrInfo; if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) { Index: llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -693,92 +693,84 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: .pad #20 +; CHECK-NEXT: sub sp, #20 ; CHECK-NEXT: cmp r3, #4 -; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: strd r0, r1, [sp] @ 8-byte Folded Spill +; CHECK-NEXT: strd r0, r1, [sp, #12] @ 8-byte Folded Spill ; CHECK-NEXT: bne .LBB5_8 ; CHECK-NEXT: @ %bb.1: @ %for.cond.preheader -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: beq .LBB5_8 ; CHECK-NEXT: @ %bb.2: @ %for.body.lr.ph -; CHECK-NEXT: ldr.w r8, [sp, #80] +; CHECK-NEXT: ldr r7, [sp, #84] ; CHECK-NEXT: mov.w r11, #0 -; CHECK-NEXT: ldr r0, [sp, #64] -; CHECK-NEXT: add.w r1, r8, #7 -; CHECK-NEXT: add.w r3, r8, r8, lsl #1 +; CHECK-NEXT: ldr r3, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #68] +; CHECK-NEXT: add.w r1, r3, r7, lsl #1 +; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: adds r1, r3, r7 +; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: add.w r1, r7, r7, lsl #1 ; CHECK-NEXT: vdup.16 q0, r0 -; CHECK-NEXT: lsl.w r5, r8, #1 -; CHECK-NEXT: lsrs r0, r1, #3 -; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: adds r0, r3, r1 +; CHECK-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-NEXT: adds r0, r7, #7 +; CHECK-NEXT: lsr.w r9, r0, #3 ; CHECK-NEXT: b .LBB5_5 ; CHECK-NEXT: .LBB5_3: @ in Loop: Header=BB5_5 Depth=1 -; CHECK-NEXT: mov r4, r12 ; CHECK-NEXT: mov r10, r12 +; CHECK-NEXT: mov r8, r12 ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: .LBB5_4: @ %for.cond.cleanup23 ; CHECK-NEXT: @ in Loop: Header=BB5_5 Depth=1 -; CHECK-NEXT: add.w r0, r10, r4 -; CHECK-NEXT: ldr r1, [sp, #88] +; CHECK-NEXT: ldr r1, [sp, #92] +; CHECK-NEXT: add.w r0, r8, r10 ; CHECK-NEXT: add r0, r6 ; CHECK-NEXT: add r0, r12 ; CHECK-NEXT: strb.w r0, [r1, r11] ; CHECK-NEXT: add.w r11, r11, #1 -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: cmp r11, r0 +; CHECK-NEXT: cmp r11, r2 ; CHECK-NEXT: beq .LBB5_8 ; CHECK-NEXT: .LBB5_5: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB5_7 Depth 2 -; CHECK-NEXT: ldr r0, [sp, #84] +; CHECK-NEXT: ldr r0, [sp, #88] +; CHECK-NEXT: subs.w lr, r9, r9 ; CHECK-NEXT: ldr.w r12, [r0, r11, lsl #2] -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: subs.w lr, r0, r0 ; CHECK-NEXT: ble .LBB5_3 ; CHECK-NEXT: @ %bb.6: @ %for.body24.preheader ; CHECK-NEXT: @ in Loop: Header=BB5_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload -; CHECK-NEXT: mla r7, r11, r8, r0 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: mov r9, r8 +; CHECK-NEXT: ldr r3, [sp, #84] ; CHECK-NEXT: mov r6, r12 +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: dlstp.16 lr, r3 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: mla r5, r11, r3, r0 +; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldrd r4, r7, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r10, r12 -; CHECK-NEXT: mov r4, r12 -; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: .LBB5_7: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB5_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vctp.16 r9 -; CHECK-NEXT: vpstttt -; CHECK-NEXT: vldrbt.s16 q1, [r0], #8 -; CHECK-NEXT: vaddt.i16 q2, q1, q0 -; CHECK-NEXT: vldrbt.s16 q1, [r7], #8 -; CHECK-NEXT: vmlavat.s16 r4, q1, q2 -; CHECK-NEXT: adds r2, r1, r3 -; CHECK-NEXT: vpsttt -; CHECK-NEXT: vldrbt.s16 q2, [r2] -; CHECK-NEXT: vaddt.i16 q2, q2, q0 -; CHECK-NEXT: vmlavat.s16 r12, q1, q2 -; CHECK-NEXT: adds r2, r1, r5 -; CHECK-NEXT: vpsttt -; CHECK-NEXT: vldrbt.s16 q2, [r2] -; CHECK-NEXT: vaddt.i16 q2, q2, q0 -; CHECK-NEXT: vmlavat.s16 r6, q1, q2 -; CHECK-NEXT: add r1, r8 -; CHECK-NEXT: sub.w r9, r9, #8 -; CHECK-NEXT: vpsttt -; CHECK-NEXT: vldrbt.s16 q2, [r1] -; CHECK-NEXT: vaddt.i16 q2, q2, q0 -; CHECK-NEXT: vmlavat.s16 r10, q1, q2 -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: le lr, .LBB5_7 +; CHECK-NEXT: vldrb.s16 q1, [r4], #8 +; CHECK-NEXT: vadd.i16 q2, q1, q0 +; CHECK-NEXT: vldrb.s16 q1, [r5], #8 +; CHECK-NEXT: vmlava.s16 r12, q1, q2 +; CHECK-NEXT: vldrb.s16 q2, [r0], #8 +; CHECK-NEXT: vadd.i16 q2, q2, q0 +; CHECK-NEXT: vmlava.s16 r6, q1, q2 +; CHECK-NEXT: vldrb.s16 q2, [r7], #8 +; CHECK-NEXT: vadd.i16 q2, q2, q0 +; CHECK-NEXT: vmlava.s16 r8, q1, q2 +; CHECK-NEXT: vldrb.s16 q2, [r1], #8 +; CHECK-NEXT: vadd.i16 q2, q2, q0 +; CHECK-NEXT: vmlava.s16 r10, q1, q2 +; CHECK-NEXT: letp lr, .LBB5_7 ; CHECK-NEXT: b .LBB5_4 ; CHECK-NEXT: .LBB5_8: @ %if.end -; CHECK-NEXT: ldr r0, [sp, #88] -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: ldr r0, [sp, #92] +; CHECK-NEXT: add sp, #20 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %cmp = icmp eq i16 %num_cols, 4 @@ -887,91 +879,83 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: .pad #20 +; CHECK-NEXT: sub sp, #20 ; CHECK-NEXT: cmp r3, #4 -; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: strd r0, r1, [sp] @ 8-byte Folded Spill +; CHECK-NEXT: strd r0, r1, [sp, #12] @ 8-byte Folded Spill ; CHECK-NEXT: bne .LBB6_8 ; CHECK-NEXT: @ %bb.1: @ %for.cond.preheader -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: beq .LBB6_8 ; CHECK-NEXT: @ %bb.2: @ %for.body.lr.ph -; CHECK-NEXT: ldr.w r8, [sp, #80] +; CHECK-NEXT: ldr r7, [sp, #84] ; CHECK-NEXT: mov.w r11, #0 -; CHECK-NEXT: ldr r0, [sp, #64] -; CHECK-NEXT: add.w r1, r8, #7 -; CHECK-NEXT: add.w r3, r8, r8, lsl #1 +; CHECK-NEXT: ldr r3, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #68] +; CHECK-NEXT: add.w r1, r3, r7, lsl #1 +; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: adds r1, r3, r7 +; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: add.w r1, r7, r7, lsl #1 ; CHECK-NEXT: vdup.16 q0, r0 -; CHECK-NEXT: lsl.w r5, r8, #1 -; CHECK-NEXT: lsrs r0, r1, #3 -; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: adds r0, r3, r1 +; CHECK-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-NEXT: adds r0, r7, #7 +; CHECK-NEXT: lsr.w r9, r0, #3 ; CHECK-NEXT: .LBB6_3: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB6_5 Depth 2 -; CHECK-NEXT: ldr r0, [sp, #84] +; CHECK-NEXT: ldr r0, [sp, #88] +; CHECK-NEXT: subs.w lr, r9, r9 ; CHECK-NEXT: ldr.w r12, [r0, r11, lsl #2] -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: subs.w lr, r0, r0 ; CHECK-NEXT: ble .LBB6_6 ; CHECK-NEXT: @ %bb.4: @ %for.body24.preheader ; CHECK-NEXT: @ in Loop: Header=BB6_3 Depth=1 -; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload -; CHECK-NEXT: mla r7, r11, r8, r0 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: mov r9, r8 +; CHECK-NEXT: ldr r3, [sp, #84] ; CHECK-NEXT: mov r6, r12 +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: dlstp.16 lr, r3 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: mla r5, r11, r3, r0 +; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldrd r4, r7, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r10, r12 -; CHECK-NEXT: mov r4, r12 -; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: .LBB6_5: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB6_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vctp.16 r9 -; CHECK-NEXT: vpstttt -; CHECK-NEXT: vldrbt.s16 q1, [r0], #8 -; CHECK-NEXT: vaddt.i16 q2, q1, q0 -; CHECK-NEXT: vldrbt.s16 q1, [r7], #8 -; CHECK-NEXT: vmlavat.s16 r4, q1, q2 -; CHECK-NEXT: adds r2, r1, r3 -; CHECK-NEXT: vpsttt -; CHECK-NEXT: vldrbt.s16 q2, [r2] -; CHECK-NEXT: vaddt.i16 q2, q2, q0 -; CHECK-NEXT: vmlavat.s16 r12, q1, q2 -; CHECK-NEXT: adds r2, r1, r5 -; CHECK-NEXT: vpsttt -; CHECK-NEXT: vldrbt.s16 q2, [r2] -; CHECK-NEXT: vaddt.i16 q2, q2, q0 -; CHECK-NEXT: vmlavat.s16 r6, q1, q2 -; CHECK-NEXT: add r1, r8 -; CHECK-NEXT: sub.w r9, r9, #8 -; CHECK-NEXT: vpsttt -; CHECK-NEXT: vldrbt.s16 q2, [r1] -; CHECK-NEXT: vaddt.i16 q2, q2, q0 -; CHECK-NEXT: vmlavat.s16 r10, q1, q2 -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: le lr, .LBB6_5 +; CHECK-NEXT: vldrb.s16 q1, [r4], #8 +; CHECK-NEXT: vadd.i16 q2, q1, q0 +; CHECK-NEXT: vldrb.s16 q1, [r5], #8 +; CHECK-NEXT: vmlava.s16 r12, q1, q2 +; CHECK-NEXT: vldrb.s16 q2, [r0], #8 +; CHECK-NEXT: vadd.i16 q2, q2, q0 +; CHECK-NEXT: vmlava.s16 r6, q1, q2 +; CHECK-NEXT: vldrb.s16 q2, [r7], #8 +; CHECK-NEXT: vadd.i16 q2, q2, q0 +; CHECK-NEXT: vmlava.s16 r8, q1, q2 +; CHECK-NEXT: vldrb.s16 q2, [r1], #8 +; CHECK-NEXT: vadd.i16 q2, q2, q0 +; CHECK-NEXT: vmlava.s16 r10, q1, q2 +; CHECK-NEXT: letp lr, .LBB6_5 ; CHECK-NEXT: b .LBB6_7 ; CHECK-NEXT: .LBB6_6: @ in Loop: Header=BB6_3 Depth=1 -; CHECK-NEXT: mov r4, r12 ; CHECK-NEXT: mov r10, r12 +; CHECK-NEXT: mov r8, r12 ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: .LBB6_7: @ %for.cond.cleanup23 ; CHECK-NEXT: @ in Loop: Header=BB6_3 Depth=1 -; CHECK-NEXT: add.w r0, r10, r4 -; CHECK-NEXT: ldr r1, [sp, #88] +; CHECK-NEXT: ldr r1, [sp, #92] +; CHECK-NEXT: add.w r0, r8, r10 ; CHECK-NEXT: add r0, r6 ; CHECK-NEXT: add r0, r12 ; CHECK-NEXT: strb.w r0, [r1, r11] ; CHECK-NEXT: add.w r11, r11, #1 -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: cmp r11, r0 +; CHECK-NEXT: cmp r11, r2 ; CHECK-NEXT: bne .LBB6_3 ; CHECK-NEXT: .LBB6_8: @ %if.end -; CHECK-NEXT: ldr r0, [sp, #88] -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: ldr r0, [sp, #92] +; CHECK-NEXT: add sp, #20 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %cmp = icmp eq i16 %num_cols, 4