diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -1167,6 +1167,8 @@
   bool Changed = false;
   for (BasicBlock &BB : F) {
+    SimplifyInstructionsInBlock(&BB);
+
     for (Instruction &I : BB) {
       IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
       if (II && II->getIntrinsicID() == Intrinsic::masked_gather &&
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -enable-arm-maskedgatscat=false %s -o - | FileCheck %s
 
 define void @remat_vctp(i32* %arg, i32* %arg1, i32* %arg2, i32* %arg3, i32* %arg4, i16 zeroext %arg5) {
 ; CHECK-LABEL: remat_vctp:
diff --git a/llvm/test/CodeGen/Thumb2/lsll0.ll b/llvm/test/CodeGen/Thumb2/lsll0.ll
--- a/llvm/test/CodeGen/Thumb2/lsll0.ll
+++ b/llvm/test/CodeGen/Thumb2/lsll0.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -enable-arm-maskedgatscat=false -verify-machineinstrs %s -o - | FileCheck %s
 
 define void @_Z4loopPxS_iS_i(i64* %d) {
 ; CHECK-LABEL: _Z4loopPxS_iS_i:
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -321,26 +321,29 @@
   ret void;
 }
 
-define arm_aapcs_vfpcc void @non_gatscat_use1(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
+define arm_aapcs_vfpcc void @non_gatscat_use1(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec, <4 x i32>* %x) {
 ; CHECK-LABEL: non_gatscat_use1:
 ; CHECK: @ %bb.0: @ %vector.ph
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: adr r3, .LCPI7_0
-; CHECK-NEXT: vmov.i32 q0, #0x8
-; CHECK-NEXT: vldrw.u32 q2, [r3]
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: adr.w r12, .LCPI7_0
+; CHECK-NEXT: vmov.i32 q0, #0x9
+; CHECK-NEXT: vldrw.u32 q3, [r12]
 ; CHECK-NEXT: vmov.i32 q1, #0xc
+; CHECK-NEXT: vmov.i32 q2, #0x8
 ; CHECK-NEXT: .LBB7_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vadd.i32 q3, q2, q0
-; CHECK-NEXT: vmlas.u32 q2, q1, r0
-; CHECK-NEXT: vldrw.u32 q4, [q2, #24]
+; CHECK-NEXT: vadd.i32 q4, q3, q2
+; CHECK-NEXT: vmul.i32 q5, q3, q0
+; CHECK-NEXT: vmlas.u32 q3, q1, r0
 ; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vmov q2, q3
-; CHECK-NEXT: vstrb.8 q4, [r1], #16
+; CHECK-NEXT: vldrw.u32 q6, [q3, #24]
+; CHECK-NEXT: vmov q3, q4
+; CHECK-NEXT: vstrw.32 q5, [r3]
+; CHECK-NEXT: vstrb.8 q6, [r1], #16
 ; CHECK-NEXT: bne .LBB7_1
 ; CHECK-NEXT: @ %bb.2: @ %end
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: bx lr
 ; CHECK-NEXT: .p2align 4
 ; CHECK-NEXT: @ %bb.3:
@@ -364,6 +367,7 @@
   %4 = bitcast i32* %3 to <4 x i32>*
   store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
   %non_gatscat_use = mul <4 x i32> %0,
+  store <4 x i32> %non_gatscat_use, <4 x i32>* %x, align 4
   %index.next = add i32 %index, 4
   %vec.ind.next = add <4 x i32> %vec.ind,
   %5 = icmp eq i32 %index.next, %n.vec
@@ -373,26 +377,31 @@
   ret void;
 }
 
-define arm_aapcs_vfpcc void @non_gatscat_use2(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
+define arm_aapcs_vfpcc void @non_gatscat_use2(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec, <4 x i32>* %x) {
 ; CHECK-LABEL: non_gatscat_use2:
 ; CHECK: @ %bb.0: @ %vector.ph
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: adr r3, .LCPI8_0
-; CHECK-NEXT: vmov.i32 q0, #0x8
-; CHECK-NEXT: vldrw.u32 q2, [r3]
-; CHECK-NEXT: vmov.i32 q1, #0xc
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: adr.w r12, .LCPI8_0
+; CHECK-NEXT: vmov.i32 q0, #0x12
+; CHECK-NEXT: vldrw.u32 q4, [r12]
+; CHECK-NEXT: vmov.i32 q1, #0x9
+; CHECK-NEXT: vmov.i32 q2, #0x8
+; CHECK-NEXT: vmov.i32 q3, #0xc
 ; CHECK-NEXT: .LBB8_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vadd.i32 q3, q2, q0
-; CHECK-NEXT: vmlas.u32 q2, q1, r0
-; CHECK-NEXT: vldrw.u32 q4, [q2, #24]
+; CHECK-NEXT: vadd.i32 q5, q4, q2
+; CHECK-NEXT: vmul.i32 q6, q4, q1
+; CHECK-NEXT: vmlas.u32 q4, q3, r0
 ; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vmov q2, q3
-; CHECK-NEXT: vstrb.8 q4, [r1], #16
+; CHECK-NEXT: vldrw.u32 q7, [q4, #24]
+; CHECK-NEXT: vadd.i32 q4, q6, q0
+; CHECK-NEXT: vstrw.32 q4, [r3]
+; CHECK-NEXT: vmov q4, q5
+; CHECK-NEXT: vstrb.8 q7, [r1], #16
 ; CHECK-NEXT: bne .LBB8_1
 ; CHECK-NEXT: @ %bb.2: @ %end
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: bx lr
 ; CHECK-NEXT: .p2align 4
 ; CHECK-NEXT: @ %bb.3:
@@ -416,6 +425,7 @@
   %4 = bitcast i32* %3 to <4 x i32>*
   store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
   %non_gatscat_use = mul <4 x i32> %1,
+  store <4 x i32> %non_gatscat_use, <4 x i32>* %x, align 4
   %index.next = add i32 %index, 4
   %vec.ind.next = add <4 x i32> %vec.ind,
   %5 = icmp eq i32 %index.next, %n.vec
@@ -849,12 +859,12 @@
 ; CHECK-NEXT: add.w r8, r7, #10
 ; CHECK-NEXT: adr r7, .LCPI11_0
 ; CHECK-NEXT: ldr r1, [sp, #96]
-; CHECK-NEXT: vdup.32 q1, r2
-; CHECK-NEXT: vldrw.u32 q0, [r7]
+; CHECK-NEXT: vdup.32 q0, r2
+; CHECK-NEXT: vldrw.u32 q1, [r7]
 ; CHECK-NEXT: mov.w r10, #0
 ; CHECK-NEXT: mov.w r9, #6
 ; CHECK-NEXT: movs r6, #11
-; CHECK-NEXT: vshl.i32 q1, q1, #2
+; CHECK-NEXT: vshl.i32 q0, q0, #2
 ; CHECK-NEXT: movs r5, #0
 ; CHECK-NEXT: .LBB11_1: @ %for.body10.i
 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
@@ -889,10 +899,10 @@
 ; CHECK-NEXT: mul r4, r11, r6
 ; CHECK-NEXT: vdup.32 q3, r5
 ; CHECK-NEXT: vdup.32 q2, r7
-; CHECK-NEXT: vadd.i32 q4, q0, r4
+; CHECK-NEXT: vadd.i32 q4, q1, r4
 ; CHECK-NEXT: vmla.u32 q3, q4, r2
 ; CHECK-NEXT: adds r4, #113
-; CHECK-NEXT: vadd.i32 q4, q0, r4
+; CHECK-NEXT: vadd.i32 q4, q1, r4
 ; CHECK-NEXT: mov r4, r8
 ; CHECK-NEXT: vmla.u32 q2, q4, r2
 ; CHECK-NEXT: .LBB11_5: @ %vector.body
@@ -902,8 +912,8 @@
 ; CHECK-NEXT: @ Parent Loop BB11_4 Depth=4
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=5
 ; CHECK-NEXT: vldrb.s32 q6, [r0, q2]
-; CHECK-NEXT: vadd.i32 q5, q2, q1
-; CHECK-NEXT: vadd.i32 q4, q3, q1
+; CHECK-NEXT: vadd.i32 q5, q2, q0
+; CHECK-NEXT: vadd.i32 q4, q3, q0
 ; CHECK-NEXT: subs r4, #4
 ; CHECK-NEXT: vadd.i32 q2, q6, r2
 ; CHECK-NEXT: vldrb.s32 q6, [r1, q3]
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-unused.ll b/llvm/test/CodeGen/Thumb2/mve-gather-unused.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-unused.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst %s -o - | FileCheck %s
+
+; This file has some unused gathers, making sure that they do not cause
+; problems as the function gets simplified.
+
+define arm_aapcs_vfpcc void @unused1(<4 x i32*> %offs) {
+; CHECK-LABEL: unused1:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: bx lr
+entry:
+  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> , <4 x i32> undef)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @unused2(<4 x i32*> %offs) {
+; CHECK-LABEL: unused2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: bx lr
+entry:
+  %gather1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> , <4 x i32> undef)
+  %gather2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> , <4 x i32> undef)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @unused2_used(<4 x i32*> %offs) {
+; CHECK-LABEL: unused2_used:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: bx lr
+entry:
+  %gather1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> , <4 x i32> undef)
+  %gather2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> , <4 x i32> undef)
+  %unused = add <4 x i32> %gather1, %gather2
+  ret void
+}
+
+
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
diff --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
--- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -O3 -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp --arm-memtransfer-tploop=allow -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -O3 -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp --arm-memtransfer-tploop=allow -enable-arm-maskedgatscat=false -verify-machineinstrs %s -o - | FileCheck %s
 
 ; verify-machineinstrs previously caught the incorrect use of QPR in the stack reloads.
diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll b/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll
--- a/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll
@@ -170,8 +170,8 @@
 define arm_aapcs_vfpcc <4 x i32> @cmpugez_v4i1(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: cmpugez_v4i1:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcmp.i32 ne, q0, zr
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vcmp.i32 eq, q0, zr
+; CHECK-NEXT: vpsel q0, q1, q0
 ; CHECK-NEXT: bx lr
 entry:
   %c1 = icmp eq <4 x i32> %a, zeroinitializer
diff --git a/llvm/test/CodeGen/Thumb2/mve-selectcc.ll b/llvm/test/CodeGen/Thumb2/mve-selectcc.ll
--- a/llvm/test/CodeGen/Thumb2/mve-selectcc.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-selectcc.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat=false -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
 
 define arm_aapcs_vfpcc <4 x i32> @test_v4i32(i32 %x, <4 x i32> %s0, <4 x i32> %s1) {
 ; CHECK-LABEL: test_v4i32:
diff --git a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
@@ -70,8 +70,6 @@
 define arm_aapcs_vfpcc <8 x i16> @vqdmulh_i16_c(<8 x i16> %s0, <8 x i16> %s1) {
 ; CHECK-LABEL: vqdmulh_i16_c:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
 ; CHECK-NEXT: vmov q2, q0
 ; CHECK-NEXT: vmov.u16 r0, q0[2]
 ; CHECK-NEXT: vmov.u16 r1, q0[0]
@@ -86,41 +84,37 @@
 ; CHECK-NEXT: vmov.u16 r1, q1[1]
 ; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
 ; CHECK-NEXT: vmullb.s16 q0, q3, q0
-; CHECK-NEXT: vmov.i32 q3, #0x7fff
 ; CHECK-NEXT: vshl.i32 q0, q0, #10
 ; CHECK-NEXT: vshr.s32 q0, q0, #10
-; CHECK-NEXT: vshr.s32 q0, q0, #15
-; CHECK-NEXT: vmin.s32 q4, q0, q3
-; CHECK-NEXT: vmov r0, r1, d8
+; CHECK-NEXT: vshr.s32 q3, q0, #15
+; CHECK-NEXT: vmov r0, r1, d6
 ; CHECK-NEXT: vmov.16 q0[0], r0
 ; CHECK-NEXT: vmov.16 q0[1], r1
-; CHECK-NEXT: vmov r0, r1, d9
+; CHECK-NEXT: vmov r0, r1, d7
 ; CHECK-NEXT: vmov.16 q0[2], r0
 ; CHECK-NEXT: vmov.u16 r0, q2[6]
 ; CHECK-NEXT: vmov.16 q0[3], r1
 ; CHECK-NEXT: vmov.u16 r1, q2[4]
-; CHECK-NEXT: vmov q4[2], q4[0], r1, r0
+; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
 ; CHECK-NEXT: vmov.u16 r0, q2[7]
 ; CHECK-NEXT: vmov.u16 r1, q2[5]
-; CHECK-NEXT: vmov q4[3], q4[1], r1, r0
+; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
 ; CHECK-NEXT: vmov.u16 r0, q1[6]
 ; CHECK-NEXT: vmov.u16 r1, q1[4]
 ; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT: vmov.u16 r0, q1[7]
 ; CHECK-NEXT: vmov.u16 r1, q1[5]
 ; CHECK-NEXT: vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT: vmullb.s16 q1, q2, q4
+; CHECK-NEXT: vmullb.s16 q1, q2, q3
 ; CHECK-NEXT: vshl.i32 q1, q1, #10
 ; CHECK-NEXT: vshr.s32 q1, q1, #10
 ; CHECK-NEXT: vshr.s32 q1, q1, #15
-; CHECK-NEXT: vmin.s32 q1, q1, q3
 ; CHECK-NEXT: vmov r0, r1, d2
 ; CHECK-NEXT: vmov.16 q0[4], r0
 ; CHECK-NEXT: vmov.16 q0[5], r1
 ; CHECK-NEXT: vmov r0, r1, d3
 ; CHECK-NEXT: vmov.16 q0[6], r0
 ; CHECK-NEXT: vmov.16 q0[7], r1
-; CHECK-NEXT: vpop {d8, d9}
 ; CHECK-NEXT: bx lr
 entry:
   %l2 = sext <8 x i16> %s0 to <8 x i22>