Index: llvm/include/llvm/IR/IntrinsicsARM.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsARM.td +++ llvm/include/llvm/IR/IntrinsicsARM.td @@ -913,9 +913,10 @@ multiclass MVEPredicated rets, list params, LLVMType pred = llvm_anyvector_ty, - list props = [IntrNoMem]> { - def "": Intrinsic; - def _predicated: Intrinsic; + list props = [IntrNoMem], + list sdprops = []> { + def "": Intrinsic; + def _predicated: Intrinsic; } multiclass MVEPredicatedM rets, list params, LLVMType pred = llvm_anyvector_ty, @@ -963,16 +964,17 @@ defm int_arm_mve_vldr_gather_base: MVEPredicated< [llvm_anyvector_ty], [llvm_anyvector_ty, llvm_i32_ty], - llvm_anyvector_ty, [IntrReadMem]>; + llvm_anyvector_ty, [IntrReadMem], [SDNPMemOperand]>; defm int_arm_mve_vldr_gather_base_wb: MVEPredicated< [llvm_anyvector_ty, llvm_anyvector_ty], - [LLVMMatchType<1>, llvm_i32_ty], llvm_anyvector_ty, [IntrReadMem]>; + [LLVMMatchType<1>, llvm_i32_ty], llvm_anyvector_ty, [IntrReadMem], + [SDNPMemOperand]>; defm int_arm_mve_vstr_scatter_base: MVEPredicated< [], [llvm_anyvector_ty, llvm_i32_ty, llvm_anyvector_ty], - llvm_anyvector_ty, [IntrWriteMem]>; + llvm_anyvector_ty, [IntrWriteMem], [SDNPMemOperand]>; defm int_arm_mve_vstr_scatter_base_wb: MVEPredicated< [llvm_anyvector_ty], [LLVMMatchType<0>, llvm_i32_ty, llvm_anyvector_ty], - llvm_anyvector_ty, [IntrWriteMem]>; + llvm_anyvector_ty, [IntrWriteMem], [SDNPMemOperand]>; // gather_offset takes three i32 parameters. The first is the size of // memory element loaded, in bits. The second is a left bit shift to @@ -985,10 +987,12 @@ // narrows rather than widening, it doesn't have the last one. defm int_arm_mve_vldr_gather_offset: MVEPredicated< [llvm_anyvector_ty], [llvm_anyptr_ty, llvm_anyvector_ty, - llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], llvm_anyvector_ty, [IntrReadMem]>; + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], llvm_anyvector_ty, [IntrReadMem], + [SDNPMemOperand]>; defm int_arm_mve_vstr_scatter_offset: MVEPredicated< [], [llvm_anyptr_ty, llvm_anyvector_ty, llvm_anyvector_ty, - llvm_i32_ty, llvm_i32_ty], llvm_anyvector_ty, [IntrWriteMem]>; + llvm_i32_ty, llvm_i32_ty], llvm_anyvector_ty, [IntrWriteMem], + [SDNPMemOperand]>; def int_arm_mve_shl_imm_predicated: Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>], Index: llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -2549,6 +2549,7 @@ ReplaceUses(SDValue(N, 0), SDValue(New, 1)); ReplaceUses(SDValue(N, 1), SDValue(New, 0)); ReplaceUses(SDValue(N, 2), SDValue(New, 2)); + transferMemOperands(N, New); CurDAG->RemoveDeadNode(N); } Index: llvm/lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- llvm/lib/Target/ARM/ARMISelLowering.cpp +++ llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -19318,6 +19318,66 @@ Info.flags = MachineMemOperand::MOStore; return true; } + case Intrinsic::arm_mve_vldr_gather_base: + case Intrinsic::arm_mve_vldr_gather_base_predicated: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.ptrVal = nullptr; + Info.memVT = MVT::getVT(I.getType()); + Info.align = Align(1); + Info.flags |= MachineMemOperand::MOLoad; + return true; + } + case Intrinsic::arm_mve_vldr_gather_base_wb: + case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.ptrVal = nullptr; + Info.memVT = MVT::getVT(I.getType()->getContainedType(0)); + Info.align = Align(1); + Info.flags |= MachineMemOperand::MOLoad; + return true; + } + case Intrinsic::arm_mve_vldr_gather_offset: + case Intrinsic::arm_mve_vldr_gather_offset_predicated: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.ptrVal = nullptr; + MVT DataVT = MVT::getVT(I.getType()); + unsigned MemSize = cast(I.getArgOperand(2))->getZExtValue(); + Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize), + DataVT.getVectorNumElements()); + Info.align = Align(1); + Info.flags |= MachineMemOperand::MOLoad; + return true; + } + case Intrinsic::arm_mve_vstr_scatter_base: + case Intrinsic::arm_mve_vstr_scatter_base_predicated: { + Info.opc = ISD::INTRINSIC_VOID; + Info.ptrVal = nullptr; + Info.memVT = MVT::getVT(I.getArgOperand(2)->getType()); + Info.align = Align(1); + Info.flags |= MachineMemOperand::MOStore; + return true; + } + case Intrinsic::arm_mve_vstr_scatter_base_wb: + case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.ptrVal = nullptr; + Info.memVT = MVT::getVT(I.getArgOperand(2)->getType()); + Info.align = Align(1); + Info.flags |= MachineMemOperand::MOStore; + return true; + } + case Intrinsic::arm_mve_vstr_scatter_offset: + case Intrinsic::arm_mve_vstr_scatter_offset_predicated: { + Info.opc = ISD::INTRINSIC_VOID; + Info.ptrVal = nullptr; + MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType()); + unsigned MemSize = cast(I.getArgOperand(3))->getZExtValue(); + Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize), + DataVT.getVectorNumElements()); + Info.align = Align(1); + Info.flags |= MachineMemOperand::MOStore; + return true; + } case Intrinsic::arm_ldaex: case Intrinsic::arm_ldrex: { auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); Index: llvm/test/CodeGen/Thumb2/mve-gatherscatter-mmo.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gatherscatter-mmo.ll +++ llvm/test/CodeGen/Thumb2/mve-gatherscatter-mmo.ll @@ -2,7 +2,7 @@ define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_gather_offset_s16(i8* %base, <8 x i16> %offset) { ; CHECK-LABEL: name: test_vldrbq_gather_offset_s16 -; CHECK: early-clobber %2:mqpr = MVE_VLDRBS16_rq %0, %1, 0, $noreg +; CHECK: early-clobber %2:mqpr = MVE_VLDRBS16_rq %0, %1, 0, $noreg :: (load 8, align 1) entry: %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i8.v8i16(i8* %base, <8 x i16> %offset, i32 8, i32 0, i32 0) ret <8 x i16> %0 @@ -10,7 +10,7 @@ define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_gather_offset_z_s32(i8* %base, <4 x i32> %offset, i16 zeroext %p) { ; CHECK-LABEL: name: test_vldrbq_gather_offset_z_s32 -; CHECK: early-clobber %4:mqpr = MVE_VLDRBS32_rq %0, %1, 1, killed %3 +; CHECK: early-clobber %4:mqpr = MVE_VLDRBS32_rq %0, %1, 1, killed %3 :: (load 4, align 1) entry: %0 = zext i16 %p to i32 %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) @@ -20,7 +20,7 @@ define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_s64(<2 x i64> %addr) { ; CHECK-LABEL: name: test_vldrdq_gather_base_s64 -; CHECK: early-clobber %1:mqpr = MVE_VLDRDU64_qi %0, 616, 0, $noreg +; CHECK: early-clobber %1:mqpr = MVE_VLDRDU64_qi %0, 616, 0, $noreg :: (load 16, align 1) entry: %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.base.v2i64.v2i64(<2 x i64> %addr, i32 616) ret <2 x i64> %0 @@ -28,7 +28,7 @@ define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_base_z_f32(<4 x i32> %addr, i16 zeroext %p) { ; CHECK-LABEL: name: test_vldrwq_gather_base_z_f32 -; CHECK: early-clobber %3:mqpr = MVE_VLDRWU32_qi %0, -300, 1, killed %2 +; CHECK: early-clobber %3:mqpr = MVE_VLDRWU32_qi %0, -300, 1, killed %2 :: (load 16, align 1) entry: %0 = zext i16 %p to i32 %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) @@ -38,7 +38,7 @@ define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_s64(<2 x i64>* %addr) { ; CHECK-LABEL: name: test_vldrdq_gather_base_wb_s64 -; CHECK: %2:mqpr, early-clobber %3:mqpr = MVE_VLDRDU64_qi_pre %1, 576, 0, $noreg +; CHECK: %2:mqpr, early-clobber %3:mqpr = MVE_VLDRDU64_qi_pre %1, 576, 0, $noreg :: (load 16, align 1) entry: %0 = load <2 x i64>, <2 x i64>* %addr, align 8 %1 = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.v2i64.v2i64(<2 x i64> %0, i32 576) @@ -50,7 +50,7 @@ define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_base_wb_z_f32(<4 x i32>* %addr, i16 zeroext %p) { ; CHECK-LABEL: name: test_vldrwq_gather_base_wb_z_f32 -; CHECK: %4:mqpr, early-clobber %5:mqpr = MVE_VLDRWU32_qi_pre %3, -352, 1, killed %2 +; CHECK: %4:mqpr, early-clobber %5:mqpr = MVE_VLDRWU32_qi_pre %3, -352, 1, killed %2 :: (load 16, align 1) entry: %0 = load <4 x i32>, <4 x i32>* %addr, align 8 %1 = zext i16 %p to i32 @@ -65,7 +65,7 @@ define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_s32(i8* %base, <4 x i32> %offset, <4 x i32> %value) { ; CHECK-LABEL: name: test_vstrbq_scatter_offset_s32 -; CHECK: MVE_VSTRB32_rq %2, %0, %1, 0, $noreg +; CHECK: MVE_VSTRB32_rq %2, %0, %1, 0, $noreg :: (store 4, align 1) entry: call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v4i32.v4i32(i8* %base, <4 x i32> %offset, <4 x i32> %value, i32 8, i32 0) ret void @@ -73,7 +73,7 @@ define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_s8(i8* %base, <16 x i8> %offset, <16 x i8> %value, i16 zeroext %p) { ; CHECK-LABEL: name: test_vstrbq_scatter_offset_p_s8 -; CHECK: MVE_VSTRB8_rq %2, %0, %1, 1, killed %4 +; CHECK: MVE_VSTRB8_rq %2, %0, %1, 1, killed %4 :: (store 16, align 1) entry: %0 = zext i16 %p to i32 %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) @@ -83,7 +83,7 @@ define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_u64(<2 x i64> %addr, <2 x i64> %value) { ; CHECK-LABEL: name: test_vstrdq_scatter_base_u64 -; CHECK: MVE_VSTRD64_qi %1, %0, -472, 0, $noreg +; CHECK: MVE_VSTRD64_qi %1, %0, -472, 0, $noreg :: (store 16, align 1) entry: call void @llvm.arm.mve.vstr.scatter.base.v2i64.v2i64(<2 x i64> %addr, i32 -472, <2 x i64> %value) ret void @@ -91,7 +91,7 @@ define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_p_s64(<2 x i64> %addr, <2 x i64> %value, i16 zeroext %p) { ; CHECK-LABEL: name: test_vstrdq_scatter_base_p_s64 -; CHECK: MVE_VSTRD64_qi %1, %0, 888, 1, killed %3 +; CHECK: MVE_VSTRD64_qi %1, %0, 888, 1, killed %3 :: (store 16, align 1) entry: %0 = zext i16 %p to i32 %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) @@ -101,7 +101,7 @@ define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_wb_s64(<2 x i64>* %addr, <2 x i64> %value) { ; CHECK-LABEL: name: test_vstrdq_scatter_base_wb_s64 -; CHECK: %3:mqpr = MVE_VSTRD64_qi_pre %1, %2, 208, 0, $noreg +; CHECK: %3:mqpr = MVE_VSTRD64_qi_pre %1, %2, 208, 0, $noreg :: (store 16, align 1) entry: %0 = load <2 x i64>, <2 x i64>* %addr, align 8 %1 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.v2i64.v2i64(<2 x i64> %0, i32 208, <2 x i64> %value) @@ -111,7 +111,7 @@ define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_wb_p_s64(<2 x i64>* %addr, <2 x i64> %value, i16 zeroext %p) { ; CHECK-LABEL: name: test_vstrdq_scatter_base_wb_p_s64 -; CHECK: %5:mqpr = MVE_VSTRD64_qi_pre %1, %3, 248, 1, killed %4 +; CHECK: %5:mqpr = MVE_VSTRD64_qi_pre %1, %3, 248, 1, killed %4 :: (store 16, align 1) entry: %0 = load <2 x i64>, <2 x i64>* %addr, align 8 %1 = zext i16 %p to i32 Index: llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -1163,17 +1163,17 @@ ; CHECK-NEXT: bne .LBB7_6 ; CHECK-NEXT: b .LBB7_2 ; CHECK-NEXT: .LBB7_9: -; CHECK-NEXT: adr r1, .LCPI7_0 ; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: vadd.i32 q1, q1, r1 ; CHECK-NEXT: lsrs r0, r0, #3 -; CHECK-NEXT: vldrw.u32 q2, [q1, #64]! ; CHECK-NEXT: wls lr, r0, .LBB7_12 ; CHECK-NEXT: @ %bb.10: +; CHECK-NEXT: adr r0, .LCPI7_0 ; CHECK-NEXT: vldr s0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vldrw.u32 q2, [q1, #64]! ; CHECK-NEXT: vldrw.u32 q0, [q1, #16] ; CHECK-NEXT: .LBB7_11: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q3, [q1, #24]