diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -601,7 +601,8 @@
 bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
                          unsigned FrameReg, int &Offset,
-                         const ARMBaseInstrInfo &TII);
+                         const ARMBaseInstrInfo &TII,
+                         const TargetRegisterInfo *TRI);
 
 /// Return true if Reg is defd between From and To
 bool registerDefinedBetween(unsigned Reg, MachineBasicBlock::iterator From,
diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
--- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -669,7 +669,7 @@
     Done = rewriteARMFrameIndex(MI, i, BaseReg, Off, TII);
   else {
     assert(AFI->isThumb2Function());
-    Done = rewriteT2FrameIndex(MI, i, BaseReg, Off, TII);
+    Done = rewriteT2FrameIndex(MI, i, BaseReg, Off, TII, this);
   }
   assert(Done && "Unable to resolve frame index!");
   (void)Done;
@@ -781,7 +781,7 @@
     Done = rewriteARMFrameIndex(MI, FIOperandNum, FrameReg, Offset, TII);
   else {
     assert(AFI->isThumb2Function());
-    Done = rewriteT2FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII);
+    Done = rewriteT2FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII, this);
   }
   if (Done)
     return;
@@ -789,21 +789,32 @@
   // If we get here, the immediate doesn't fit into the instruction. We folded
   // as much as possible above, handle the rest, providing a register that is
   // SP+LargeImm.
-  assert((Offset ||
-          (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4 ||
-          (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode6) &&
-         "This code isn't needed if offset already handled!");
+  assert(
+      (Offset ||
+       (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4 ||
+       (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode6 ||
+       (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrModeT2_i7 ||
+       (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrModeT2_i7s2 ||
+       (MI.getDesc().TSFlags & ARMII::AddrModeMask) ==
+           ARMII::AddrModeT2_i7s4) &&
+      "This code isn't needed if offset already handled!");
 
   unsigned ScratchReg = 0;
   int PIdx = MI.findFirstPredOperandIdx();
   ARMCC::CondCodes Pred = (PIdx == -1)
     ? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm();
   Register PredReg = (PIdx == -1) ? Register() : MI.getOperand(PIdx+1).getReg();
-  if (Offset == 0)
+
+  const MCInstrDesc &MCID = MI.getDesc();
+  const TargetRegisterClass *RegClass =
+      TII.getRegClass(MCID, FIOperandNum, this, *MI.getParent()->getParent());
+
+  if (Offset == 0 &&
+      (Register::isVirtualRegister(FrameReg) || RegClass->contains(FrameReg)))
     // Must be addrmode4/6.
     MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false, false, false);
   else {
-    ScratchReg = MF.getRegInfo().createVirtualRegister(&ARM::GPRRegClass);
+    ScratchReg = MF.getRegInfo().createVirtualRegister(RegClass);
     if (!AFI->isThumbFunction())
       emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
                               Offset, Pred, PredReg, TII);
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -470,12 +470,17 @@
 bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
                                unsigned FrameReg, int &Offset,
-                               const ARMBaseInstrInfo &TII) {
+                               const ARMBaseInstrInfo &TII,
+                               const TargetRegisterInfo *TRI) {
   unsigned Opcode = MI.getOpcode();
   const MCInstrDesc &Desc = MI.getDesc();
   unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
   bool isSub = false;
 
+  MachineFunction &MF = *MI.getParent()->getParent();
+  const TargetRegisterClass *RegClass =
+      TII.getRegClass(Desc, FrameRegIdx, TRI, MF);
+
   // Memory operands in inline assembly always use AddrModeT2_i12.
   if (Opcode == ARM::INLINEASM || Opcode == ARM::INLINEASM_BR)
     AddrMode = ARMII::AddrModeT2_i12; // FIXME. mode for thumb2?
@@ -645,10 +650,21 @@
     MachineOperand &ImmOp = MI.getOperand(FrameRegIdx+1);
 
     // Attempt to fold address computation
-    // Common case: small offset, fits into instruction.
+    // Common case: small offset, fits into instruction. We need to make sure
+    // the register class is correct too, for instructions like the MVE
+    // VLDRH.32, which only accepts low tGPR registers.
     int ImmedOffset = Offset / Scale;
     unsigned Mask = (1 << NumBits) - 1;
-    if ((unsigned)Offset <= Mask * Scale) {
+    if ((unsigned)Offset <= Mask * Scale &&
+        (Register::isVirtualRegister(FrameReg) ||
+         RegClass->contains(FrameReg))) {
+      if (Register::isVirtualRegister(FrameReg)) {
+        // Make sure the register class for the virtual register is correct
+        MachineRegisterInfo *MRI = &MF.getRegInfo();
+        if (!MRI->constrainRegClass(FrameReg, RegClass))
+          llvm_unreachable("Unable to constrain virtual register class.");
+      }
+
       // Replace the FrameIndex with fp/sp
       MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
       if (isSub) {
@@ -681,7 +697,8 @@
   }
 
   Offset = (isSub) ? -Offset : Offset;
-  return Offset == 0;
+  return Offset == 0 && (Register::isVirtualRegister(FrameReg) ||
+                         RegClass->contains(FrameReg));
 }
 
 ARMCC::CondCodes llvm::getITInstrPredicate(const MachineInstr &MI,
diff --git a/llvm/test/CodeGen/Thumb2/mve-stack.ll b/llvm/test/CodeGen/Thumb2/mve-stack.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-stack.ll
@@ -0,0 +1,281 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
+
+define arm_aapcs_vfpcc void @vstrw32() {
+; CHECK-LABEL: vstrw32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vstrw.32 q0, [sp, #8]
+; CHECK-NEXT:    bl func
+; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    pop {r7, pc}
entry:
+  %d = alloca [4 x i32], align 2
+  %g = getelementptr inbounds [4 x i32], [4 x i32]* %d, i32 0, i32 2
+  %b = bitcast i32* %g to <4 x i32>*
+  store <4 x i32> zeroinitializer, <4 x i32>* %b, align 2
+  %arraydecay = getelementptr inbounds [4 x i32], [4 x i32]* %d, i32 0, i32 0
+  call arm_aapcs_vfpcc void bitcast (void (...)* @func to void (i32*)*)(i32* %arraydecay)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @vstrh16() {
+; CHECK-LABEL: vstrh16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vstrh.16 q0, [sp, #4]
+; CHECK-NEXT:    bl func
+; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %d = alloca [8 x i16], align 2
+  %g = getelementptr inbounds [8 x i16], [8 x i16]* %d, i32 0, i32 2
+  %b = bitcast i16* %g to <8 x i16>*
+  store <8 x i16> zeroinitializer, <8 x i16>* %b, align 2
+  %arraydecay = getelementptr inbounds [8 x i16], [8 x i16]* %d, i32 0, i32 0
+  call arm_aapcs_vfpcc void bitcast (void (...)* @func to void (i16*)*)(i16* %arraydecay)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @vstrb8() {
+; CHECK-LABEL: vstrb8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vstrh.16 q0, [sp, #2]
+; CHECK-NEXT:    bl func
+; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %d = alloca [16 x i8], align 2
+  %g = getelementptr inbounds [16 x i8], [16 x i8]* %d, i32 0, i32 2
+  %b = bitcast i8* %g to <16 x i8>*
+  store <16 x i8> zeroinitializer, <16 x i8>* %b, align 2
+  %arraydecay = getelementptr inbounds [16 x i8], [16 x i8]* %d, i32 0, i32 0
+  call arm_aapcs_vfpcc void bitcast (void (...)* @func to void (i8*)*)(i8* %arraydecay)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @vstrh32() {
+; CHECK-LABEL: vstrh32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    vmov.i32 q0, #0x6
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vstrh.32 q0, [r0, #4]
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    bl func
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %d = alloca [4 x i16], align 2
+  %g = getelementptr inbounds [4 x i16], [4 x i16]* %d, i32 0, i32 2
+  %b = bitcast i16* %g to <4 x i16>*
+  store <4 x i16> <i16 6, i16 6, i16 6, i16 6>, <4 x i16>* %b, align 2
+  %arraydecay = getelementptr inbounds [4 x i16], [4 x i16]* %d, i32 0, i32 0
+  call arm_aapcs_vfpcc void bitcast (void (...)* @func to void (i16*)*)(i16* %arraydecay)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @vstrb32() {
+; CHECK-LABEL: vstrb32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    vmov.i32 q0, #0x6
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vstrb.32 q0, [r0, #6]
+; CHECK-NEXT:    add r0, sp, #4
+; CHECK-NEXT:    bl func
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %d = alloca [4 x i8], align 2
+  %g = getelementptr inbounds [4 x i8], [4 x i8]* %d, i32 0, i32 2
+  %b = bitcast i8* %g to <4 x i8>*
+  store <4 x i8> <i8 6, i8 6, i8 6, i8 6>, <4 x i8>* %b, align 2
+  %arraydecay = getelementptr inbounds [4 x i8], [4 x i8]* %d, i32 0, i32 0
+  call arm_aapcs_vfpcc void bitcast (void (...)* @func to void (i8*)*)(i8* %arraydecay)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @vstrb16() {
+; CHECK-LABEL: vstrb16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vstrb.16 q0, [r0, #2]
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    bl func
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %d = alloca [8 x i8], align 2
+  %g = getelementptr inbounds [8 x i8], [8 x i8]* %d, i32 0, i32 2
+  %b = bitcast i8* %g to <8 x i8>*
+  store <8 x i8> zeroinitializer, <8 x i8>* %b, align 2
+  %arraydecay = getelementptr inbounds [8 x i8], [8 x i8]* %d, i32 0, i32 0
+  call arm_aapcs_vfpcc void bitcast (void (...)* @func to void (i8*)*)(i8* %arraydecay)
+  ret void
+}
+
+
+define arm_aapcs_vfpcc <4 x i32> @vldrw32() {
+; CHECK-LABEL: vldrw32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    bl func
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #8]
+; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %d = alloca [4 x i32], align 2
+  %arraydecay = getelementptr inbounds [4 x i32], [4 x i32]* %d, i32 0, i32 0
+  call arm_aapcs_vfpcc void bitcast (void (...)* @func to void (i32*)*)(i32* %arraydecay)
+  %g = getelementptr inbounds [4 x i32], [4 x i32]* %d, i32 0, i32 2
+  %b = bitcast i32* %g to <4 x i32>*
+  %l = load <4 x i32>, <4 x i32>* %b, align 2
+  ret <4 x i32> %l
+}
+
+define arm_aapcs_vfpcc <8 x i16> @vldrh16() {
+; CHECK-LABEL: vldrh16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    bl func
+; CHECK-NEXT:    vldrh.u16 q0, [sp, #4]
+; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %d = alloca [8 x i16], align 2
+  %arraydecay = getelementptr inbounds [8 x i16], [8 x i16]* %d, i32 0, i32 0
+  call arm_aapcs_vfpcc void bitcast (void (...)* @func to void (i16*)*)(i16* %arraydecay)
+  %g = getelementptr inbounds [8 x i16], [8 x i16]* %d, i32 0, i32 2
+  %b = bitcast i16* %g to <8 x i16>*
+  %l = load <8 x i16>, <8 x i16>* %b, align 2
+  ret <8 x i16> %l
+}
+
+define arm_aapcs_vfpcc <16 x i8> @vldrb8() {
+; CHECK-LABEL: vldrb8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    bl func
+; CHECK-NEXT:    vldrh.u16 q0, [sp, #2]
+; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %d = alloca [16 x i8], align 2
+  %arraydecay = getelementptr inbounds [16 x i8], [16 x i8]* %d, i32 0, i32 0
+  call arm_aapcs_vfpcc void bitcast (void (...)* @func to void (i8*)*)(i8* %arraydecay)
+  %g = getelementptr inbounds [16 x i8], [16 x i8]* %d, i32 0, i32 2
+  %b = bitcast i8* %g to <16 x i8>*
+  %l = load <16 x i8>, <16 x i8>* %b, align 2
+  ret <16 x i8> %l
+}
+
+define arm_aapcs_vfpcc <4 x i16> @vldrh32() {
+; CHECK-LABEL: vldrh32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    bl func
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vldrh.u32 q0, [r0, #4]
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %d = alloca [4 x i16], align 2
+  %arraydecay = getelementptr inbounds [4 x i16], [4 x i16]* %d, i32 0, i32 0
+  call arm_aapcs_vfpcc void bitcast (void (...)* @func to void (i16*)*)(i16* %arraydecay)
+  %g = getelementptr inbounds [4 x i16], [4 x i16]* %d, i32 0, i32 2
+  %b = bitcast i16* %g to <4 x i16>*
+  %l = load <4 x i16>, <4 x i16>* %b, align 2
+  ret <4 x i16> %l
+}
+
+define arm_aapcs_vfpcc <4 x i8> @vldrb32() {
+; CHECK-LABEL: vldrb32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    add r0, sp, #4
+; CHECK-NEXT:    bl func
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vldrb.u32 q0, [r0, #6]
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %d = alloca [4 x i8], align 2
+  %arraydecay = getelementptr inbounds [4 x i8], [4 x i8]* %d, i32 0, i32 0
+  call arm_aapcs_vfpcc void bitcast (void (...)* @func to void (i8*)*)(i8* %arraydecay)
+  %g = getelementptr inbounds [4 x i8], [4 x i8]* %d, i32 0, i32 2
+  %b = bitcast i8* %g to <4 x i8>*
+  %l = load <4 x i8>, <4 x i8>* %b, align 2
+  ret <4 x i8> %l
+}
+
+define arm_aapcs_vfpcc <8 x i8> @vldrb16() {
+; CHECK-LABEL: vldrb16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    bl func
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vldrb.u16 q0, [r0, #2]
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %d = alloca [8 x i8], align 2
+  %arraydecay = getelementptr inbounds [8 x i8], [8 x i8]* %d, i32 0, i32 0
+  call arm_aapcs_vfpcc void bitcast (void (...)* @func to void (i8*)*)(i8* %arraydecay)
+  %g = getelementptr inbounds [8 x i8], [8 x i8]* %d, i32 0, i32 2
+  %b = bitcast i8* %g to <8 x i8>*
+  %l = load <8 x i8>, <8 x i8>* %b, align 2
+  ret <8 x i8> %l
+}
+
+declare dso_local arm_aapcs_vfpcc void @func(...)