diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -592,6 +592,13 @@
 MachineInstr *findCMPToFoldIntoCBZ(MachineInstr *Br,
                                    const TargetRegisterInfo *TRI);
 
+void addUnpredicatedMveVpredNOp(MachineInstrBuilder &MIB);
+void addUnpredicatedMveVpredROp(MachineInstrBuilder &MIB, unsigned DestReg);
+
+void addPredicatedMveVpredNOp(MachineInstrBuilder &MIB, unsigned Cond);
+void addPredicatedMveVpredROp(MachineInstrBuilder &MIB, unsigned Cond,
+                              unsigned Inactive);
+
 } // end namespace llvm
 
 #endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -805,6 +805,28 @@
       .addReg(ARM::CPSR, RegState::Implicit | RegState::Define);
 }
 
+void llvm::addUnpredicatedMveVpredNOp(MachineInstrBuilder &MIB) {
+  MIB.addImm(ARMVCC::None);
+  MIB.addReg(0);
+}
+
+void llvm::addUnpredicatedMveVpredROp(MachineInstrBuilder &MIB,
+                                      unsigned DestReg) {
+  addUnpredicatedMveVpredNOp(MIB);
+  MIB.addReg(DestReg, RegState::Undef);
+}
+
+void llvm::addPredicatedMveVpredNOp(MachineInstrBuilder &MIB, unsigned Cond) {
+  MIB.addImm(Cond);
+  MIB.addReg(ARM::VPR, RegState::Implicit);
+}
+
+void llvm::addPredicatedMveVpredROp(MachineInstrBuilder &MIB,
+                                    unsigned Cond, unsigned Inactive) {
+  addPredicatedMveVpredNOp(MIB, Cond);
+  MIB.addReg(Inactive);
+}
+
 void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const DebugLoc &DL, unsigned DestReg,
@@ -833,14 +855,17 @@
   else if (ARM::DPRRegClass.contains(DestReg, SrcReg) && Subtarget.hasFP64())
     Opc = ARM::VMOVD;
   else if (ARM::QPRRegClass.contains(DestReg, SrcReg))
-    Opc = ARM::VORRq;
+    Opc = Subtarget.hasNEON() ? ARM::VORRq : ARM::MVE_VORR;
 
   if (Opc) {
     MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc), DestReg);
     MIB.addReg(SrcReg, getKillRegState(KillSrc));
-    if (Opc == ARM::VORRq)
+    if (Opc == ARM::VORRq || Opc == ARM::MVE_VORR)
       MIB.addReg(SrcReg, getKillRegState(KillSrc));
-    MIB.add(predOps(ARMCC::AL));
+    if (Opc == ARM::MVE_VORR)
+      addUnpredicatedMveVpredROp(MIB, DestReg);
+    else
+      MIB.add(predOps(ARMCC::AL));
     return;
   }
 
@@ -901,6 +926,30 @@
   } else if (DestReg == ARM::CPSR) {
     copyToCPSR(MBB, I, SrcReg, KillSrc, Subtarget);
     return;
+  } else if (DestReg == ARM::VPR) {
+    assert(ARM::GPRRegClass.contains(SrcReg));
+    BuildMI(MBB, I, I->getDebugLoc(), get(ARM::VMSR_P0), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc))
+        .add(predOps(ARMCC::AL));
+    return;
+  } else if (SrcReg == ARM::VPR) {
+    assert(ARM::GPRRegClass.contains(DestReg));
+    BuildMI(MBB, I, I->getDebugLoc(), get(ARM::VMRS_P0), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc))
+        .add(predOps(ARMCC::AL));
+    return;
+  } else if (DestReg == ARM::FPSCR_NZCV) {
+    assert(ARM::GPRRegClass.contains(SrcReg));
+    BuildMI(MBB, I, I->getDebugLoc(), get(ARM::VMSR_FPSCR_NZCVQC), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc))
+        .add(predOps(ARMCC::AL));
+    return;
+  } else if (SrcReg == ARM::FPSCR_NZCV) {
+    assert(ARM::GPRRegClass.contains(DestReg));
+    BuildMI(MBB, I, I->getDebugLoc(), get(ARM::VMRS_FPSCR_NZCVQC), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc))
+        .add(predOps(ARMCC::AL));
+    return;
   }
 
   assert(Opc && "Impossible reg-to-reg copy");
@@ -1010,6 +1059,13 @@
           .addImm(0)
           .addMemOperand(MMO)
           .add(predOps(ARMCC::AL));
+    } else if (ARM::VCCRRegClass.hasSubClassEq(RC)) {
+      BuildMI(MBB, I, DebugLoc(), get(ARM::VSTR_P0_off))
+          .addReg(SrcReg, getKillRegState(isKill))
+          .addFrameIndex(FI)
+          .addImm(0)
+          .addMemOperand(MMO)
+          .add(predOps(ARMCC::AL));
     } else
       llvm_unreachable("Unknown reg class!");
     break;
@@ -1042,7 +1098,7 @@
       llvm_unreachable("Unknown reg class!");
     break;
   case 16:
-    if (ARM::DPairRegClass.hasSubClassEq(RC)) {
+    if (ARM::DPairRegClass.hasSubClassEq(RC) && Subtarget.hasNEON()) {
       // Use aligned spills if the stack can be realigned.
       if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
         BuildMI(MBB, I, DebugLoc(), get(ARM::VST1q64))
@@ -1058,6 +1114,14 @@
             .addMemOperand(MMO)
             .add(predOps(ARMCC::AL));
       }
+    } else if (ARM::QPRRegClass.hasSubClassEq(RC) &&
+               Subtarget.hasMVEIntegerOps()) {
+      auto MIB = BuildMI(MBB, I, DebugLoc(), get(ARM::MVE_VSTRWU32_off));
+      MIB.addReg(SrcReg, getKillRegState(isKill))
+          .addFrameIndex(FI)
+          .addImm(0)
+          .addMemOperand(MMO);
+      addUnpredicatedMveVpredNOp(MIB);
     } else
       llvm_unreachable("Unknown reg class!");
     break;
@@ -1155,6 +1219,13 @@
       return MI.getOperand(0).getReg();
     }
     break;
+  case ARM::VSTR_P0_off:
+    if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() &&
+        MI.getOperand(1).getImm() == 0) {
+      FrameIndex = MI.getOperand(0).getIndex();
+      return ARM::P0;
+    }
+    break;
   case ARM::VST1q64:
   case ARM::VST1d64TPseudo:
   case ARM::VST1d64QPseudo:
@@ -1225,6 +1296,12 @@
           .addImm(0)
          .addMemOperand(MMO)
          .add(predOps(ARMCC::AL));
+    } else if (ARM::VCCRRegClass.hasSubClassEq(RC)) {
+      BuildMI(MBB, I, DL, get(ARM::VLDR_P0_off), DestReg)
+          .addFrameIndex(FI)
+          .addImm(0)
+          .addMemOperand(MMO)
+          .add(predOps(ARMCC::AL));
     } else
       llvm_unreachable("Unknown reg class!");
     break;
@@ -1261,7 +1338,7 @@
       llvm_unreachable("Unknown reg class!");
     break;
   case 16:
-    if (ARM::DPairRegClass.hasSubClassEq(RC)) {
+    if (ARM::DPairRegClass.hasSubClassEq(RC) && Subtarget.hasNEON()) {
       if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
         BuildMI(MBB, I, DL, get(ARM::VLD1q64), DestReg)
             .addFrameIndex(FI)
@@ -1274,6 +1351,13 @@
             .addMemOperand(MMO)
             .add(predOps(ARMCC::AL));
       }
+    } else if (ARM::QPRRegClass.hasSubClassEq(RC) &&
+               Subtarget.hasMVEIntegerOps()) {
+      auto MIB = BuildMI(MBB, I, DL, get(ARM::MVE_VLDRWU32_off), DestReg);
+      MIB.addFrameIndex(FI)
+          .addImm(0)
+          .addMemOperand(MMO);
+      addUnpredicatedMveVpredNOp(MIB);
     } else
       llvm_unreachable("Unknown reg class!");
     break;
@@ -1370,6 +1454,13 @@
       return MI.getOperand(0).getReg();
     }
     break;
+  case ARM::VLDR_P0_off:
+    if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() &&
+        MI.getOperand(1).getImm() == 0) {
+      FrameIndex = MI.getOperand(0).getIndex();
+      return ARM::P0;
+    }
+    break;
   case ARM::VLD1q64:
   case ARM::VLD1d8TPseudo:
   case ARM::VLD1d16TPseudo:
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -146,6 +146,9 @@
                              SDValue &OffImm);
   bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
                                   SDValue &OffImm);
+  template <unsigned Shift>
+  bool SelectT2AddrModeImm7(SDValue N, SDValue &Base,
+                            SDValue &OffImm);
   bool SelectT2AddrModeSoReg(SDValue N, SDValue &Base,
                              SDValue &OffReg, SDValue &ShImm);
   bool SelectT2AddrModeExclusive(SDValue N, SDValue &Base, SDValue &OffImm);
@@ -1268,6 +1271,35 @@
   return false;
 }
 
+template <unsigned Shift>
+bool ARMDAGToDAGISel::SelectT2AddrModeImm7(SDValue N,
+                                           SDValue &Base, SDValue &OffImm) {
+  if (N.getOpcode() == ISD::SUB ||
+      CurDAG->isBaseWithConstantOffset(N)) {
+    if (auto RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      int RHSC = (int)RHS->getZExtValue();
+      if (N.getOpcode() == ISD::SUB)
+        RHSC = -RHSC;
+
+      if (isShiftedInt<7, Shift>(RHSC)) {
+        Base = N.getOperand(0);
+        if (Base.getOpcode() == ISD::FrameIndex) {
+          int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+          Base = CurDAG->getTargetFrameIndex(
+              FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+        }
+        OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
+        return true;
+      }
+    }
+  }
+
+  // Base only.
+  Base = N;
+  OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+  return true;
+}
+
 bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N,
                                             SDValue &Base,
                                             SDValue &OffReg, SDValue &ShImm) {
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -812,6 +812,8 @@
                                        MachineBasicBlock *MBB) const;
     MachineBasicBlock *EmitLowered__dbzchk(MachineInstr &MI,
                                            MachineBasicBlock *MBB) const;
+    void addMVEITypes();
+    void addMVEFPTypes();
   };
 
   enum NEONModImmType {
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -221,6 +221,32 @@
   addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
 }
 
+void ARMTargetLowering::addMVEITypes() {
+  const MVT iTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 };
+
+  for (auto VT : iTypes) {
+    addRegisterClass(VT, &ARM::QPRRegClass);
+    for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+      setOperationAction(Opc, VT, Expand);
+    setOperationAction(ISD::BITCAST, VT, Legal);
+    setOperationAction(ISD::LOAD, VT, Legal);
+    setOperationAction(ISD::STORE, VT, Legal);
+  }
+}
+
+void ARMTargetLowering::addMVEFPTypes() {
+  const MVT fTypes[] = { MVT::v2f64, MVT::v4f32, MVT::v8f16 };
+
+  for (MVT VT : fTypes) {
+    addRegisterClass(VT, &ARM::QPRRegClass);
+    for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+      setOperationAction(Opc, VT, Expand);
+    setOperationAction(ISD::BITCAST, VT, Legal);
+    setOperationAction(ISD::LOAD, VT, Legal);
+    setOperationAction(ISD::STORE, VT, Legal);
+  }
+}
+
 ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
                                      const ARMSubtarget &STI)
     : TargetLowering(TM), Subtarget(&STI) {
@@ -548,6 +574,12 @@
   setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
   setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);
 
+  if (Subtarget->hasMVEIntegerOps())
+    addMVEITypes();
+
+  if (Subtarget->hasMVEFloatOps())
+    addMVEFPTypes();
+
   if (Subtarget->hasNEON()) {
     addDRTypeForNEON(MVT::v2f32);
     addDRTypeForNEON(MVT::v8i8);
@@ -566,11 +598,11 @@
       addQRTypeForNEON(MVT::v8f16);
       addDRTypeForNEON(MVT::v4f16);
     }
+  }
 
+  if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
     // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
-    // neither Neon nor VFP support any arithmetic operations on it.
-    // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
-    // supported for v4f32.
+    // none of Neon, MVE or VFP supports any arithmetic operations on it.
     setOperationAction(ISD::FADD, MVT::v2f64, Expand);
     setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
     setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
@@ -604,7 +636,11 @@
     setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
     setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
    setOperationAction(ISD::FMA, MVT::v2f64, Expand);
+  }
 
+  if (Subtarget->hasNEON()) {
+    // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
+    // supported for v4f32.
     setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
     setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
     setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -4027,3 +4027,63 @@
   let Unpredictable{20-21} = 0b11;
   let Unpredictable{11-0} = 0b111111111111;
 }
+
+class MVE_unpred_vector_store_typed<Instruction RegImmInst, PatFrag StoreKind,
+                                    ValueType Ty>
+  : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7:$addr),
+        (RegImmInst (Ty MQPR:$val), t2addrmode_imm7:$addr)>;
+
+multiclass MVE_unpred_vector_store {
+  def : MVE_unpred_vector_store_typed;
+  def : MVE_unpred_vector_store_typed;
+  def : MVE_unpred_vector_store_typed;
+  def : MVE_unpred_vector_store_typed;
+  def : MVE_unpred_vector_store_typed;
+  def : MVE_unpred_vector_store_typed;
+}
+
+class MVE_unpred_vector_load_typed<Instruction RegImmInst, PatFrag LoadKind,
+                                   ValueType Ty>
+  : Pat<(Ty (LoadKind t2addrmode_imm7:$addr)),
+        (Ty (RegImmInst t2addrmode_imm7:$addr))>;
+multiclass MVE_unpred_vector_load {
+  def : MVE_unpred_vector_load_typed;
+  def : MVE_unpred_vector_load_typed;
+  def : MVE_unpred_vector_load_typed;
+  def : MVE_unpred_vector_load_typed;
+  def : MVE_unpred_vector_load_typed;
+  def : MVE_unpred_vector_load_typed;
+}
+
+let Predicates = [HasMVEInt, IsLE] in {
+  defm : MVE_unpred_vector_store;
+  defm : MVE_unpred_vector_store;
+  defm : MVE_unpred_vector_store;
+
+  defm : MVE_unpred_vector_load;
+  defm : MVE_unpred_vector_load;
+  defm : MVE_unpred_vector_load;
+
+  def : Pat<(v16i1 (load t2addrmode_imm7<2>:$addr)),
+            (v16i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
+  def : Pat<(v8i1 (load t2addrmode_imm7<2>:$addr)),
+            (v8i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
+  def : Pat<(v4i1 (load t2addrmode_imm7<2>:$addr)),
+            (v4i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
+}
+
+let Predicates = [HasMVEInt, IsBE] in {
+  def : MVE_unpred_vector_store_typed;
+  def : MVE_unpred_vector_store_typed;
+  def : MVE_unpred_vector_store_typed;
+  def : MVE_unpred_vector_store_typed;
+  def : MVE_unpred_vector_store_typed;
+
+  def : MVE_unpred_vector_load_typed;
+  def : MVE_unpred_vector_load_typed;
+  def : MVE_unpred_vector_load_typed;
+  def : MVE_unpred_vector_load_typed;
+  def : MVE_unpred_vector_load_typed;
+}
diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.td b/llvm/lib/Target/ARM/ARMRegisterInfo.td
--- a/llvm/lib/Target/ARM/ARMRegisterInfo.td
+++ b/llvm/lib/Target/ARM/ARMRegisterInfo.td
@@ -474,8 +474,11 @@
                      128, (interleave QPR, TuplesOE2D)> {
   // Allocate starting at non-VFP2 registers D16-D31 first.
   // Prefer even-odd pairs as they are easier to copy.
-  let AltOrders = [(add (rotl QPR, 8), (rotl DPair, 16))];
-  let AltOrderSelect = [{ return 1; }];
+  let AltOrders = [(add (rotl QPR, 8), (rotl DPair, 16)),
+                   (add (trunc QPR, 8), (trunc DPair, 16))];
+  let AltOrderSelect = [{
+    return 1 + MF.getSubtarget<ARMSubtarget>().hasMVEIntegerOps();
+  }];
 }
 
 // Pseudo-registers representing even-odd pairs of GPRs from R1 to R13/SP.
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h
@@ -68,6 +68,10 @@
 /// to llvm::getInstrPredicate except it returns AL for conditional branch
 /// instructions which are "predicated", but are not in IT blocks.
 ARMCC::CondCodes getITInstrPredicate(const MachineInstr &MI, unsigned &PredReg);
+
+// getVPTInstrPredicate: VPT analogue of getITInstrPredicate.
+ARMVCC::VPTCodes getVPTInstrPredicate(const MachineInstr &MI,
+                                      unsigned &PredReg);
 }
 
 #endif
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -691,3 +691,28 @@
     return ARMCC::AL;
   return getInstrPredicate(MI, PredReg);
 }
+
+static int findFirstVPTPredOperandIdx(const llvm::MachineInstr &MI) {
+  const llvm::MCInstrDesc &MCID = MI.getDesc();
+
+  if (!MCID.OpInfo)
+    return -1;
+
+  for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i)
+    if (llvm::ARM::isVpred(MCID.OpInfo[i].OperandType))
+      return i;
+
+  return -1;
+}
+
+ARMVCC::VPTCodes llvm::getVPTInstrPredicate(const MachineInstr &MI,
+                                            unsigned &PredReg) {
+  int PIdx = findFirstVPTPredOperandIdx(MI);
+  if (PIdx == -1) {
+    PredReg = 0;
+    return ARMVCC::None;
+  }
+
+  PredReg = MI.getOperand(PIdx+1).getReg();
+  return (ARMVCC::VPTCodes)MI.getOperand(PIdx).getImm();
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-basic.ll b/llvm/test/CodeGen/Thumb2/mve-basic.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-basic.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc <4 x i32> @vector_add_by_value(<4 x i32> %lhs, <4 x i32>%rhs) {
+; CHECK-LABEL: vector_add_by_value:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    bx lr
+  %result = tail call <4 x i32> asm "vadd.i32 $0,$1,$2", "=t,t,t"(<4 x i32> %lhs, <4 x i32> %rhs)
+  ret <4 x i32> %result
+}
+
+define void @vector_add_by_reference(<4 x i32>* %resultp, <4 x i32>* %lhsp, <4 x i32>* %rhsp) {
+; CHECK-LABEL: vector_add_by_reference:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vldrw.u32 q1, [r2]
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    bx lr
+  %lhs = load <4 x i32>, <4 x i32>* %lhsp, align 16
+  %rhs = load <4 x i32>, <4 x i32>* %rhsp, align 16
+  %result = tail call <4 x i32> asm "vadd.i32 $0,$1,$2", "=t,t,t"(<4 x i32> %lhs, <4 x i32> %rhs)
+  store <4 x i32> %result, <4 x i32>* %resultp, align 16
+  ret void
+}
diff --git a/llvm/utils/UpdateTestChecks/asm.py b/llvm/utils/UpdateTestChecks/asm.py
--- a/llvm/utils/UpdateTestChecks/asm.py
+++ b/llvm/utils/UpdateTestChecks/asm.py
@@ -290,6 +290,7 @@
     'thumbv8-eabi': (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_RE),
     'thumbv8m.base': (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_RE),
     'thumbv8m.main': (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_RE),
+    'thumbv8.1m.main': (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_RE),
     'armv6': (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_RE),
     'armv7': (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_RE),
     'armv7-eabi': (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_RE),
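
Not part of the patch itself: a minimal usage sketch for reviewers, showing how the new vpred operand helpers in ARMBaseInstrInfo are meant to be used when an MVE instruction is created by hand. It mirrors what copyPhysReg does in this change; the function name emitMveCopyViaVORR is hypothetical and only for illustration.

// Illustrative sketch only (not part of this change). Emit an unpredicated
// MVE register copy, DestReg = VORR SrcReg, SrcReg, using the new helpers.
#include "ARMBaseInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

using namespace llvm;

static void emitMveCopyViaVORR(const ARMBaseInstrInfo &TII,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL, unsigned DestReg,
                               unsigned SrcReg) {
  MachineInstrBuilder MIB =
      BuildMI(MBB, I, DL, TII.get(ARM::MVE_VORR), DestReg)
          .addReg(SrcReg)
          .addReg(SrcReg);
  // MVE_VORR carries a trailing vpred_r operand bundle: a predication code,
  // a VPR operand, and a source for the inactive lanes. For an unpredicated
  // copy that is ARMVCC::None, no VPR register, and an undef use of DestReg,
  // which is exactly what addUnpredicatedMveVpredROp appends.
  addUnpredicatedMveVpredROp(MIB, DestReg);
}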