Index: include/llvm/CodeGen/ISDOpcodes.h
===================================================================
--- include/llvm/CodeGen/ISDOpcodes.h
+++ include/llvm/CodeGen/ISDOpcodes.h
@@ -760,6 +760,7 @@
     ATOMIC_LOAD_MAX,
     ATOMIC_LOAD_UMIN,
     ATOMIC_LOAD_UMAX,
+    ATOMIC_LOAD_CLR,
 
     // Masked load and store - consecutive vector load and store operations
     // with additional mask operand that prevents memory accesses to the
Index: include/llvm/CodeGen/SelectionDAGNodes.h
===================================================================
--- include/llvm/CodeGen/SelectionDAGNodes.h
+++ include/llvm/CodeGen/SelectionDAGNodes.h
@@ -1235,6 +1235,7 @@
            N->getOpcode() == ISD::ATOMIC_LOAD_MAX ||
            N->getOpcode() == ISD::ATOMIC_LOAD_UMIN ||
            N->getOpcode() == ISD::ATOMIC_LOAD_UMAX ||
+           N->getOpcode() == ISD::ATOMIC_LOAD_CLR ||
            N->getOpcode() == ISD::ATOMIC_LOAD ||
            N->getOpcode() == ISD::ATOMIC_STORE ||
            N->getOpcode() == ISD::MLOAD ||
@@ -1286,6 +1287,7 @@
            N->getOpcode() == ISD::ATOMIC_LOAD_MAX ||
            N->getOpcode() == ISD::ATOMIC_LOAD_UMIN ||
            N->getOpcode() == ISD::ATOMIC_LOAD_UMAX ||
+           N->getOpcode() == ISD::ATOMIC_LOAD_CLR ||
            N->getOpcode() == ISD::ATOMIC_LOAD ||
            N->getOpcode() == ISD::ATOMIC_STORE;
   }
Index: include/llvm/Target/TargetSelectionDAG.td
===================================================================
--- include/llvm/Target/TargetSelectionDAG.td
+++ include/llvm/Target/TargetSelectionDAG.td
@@ -529,6 +529,8 @@
                     [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 def atomic_store : SDNode<"ISD::ATOMIC_STORE", SDTAtomicStore,
                     [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atomic_load_clr : SDNode<"ISD::ATOMIC_LOAD_CLR", SDTAtomic2,
+                    [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
 
 def masked_store : SDNode<"ISD::MSTORE", SDTMaskedStore,
                     [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
@@ -1066,6 +1068,7 @@
 defm atomic_load_max  : binary_atomic_op<atomic_load_max>;
 defm atomic_load_umin : binary_atomic_op<atomic_load_umin>;
 defm atomic_load_umax : binary_atomic_op<atomic_load_umax>;
+defm atomic_load_clr  : binary_atomic_op<atomic_load_clr>;
 defm atomic_store     : binary_atomic_op<atomic_store>;
 
 def atomic_load_8 :
Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -469,6 +469,7 @@
   case ISD::ATOMIC_LOAD_OR:
   case ISD::ATOMIC_LOAD_XOR:
   case ISD::ATOMIC_LOAD_NAND:
+  case ISD::ATOMIC_LOAD_CLR:
   case ISD::ATOMIC_LOAD_MIN:
   case ISD::ATOMIC_LOAD_MAX:
   case ISD::ATOMIC_LOAD_UMIN:
@@ -5480,6 +5481,7 @@
          Opcode == ISD::ATOMIC_LOAD_OR ||
          Opcode == ISD::ATOMIC_LOAD_XOR ||
          Opcode == ISD::ATOMIC_LOAD_NAND ||
+         Opcode == ISD::ATOMIC_LOAD_CLR ||
          Opcode == ISD::ATOMIC_LOAD_MIN ||
          Opcode == ISD::ATOMIC_LOAD_MAX ||
          Opcode == ISD::ATOMIC_LOAD_UMIN ||
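Note: ATOMIC_LOAD_CLR mirrors the semantics of the LSE LDCLR family, i.e. an
atomic "fetch the old value, then clear the bits set in the operand". A
minimal scalar model of the new node's memory behaviour using std::atomic
(illustrative sketch only, not part of the patch):

    #include <atomic>
    #include <cstdint>

    // ATOMIC_LOAD_CLR atomically performs *Mem &= ~Val and returns the
    // previous value, which is what LDCLR does in hardware.
    uint64_t atomic_load_clr_model(std::atomic<uint64_t> &Mem, uint64_t Val) {
      return Mem.fetch_and(~Val, std::memory_order_seq_cst);
    }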
Index: lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -2660,6 +2660,8 @@
     break;
 
   case ISD::ATOMIC_CMP_SWAP:
+    // If the subtarget has LSE, leave the node for instruction selection;
+    // the CASAL patterns will match it.
+    if (Subtarget->hasLSE())
+      break;
     SelectCMP_SWAP(Node);
     return;
Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -568,6 +568,7 @@
   SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerATOMIC_LOAD_COMBINED(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
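Note: the new hook relies on two identities that hold for two's-complement
integers: sub(v) == add(-v), and and(v) == clr(~v), where clr(x) atomically
clears the bits set in x. A scalar sketch of both rewrites (hypothetical
helper names, not LLVM API):

    #include <atomic>
    #include <cstdint>

    // clr(V) is fetch_and(~V); this is the operation LDCLR provides natively.
    uint64_t fetch_clr(std::atomic<uint64_t> &M, uint64_t V) {
      return M.fetch_and(~V);
    }

    // sub(V) == add(-V): the ATOMIC_LOAD_SUB rewrite.
    uint64_t fetch_sub_via_add(std::atomic<uint64_t> &M, uint64_t V) {
      return M.fetch_add(0 - V);
    }

    // and(V) == clr(~V): the ATOMIC_LOAD_AND rewrite; the explicit inversion
    // here cancels against the inversion LDCLR performs internally.
    uint64_t fetch_and_via_clr(std::atomic<uint64_t> &M, uint64_t V) {
      return fetch_clr(M, ~V);
    }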
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -450,6 +450,18 @@
   setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
 
   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
+  if (Subtarget->hasLSE()) {
+    // LSE does not support SUB and AND natively; handle them by breaking
+    // them into their constituent operations (an ADD of the negation, a CLR
+    // of the inversion). i8/i16 stay on the LL/SC path for now.
+    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
+  }
 
   // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
   // This requires the Performance Monitors extension.
@@ -2518,6 +2530,29 @@
   }
 }
 
+SDValue
+AArch64TargetLowering::LowerATOMIC_LOAD_COMBINED(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
+
+  SDLoc DL(Op);
+  MVT VT = Op->getSimpleValueType(0);
+  SDValue Chain = Op->getOperand(0);
+  SDValue LHS = Op->getOperand(1);
+  SDValue RHS = Op->getOperand(2);
+
+  if (Op.getOpcode() == ISD::ATOMIC_LOAD_SUB) {
+    // An atomic load-sub breaks into a negate and an atomic load-add.
+    RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
+    return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, RHS,
+                         AN->getMemOperand());
+  }
+  if (Op.getOpcode() == ISD::ATOMIC_LOAD_AND) {
+    // An atomic load-and breaks into an invert and an atomic bit-clear.
+    RHS = DAG.getNOT(DL, RHS, VT);
+    return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, DL, VT, Chain, LHS, RHS,
+                         AN->getMemOperand());
+  }
+  llvm_unreachable("Attempt to custom lower an unknown atomic op");
+}
+
 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                                               SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
@@ -2621,6 +2656,9 @@
     return LowerMUL(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN:
     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::ATOMIC_LOAD_SUB:
+  case ISD::ATOMIC_LOAD_AND:
+    return LowerATOMIC_LOAD_COMBINED(Op, DAG);
   case ISD::VECREDUCE_ADD:
   case ISD::VECREDUCE_SMAX:
   case ISD::VECREDUCE_SMIN:
@@ -10480,6 +10518,8 @@
     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
   switch (N->getOpcode()) {
   default:
+    DEBUG(dbgs() << "Unhandled opcode in ReplaceNodeResults: "
+                 << N->getOperationName() << "\n");
+    DEBUG(N->dumprFull());
     llvm_unreachable("Don't know how to custom expand this");
   case ISD::BITCAST:
     ReplaceBITCASTResults(N, Results, DAG);
@@ -10518,6 +10558,13 @@
   case ISD::ATOMIC_CMP_SWAP:
     ReplaceCMP_SWAP_128Results(N, Results, DAG);
     return;
+  case ISD::ATOMIC_LOAD_AND:
+    ReplaceReductionResults(N, Results, DAG, ISD::ATOMIC_LOAD_CLR,
+                            ISD::ATOMIC_LOAD_AND);
+    return;
+  case ISD::ATOMIC_LOAD_SUB:
+    ReplaceReductionResults(N, Results, DAG, ISD::ATOMIC_LOAD_SUB,
+                            ISD::ATOMIC_LOAD_ADD);
+    return;
   }
 }
@@ -10566,7 +10613,14 @@
 TargetLowering::AtomicExpansionKind
 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   unsigned Size = AI->getType()->getPrimitiveSizeInBits();
-  return Size <= 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
+  if (Size > 128)
+    return AtomicExpansionKind::None;
+  // LSE has no NAND instruction, so NAND always takes the LL/SC loop.
+  if (AI->getOperation() == AtomicRMWInst::Nand)
+    return AtomicExpansionKind::LLSC;
+  // i8/i16 And/Sub are not custom-lowered yet (see setOperationAction
+  // above), so keep them on the LL/SC path as well.
+  if ((AI->getOperation() == AtomicRMWInst::And ||
+       AI->getOperation() == AtomicRMWInst::Sub) &&
+      Size < 32)
+    return AtomicExpansionKind::LLSC;
+  // 128-bit operations are also left to LL/SC.
+  return (Subtarget->hasLSE() && Size < 128) ? AtomicExpansionKind::None
+                                             : AtomicExpansionKind::LLSC;
 }
 
 bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
@@ -10576,6 +10630,7 @@
   // on the stack and close enough to the spill slot, this can lead to a
   // situation where the monitor always gets cleared and the atomic operation
   // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
+  // With LSE, CAS is a single instruction and never needs this expansion.
+  if (Subtarget->hasLSE())
+    return false;
   return getTargetMachine().getOptLevel() != 0;
 }
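Note on the expansion policy: AtomicExpansionKind::None keeps the atomicrmw
intact so the LSE patterns can select a single instruction, while LLSC
expands it in IR to an LDXR/STXR retry loop. NAND must stay on the loop path
because LSE offers no read-modify-write that computes ~(old & v) in one
instruction. A scalar model of the loop it therefore needs (illustrative
only, not part of the patch):

    #include <atomic>
    #include <cstdint>

    // Compute the new value from the old one and retry until no other
    // thread has intervened; this mirrors the loop-based expansion.
    uint64_t fetch_nand_model(std::atomic<uint64_t> &M, uint64_t V) {
      uint64_t Old = M.load();
      while (!M.compare_exchange_weak(Old, ~(Old & V)))
        ; // On failure, Old is reloaded with the current value.
      return Old;
    }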
Index: lib/Target/AArch64/AArch64InstrAtomics.td
===================================================================
--- lib/Target/AArch64/AArch64InstrAtomics.td
+++ lib/Target/AArch64/AArch64InstrAtomics.td
@@ -405,3 +405,54 @@
                        (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi,
                             GPR64:$newLo, GPR64:$newHi), []>,
                 Sched<[WriteAtomic]>;
+
+// v8.1a atomic instructions (LSE):
+let Predicates = [HasLSE] in {
+
+def : Pat<(atomic_load_clr_8  GPR64:$Rn, GPR32:$Rs), (LDCLRALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_clr_16 GPR64:$Rn, GPR32:$Rs), (LDCLRALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_clr_32 GPR64:$Rn, GPR32:$Rs), (LDCLRALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_clr_64 GPR64:$Rn, GPR64:$Rs), (LDCLRALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_add_8  GPR64:$Rn, GPR32:$Rs), (LDADDALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_add_16 GPR64:$Rn, GPR32:$Rs), (LDADDALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_add_32 GPR64:$Rn, GPR32:$Rs), (LDADDALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_add_64 GPR64:$Rn, GPR64:$Rs), (LDADDALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_or_8  GPR64:$Rn, GPR32:$Rs), (LDSETALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_or_16 GPR64:$Rn, GPR32:$Rs), (LDSETALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_or_32 GPR64:$Rn, GPR32:$Rs), (LDSETALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_or_64 GPR64:$Rn, GPR64:$Rs), (LDSETALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_xor_8  GPR64:$Rn, GPR32:$Rs), (LDEORALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_xor_16 GPR64:$Rn, GPR32:$Rs), (LDEORALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_xor_32 GPR64:$Rn, GPR32:$Rs), (LDEORALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_xor_64 GPR64:$Rn, GPR64:$Rs), (LDEORALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_max_8  GPR64:$Rn, GPR32:$Rs), (LDSMAXALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_max_16 GPR64:$Rn, GPR32:$Rs), (LDSMAXALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_max_32 GPR64:$Rn, GPR32:$Rs), (LDSMAXALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_max_64 GPR64:$Rn, GPR64:$Rs), (LDSMAXALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_umax_8  GPR64:$Rn, GPR32:$Rs), (LDUMAXALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_umax_16 GPR64:$Rn, GPR32:$Rs), (LDUMAXALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_umax_32 GPR64:$Rn, GPR32:$Rs), (LDUMAXALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_umax_64 GPR64:$Rn, GPR64:$Rs), (LDUMAXALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_min_8  GPR64:$Rn, GPR32:$Rs), (LDSMINALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_min_16 GPR64:$Rn, GPR32:$Rs), (LDSMINALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_min_32 GPR64:$Rn, GPR32:$Rs), (LDSMINALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_min_64 GPR64:$Rn, GPR64:$Rs), (LDSMINALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_umin_8  GPR64:$Rn, GPR32:$Rs), (LDUMINALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_umin_16 GPR64:$Rn, GPR32:$Rs), (LDUMINALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_umin_32 GPR64:$Rn, GPR32:$Rs), (LDUMINALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_umin_64 GPR64:$Rn, GPR64:$Rs), (LDUMINALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_cmp_swap_8  GPR64:$Rn, GPR32:$Rs, GPR32:$Rt), (CASALb GPR32:$Rt, GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_cmp_swap_16 GPR64:$Rn, GPR32:$Rs, GPR32:$Rt), (CASALh GPR32:$Rt, GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_cmp_swap_32 GPR64:$Rn, GPR32:$Rs, GPR32:$Rt), (CASALs GPR32:$Rt, GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_cmp_swap_64 GPR64:$Rn, GPR64:$Rs, GPR64:$Rt), (CASALd GPR64:$Rt, GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_swap_8  GPR64:$Rn, GPR32:$Rs), (SWPALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_swap_16 GPR64:$Rn, GPR32:$Rs), (SWPALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_swap_32 GPR64:$Rn, GPR32:$Rs), (SWPALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_swap_64 GPR64:$Rn, GPR64:$Rs), (SWPALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+} // let Predicates = [HasLSE]
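With these patterns in place, plain C++ atomics should select single LSE
instructions instead of LDXR/STXR loops when targeting ARMv8.1-A. An
illustrative test input (not part of the patch; compiled with, e.g.,
clang++ -O2 --target=aarch64-linux-gnu -march=armv8.1-a -S):

    #include <atomic>

    long fetch_add_one(std::atomic<long> &Counter) {
      // Expected to compile to a single ldaddal rather than a retry loop.
      return Counter.fetch_add(1, std::memory_order_seq_cst);
    }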