Index: llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp =================================================================== --- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -974,10 +974,6 @@ const NVPTXTargetMachine &NTM = static_cast(TM); if (NTM.getDrvInterface() == NVPTX::NVCL) O << ", texmode_independent"; - else { - if (!STI.hasDouble()) - O << ", map_f64_to_f32"; - } if (MAI->doesSupportDebugInformation()) O << ", debug"; Index: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp =================================================================== --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -417,20 +417,13 @@ setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); - if (STI.hasROT64()) { - setOperationAction(ISD::ROTL, MVT::i64, Legal); - setOperationAction(ISD::ROTR, MVT::i64, Legal); - } else { - setOperationAction(ISD::ROTL, MVT::i64, Expand); - setOperationAction(ISD::ROTR, MVT::i64, Expand); - } - if (STI.hasROT32()) { - setOperationAction(ISD::ROTL, MVT::i32, Legal); - setOperationAction(ISD::ROTR, MVT::i32, Legal); - } else { - setOperationAction(ISD::ROTL, MVT::i32, Expand); - setOperationAction(ISD::ROTR, MVT::i32, Expand); - } + // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs + // that don't have h/w rotation we lower them to multi-instruction assembly. 
+ // See ROT*_sw in NVPTXInstrInfo.td + setOperationAction(ISD::ROTL, MVT::i64, Legal); + setOperationAction(ISD::ROTR, MVT::i64, Legal); + setOperationAction(ISD::ROTL, MVT::i32, Legal); + setOperationAction(ISD::ROTR, MVT::i32, Legal); setOperationAction(ISD::ROTL, MVT::i16, Expand); setOperationAction(ISD::ROTR, MVT::i16, Expand); Index: llvm/lib/Target/NVPTX/NVPTXInstrInfo.td =================================================================== --- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -111,28 +111,14 @@ //===----------------------------------------------------------------------===// -def hasAtomRedG32 : Predicate<"Subtarget->hasAtomRedG32()">; -def hasAtomRedS32 : Predicate<"Subtarget->hasAtomRedS32()">; -def hasAtomRedGen32 : Predicate<"Subtarget->hasAtomRedGen32()">; -def useAtomRedG32forGen32 : - Predicate<"!Subtarget->hasAtomRedGen32() && Subtarget->hasAtomRedG32()">; -def hasBrkPt : Predicate<"Subtarget->hasBrkPt()">; -def hasAtomRedG64 : Predicate<"Subtarget->hasAtomRedG64()">; -def hasAtomRedS64 : Predicate<"Subtarget->hasAtomRedS64()">; -def hasAtomRedGen64 : Predicate<"Subtarget->hasAtomRedGen64()">; -def useAtomRedG64forGen64 : - Predicate<"!Subtarget->hasAtomRedGen64() && Subtarget->hasAtomRedG64()">; -def hasAtomAddF32 : Predicate<"Subtarget->hasAtomAddF32()">; def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">; def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">; def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">; def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">; def hasVote : Predicate<"Subtarget->hasVote()">; def hasDouble : Predicate<"Subtarget->hasDouble()">; -def reqPTX20 : Predicate<"Subtarget->reqPTX20()">; def hasLDG : Predicate<"Subtarget->hasLDG()">; def hasLDU : Predicate<"Subtarget->hasLDU()">; -def hasGenericLdSt : Predicate<"Subtarget->hasGenericLdSt()">; def doF32FTZ : Predicate<"useF32FTZ()">; def doNoF32FTZ : Predicate<"!useF32FTZ()">; @@
-961,13 +947,12 @@ (ins f32imm:$a, Float32Regs:$b), "rcp.rn.ftz.f32 \t$dst, $b;", [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, - Requires<[reqPTX20, doF32FTZ]>; + Requires<[doF32FTZ]>; def FDIV321r_prec : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b), "rcp.rn.f32 \t$dst, $b;", - [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, - Requires<[reqPTX20]>; + [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>; // // F32 Accurate division // @@ -976,25 +961,23 @@ (ins Float32Regs:$a, Float32Regs:$b), "div.rn.ftz.f32 \t$dst, $a, $b;", [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, - Requires<[doF32FTZ, reqPTX20]>; + Requires<[doF32FTZ]>; def FDIV32ri_prec_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b), "div.rn.ftz.f32 \t$dst, $a, $b;", [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, - Requires<[doF32FTZ, reqPTX20]>; + Requires<[doF32FTZ]>; def FDIV32rr_prec : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b), "div.rn.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, - Requires<[reqPTX20]>; + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>; def FDIV32ri_prec : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b), "div.rn.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, - Requires<[reqPTX20]>; + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>; // // FMA Index: llvm/lib/Target/NVPTX/NVPTXIntrinsics.td =================================================================== --- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1025,18 +1025,19 @@ multiclass F_ATOMIC_2_imp { + Operand IMMType, SDNode IMM, list Pred> { def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b), !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b;"), [(set 
regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>, - Requires<[Pred]>; + Requires; def imm : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b), !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b;", ""), [(set regclass:$dst, (IntOp ptrclass:$addr, IMM:$b))]>, - Requires<[Pred]>; + Requires; } multiclass F_ATOMIC_2 { + string OpcStr, PatFrag IntOp, Operand IMMType, SDNode IMM, + list Pred = []> { defm p32 : F_ATOMIC_2_imp; defm p64 : F_ATOMIC_2_imp { + Operand IMMType, list Pred> { def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b), !strconcat( "{{ \n\t", @@ -1055,11 +1056,11 @@ "atom", SpaceStr, OpcStr, ".u", TypeStr, " \t$dst, [$addr], temp; \n\t", "}}"), [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>, - Requires<[Pred]>; + Requires; } multiclass F_ATOMIC_2_NEG { + list Pred = []> { defm p32: F_ATOMIC_2_NEG_imp ; defm p64: F_ATOMIC_2_NEG_imp { + Operand IMMType, list Pred> { def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b, regclass:$c), !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"), [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b, regclass:$c))]>, - Requires<[Pred]>; + Requires; def imm1 : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b, regclass:$c), !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"), [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, regclass:$c))]>, - Requires<[Pred]>; + Requires; def imm2 : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b, IMMType:$c), !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;", ""), [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b, imm:$c))]>, - Requires<[Pred]>; + Requires; def imm3 : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b, IMMType:$c), !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"), [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, 
imm:$c))]>, - Requires<[Pred]>; + Requires; } multiclass F_ATOMIC_3 { + string OpcStr, PatFrag IntOp, Operand IMMType, list Pred = []> { defm p32 : F_ATOMIC_3_imp; defm p64 : F_ATOMIC_3_imp; defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2; + atomic_load_add_32_g, i32imm, imm>; defm INT_PTX_ATOM_ADD_S_32 : F_ATOMIC_2; + atomic_load_add_32_s, i32imm, imm>; defm INT_PTX_ATOM_ADD_GEN_32 : F_ATOMIC_2; + atomic_load_add_32_gen, i32imm, imm>; defm INT_PTX_ATOM_ADD_GEN_32_USE_G : F_ATOMIC_2; + ".add", atomic_load_add_32_gen, i32imm, imm>; defm INT_PTX_ATOM_ADD_G_64 : F_ATOMIC_2; + atomic_load_add_64_g, i64imm, imm>; defm INT_PTX_ATOM_ADD_S_64 : F_ATOMIC_2; + atomic_load_add_64_s, i64imm, imm>; defm INT_PTX_ATOM_ADD_GEN_64 : F_ATOMIC_2; + atomic_load_add_64_gen, i64imm, imm>; defm INT_PTX_ATOM_ADD_GEN_64_USE_G : F_ATOMIC_2; + ".add", atomic_load_add_64_gen, i64imm, imm>; defm INT_PTX_ATOM_ADD_G_F32 : F_ATOMIC_2; + atomic_load_add_f32_g, f32imm, fpimm>; defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2; + atomic_load_add_f32_s, f32imm, fpimm>; defm INT_PTX_ATOM_ADD_GEN_F32 : F_ATOMIC_2; + atomic_load_add_f32_gen, f32imm, fpimm>; defm INT_PTX_ATOM_ADD_G_F64 : F_ATOMIC_2; + atomic_load_add_f64_g, f64imm, fpimm, [hasAtomAddF64]>; defm INT_PTX_ATOM_ADD_S_F64 : F_ATOMIC_2; + atomic_load_add_f64_s, f64imm, fpimm, [hasAtomAddF64]>; defm INT_PTX_ATOM_ADD_GEN_F64 : F_ATOMIC_2; + atomic_load_add_f64_gen, f64imm, fpimm, [hasAtomAddF64]>; // atom_sub @@ -1177,21 +1178,21 @@ (atomic_load_sub_64 node:$a, node:$b)>; defm INT_PTX_ATOM_SUB_G_32 : F_ATOMIC_2_NEG; + atomic_load_sub_32_g, i32imm>; defm INT_PTX_ATOM_SUB_G_64 : F_ATOMIC_2_NEG; + atomic_load_sub_64_g, i64imm>; defm INT_PTX_ATOM_SUB_GEN_32 : F_ATOMIC_2_NEG; + atomic_load_sub_32_gen, i32imm>; defm INT_PTX_ATOM_SUB_GEN_32_USE_G : F_ATOMIC_2_NEG; + ".add", atomic_load_sub_32_gen, i32imm>; defm INT_PTX_ATOM_SUB_S_32 : F_ATOMIC_2_NEG; + atomic_load_sub_32_s, i32imm>; defm INT_PTX_ATOM_SUB_S_64 : F_ATOMIC_2_NEG; + atomic_load_sub_64_s, i64imm>; defm 
INT_PTX_ATOM_SUB_GEN_64 : F_ATOMIC_2_NEG; + atomic_load_sub_64_gen, i64imm>; defm INT_PTX_ATOM_SUB_GEN_64_USE_G : F_ATOMIC_2_NEG; + ".add", atomic_load_sub_64_gen, i64imm>; // atom_swap @@ -1209,21 +1210,21 @@ (atomic_swap_64 node:$a, node:$b)>; defm INT_PTX_ATOM_SWAP_G_32 : F_ATOMIC_2; + atomic_swap_32_g, i32imm, imm>; defm INT_PTX_ATOM_SWAP_S_32 : F_ATOMIC_2; + atomic_swap_32_s, i32imm, imm>; defm INT_PTX_ATOM_SWAP_GEN_32 : F_ATOMIC_2; + atomic_swap_32_gen, i32imm, imm>; defm INT_PTX_ATOM_SWAP_GEN_32_USE_G : F_ATOMIC_2; + ".exch", atomic_swap_32_gen, i32imm, imm>; defm INT_PTX_ATOM_SWAP_G_64 : F_ATOMIC_2; + atomic_swap_64_g, i64imm, imm>; defm INT_PTX_ATOM_SWAP_S_64 : F_ATOMIC_2; + atomic_swap_64_s, i64imm, imm>; defm INT_PTX_ATOM_SWAP_GEN_64 : F_ATOMIC_2; + atomic_swap_64_gen, i64imm, imm>; defm INT_PTX_ATOM_SWAP_GEN_64_USE_G : F_ATOMIC_2; + ".exch", atomic_swap_64_gen, i64imm, imm>; // atom_max @@ -1253,37 +1254,37 @@ (atomic_load_umax_64 node:$a, node:$b)>; defm INT_PTX_ATOM_LOAD_MAX_G_32 : F_ATOMIC_2; + ".max", atomic_load_max_32_g, i32imm, imm>; defm INT_PTX_ATOM_LOAD_MAX_S_32 : F_ATOMIC_2; + ".max", atomic_load_max_32_s, i32imm, imm>; defm INT_PTX_ATOM_LOAD_MAX_GEN_32 : F_ATOMIC_2; + atomic_load_max_32_gen, i32imm, imm>; defm INT_PTX_ATOM_LOAD_MAX_GEN_32_USE_G : F_ATOMIC_2; + ".s32", ".max", atomic_load_max_32_gen, i32imm, imm>; defm INT_PTX_ATOM_LOAD_MAX_G_64 : F_ATOMIC_2; + ".max", atomic_load_max_64_g, i64imm, imm>; defm INT_PTX_ATOM_LOAD_MAX_S_64 : F_ATOMIC_2; + ".max", atomic_load_max_64_s, i64imm, imm>; defm INT_PTX_ATOM_LOAD_MAX_GEN_64 : F_ATOMIC_2; + atomic_load_max_64_gen, i64imm, imm>; defm INT_PTX_ATOM_LOAD_MAX_GEN_64_USE_G : F_ATOMIC_2; + ".s64", ".max", atomic_load_max_64_gen, i64imm, imm>; defm INT_PTX_ATOM_LOAD_UMAX_G_32 : F_ATOMIC_2; + ".max", atomic_load_umax_32_g, i32imm, imm>; defm INT_PTX_ATOM_LOAD_UMAX_S_32 : F_ATOMIC_2; + ".max", atomic_load_umax_32_s, i32imm, imm>; defm INT_PTX_ATOM_LOAD_UMAX_GEN_32 : F_ATOMIC_2; + 
atomic_load_umax_32_gen, i32imm, imm>; defm INT_PTX_ATOM_LOAD_UMAX_GEN_32_USE_G : F_ATOMIC_2; + ".u32", ".max", atomic_load_umax_32_gen, i32imm, imm>; defm INT_PTX_ATOM_LOAD_UMAX_G_64 : F_ATOMIC_2; + ".max", atomic_load_umax_64_g, i64imm, imm>; defm INT_PTX_ATOM_LOAD_UMAX_S_64 : F_ATOMIC_2; + ".max", atomic_load_umax_64_s, i64imm, imm>; defm INT_PTX_ATOM_LOAD_UMAX_GEN_64 : F_ATOMIC_2; + atomic_load_umax_64_gen, i64imm, imm>; defm INT_PTX_ATOM_LOAD_UMAX_GEN_64_USE_G : F_ATOMIC_2; + ".u64", ".max", atomic_load_umax_64_gen, i64imm, imm>; // atom_min @@ -1313,37 +1314,37 @@ (atomic_load_umin_64 node:$a, node:$b)>; defm INT_PTX_ATOM_LOAD_MIN_G_32 : F_ATOMIC_2; + ".min", atomic_load_min_32_g, i32imm, imm>; defm INT_PTX_ATOM_LOAD_MIN_S_32 : F_ATOMIC_2; + ".min", atomic_load_min_32_s, i32imm, imm>; defm INT_PTX_ATOM_LOAD_MIN_GEN_32 : F_ATOMIC_2; + atomic_load_min_32_gen, i32imm, imm>; defm INT_PTX_ATOM_LOAD_MIN_GEN_32_USE_G : F_ATOMIC_2; + ".s32", ".min", atomic_load_min_32_gen, i32imm, imm>; defm INT_PTX_ATOM_LOAD_MIN_G_64 : F_ATOMIC_2; + ".min", atomic_load_min_64_g, i64imm, imm>; defm INT_PTX_ATOM_LOAD_MIN_S_64 : F_ATOMIC_2; + ".min", atomic_load_min_64_s, i64imm, imm>; defm INT_PTX_ATOM_LOAD_MIN_GEN_64 : F_ATOMIC_2; + atomic_load_min_64_gen, i64imm, imm>; defm INT_PTX_ATOM_LOAD_MIN_GEN_64_USE_G : F_ATOMIC_2; + ".s64", ".min", atomic_load_min_64_gen, i64imm, imm>; defm INT_PTX_ATOM_LOAD_UMIN_G_32 : F_ATOMIC_2; + ".min", atomic_load_umin_32_g, i32imm, imm>; defm INT_PTX_ATOM_LOAD_UMIN_S_32 : F_ATOMIC_2; + ".min", atomic_load_umin_32_s, i32imm, imm>; defm INT_PTX_ATOM_LOAD_UMIN_GEN_32 : F_ATOMIC_2; + atomic_load_umin_32_gen, i32imm, imm>; defm INT_PTX_ATOM_LOAD_UMIN_GEN_32_USE_G : F_ATOMIC_2; + ".u32", ".min", atomic_load_umin_32_gen, i32imm, imm>; defm INT_PTX_ATOM_LOAD_UMIN_G_64 : F_ATOMIC_2; + ".min", atomic_load_umin_64_g, i64imm, imm>; defm INT_PTX_ATOM_LOAD_UMIN_S_64 : F_ATOMIC_2; + ".min", atomic_load_umin_64_s, i64imm, imm>; defm INT_PTX_ATOM_LOAD_UMIN_GEN_64 : 
F_ATOMIC_2; + atomic_load_umin_64_gen, i64imm, imm>; defm INT_PTX_ATOM_LOAD_UMIN_GEN_64_USE_G : F_ATOMIC_2; + ".u64", ".min", atomic_load_umin_64_gen, i64imm, imm>; // atom_inc atom_dec @@ -1361,21 +1362,21 @@ (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>; defm INT_PTX_ATOM_INC_G_32 : F_ATOMIC_2; + atomic_load_inc_32_g, i32imm, imm>; defm INT_PTX_ATOM_INC_S_32 : F_ATOMIC_2; + atomic_load_inc_32_s, i32imm, imm>; defm INT_PTX_ATOM_INC_GEN_32 : F_ATOMIC_2; + atomic_load_inc_32_gen, i32imm, imm>; defm INT_PTX_ATOM_INC_GEN_32_USE_G : F_ATOMIC_2; + ".inc", atomic_load_inc_32_gen, i32imm, imm>; defm INT_PTX_ATOM_DEC_G_32 : F_ATOMIC_2; + atomic_load_dec_32_g, i32imm, imm>; defm INT_PTX_ATOM_DEC_S_32 : F_ATOMIC_2; + atomic_load_dec_32_s, i32imm, imm>; defm INT_PTX_ATOM_DEC_GEN_32 : F_ATOMIC_2; + atomic_load_dec_32_gen, i32imm, imm>; defm INT_PTX_ATOM_DEC_GEN_32_USE_G : F_ATOMIC_2; + ".dec", atomic_load_dec_32_gen, i32imm, imm>; // atom_and @@ -1393,21 +1394,21 @@ (atomic_load_and_64 node:$a, node:$b)>; defm INT_PTX_ATOM_AND_G_32 : F_ATOMIC_2; + atomic_load_and_32_g, i32imm, imm>; defm INT_PTX_ATOM_AND_S_32 : F_ATOMIC_2; + atomic_load_and_32_s, i32imm, imm>; defm INT_PTX_ATOM_AND_GEN_32 : F_ATOMIC_2; + atomic_load_and_32_gen, i32imm, imm>; defm INT_PTX_ATOM_AND_GEN_32_USE_G : F_ATOMIC_2; + ".and", atomic_load_and_32_gen, i32imm, imm>; defm INT_PTX_ATOM_AND_G_64 : F_ATOMIC_2; + atomic_load_and_64_g, i64imm, imm>; defm INT_PTX_ATOM_AND_S_64 : F_ATOMIC_2; + atomic_load_and_64_s, i64imm, imm>; defm INT_PTX_ATOM_AND_GEN_64 : F_ATOMIC_2; + atomic_load_and_64_gen, i64imm, imm>; defm INT_PTX_ATOM_AND_GEN_64_USE_G : F_ATOMIC_2; + ".and", atomic_load_and_64_gen, i64imm, imm>; // atom_or @@ -1425,21 +1426,21 @@ (atomic_load_or_64 node:$a, node:$b)>; defm INT_PTX_ATOM_OR_G_32 : F_ATOMIC_2; + atomic_load_or_32_g, i32imm, imm>; defm INT_PTX_ATOM_OR_GEN_32 : F_ATOMIC_2; + atomic_load_or_32_gen, i32imm, imm>; defm INT_PTX_ATOM_OR_GEN_32_USE_G : F_ATOMIC_2; + ".or", atomic_load_or_32_gen, 
i32imm, imm>; defm INT_PTX_ATOM_OR_S_32 : F_ATOMIC_2; + atomic_load_or_32_s, i32imm, imm>; defm INT_PTX_ATOM_OR_G_64 : F_ATOMIC_2; + atomic_load_or_64_g, i64imm, imm>; defm INT_PTX_ATOM_OR_GEN_64 : F_ATOMIC_2; + atomic_load_or_64_gen, i64imm, imm>; defm INT_PTX_ATOM_OR_GEN_64_USE_G : F_ATOMIC_2; + ".or", atomic_load_or_64_gen, i64imm, imm>; defm INT_PTX_ATOM_OR_S_64 : F_ATOMIC_2; + atomic_load_or_64_s, i64imm, imm>; // atom_xor @@ -1457,21 +1458,21 @@ (atomic_load_xor_64 node:$a, node:$b)>; defm INT_PTX_ATOM_XOR_G_32 : F_ATOMIC_2; + atomic_load_xor_32_g, i32imm, imm>; defm INT_PTX_ATOM_XOR_S_32 : F_ATOMIC_2; + atomic_load_xor_32_s, i32imm, imm>; defm INT_PTX_ATOM_XOR_GEN_32 : F_ATOMIC_2; + atomic_load_xor_32_gen, i32imm, imm>; defm INT_PTX_ATOM_XOR_GEN_32_USE_G : F_ATOMIC_2; + ".xor", atomic_load_xor_32_gen, i32imm, imm>; defm INT_PTX_ATOM_XOR_G_64 : F_ATOMIC_2; + atomic_load_xor_64_g, i64imm, imm>; defm INT_PTX_ATOM_XOR_S_64 : F_ATOMIC_2; + atomic_load_xor_64_s, i64imm, imm>; defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2; + atomic_load_xor_64_gen, i64imm, imm>; defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2; + ".xor", atomic_load_xor_64_gen, i64imm, imm>; // atom_cas @@ -1489,21 +1490,21 @@ (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>; defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3; + atomic_cmp_swap_32_g, i32imm>; defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3; + atomic_cmp_swap_32_s, i32imm>; defm INT_PTX_ATOM_CAS_GEN_32 : F_ATOMIC_3; + atomic_cmp_swap_32_gen, i32imm>; defm INT_PTX_ATOM_CAS_GEN_32_USE_G : F_ATOMIC_3; + ".cas", atomic_cmp_swap_32_gen, i32imm>; defm INT_PTX_ATOM_CAS_G_64 : F_ATOMIC_3; + atomic_cmp_swap_64_g, i64imm>; defm INT_PTX_ATOM_CAS_S_64 : F_ATOMIC_3; + atomic_cmp_swap_64_s, i64imm>; defm INT_PTX_ATOM_CAS_GEN_64 : F_ATOMIC_3; + atomic_cmp_swap_64_gen, i64imm>; defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3; + ".cas", atomic_cmp_swap_64_gen, i64imm>; // Support for scoped atomic operations. 
Matches // int_nvvm_atomic_{op}_{space}_{type}_{scope} @@ -1654,7 +1655,7 @@ defm _u32 : ATOM2S_impl; defm _u64 : ATOM2S_impl; defm _f32 : ATOM2S_impl; + []>; defm _f64 : ATOM2S_impl; } @@ -1936,55 +1937,18 @@ multiclass NG_TO_G { def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), !strconcat("cvta.", Str, ".u32 \t$result, $src;"), - [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>, - Requires<[hasGenericLdSt]>; + [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>; def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), !strconcat("cvta.", Str, ".u64 \t$result, $src;"), - [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>, - Requires<[hasGenericLdSt]>; - -// @TODO: Are these actually needed? I believe global addresses will be copied -// to register values anyway. - /*def __addr_yes : NVPTXInst<(outs Int32Regs:$result), (ins imemAny:$src), - !strconcat("cvta.", !strconcat(Str, ".u32 \t$result, $src;")), - [(set Int32Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>, - Requires<[hasGenericLdSt]>; - def __addr_yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins imemAny:$src), - !strconcat("cvta.", !strconcat(Str, ".u64 \t$result, $src;")), - [(set Int64Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>, - Requires<[hasGenericLdSt]>;*/ - - def _no : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), - "mov.u32 \t$result, $src;", - [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>; - def _no_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), - "mov.u64 \t$result, $src;", [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>; - -// @TODO: Are these actually needed? I believe global addresses will be copied -// to register values anyway. 
- /*def _addr_no : NVPTXInst<(outs Int32Regs:$result), (ins imem:$src), - "mov.u32 \t$result, $src;", - [(set Int32Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>; - def _addr_no_64 : NVPTXInst<(outs Int64Regs:$result), (ins imem:$src), - "mov.u64 \t$result, $src;", - [(set Int64Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>;*/ } multiclass G_TO_NG { def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), !strconcat("cvta.to.", Str, ".u32 \t$result, $src;"), - [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>, - Requires<[hasGenericLdSt]>; + [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>; def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), !strconcat("cvta.to.", Str, ".u64 \t$result, $src;"), - [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>, - Requires<[hasGenericLdSt]>; - def _no : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), - "mov.u32 \t$result, $src;", - [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>; - def _no_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), - "mov.u64 \t$result, $src;", [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>; } Index: llvm/lib/Target/NVPTX/NVPTXSubtarget.h =================================================================== --- llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -73,33 +73,13 @@ return &TSInfo; } - bool hasBrkPt() const { return SmVersion >= 11; } - bool hasAtomRedG32() const { return SmVersion >= 11; } - bool hasAtomRedS32() const { return SmVersion >= 12; } - bool hasAtomRedG64() const { return SmVersion >= 12; } - bool hasAtomRedS64() const { return SmVersion >= 20; } - bool hasAtomRedGen32() const { return SmVersion >= 20; } - bool hasAtomRedGen64() const { return SmVersion >= 20; } - bool hasAtomAddF32() const { return SmVersion >= 20; } bool hasAtomAddF64() const { return SmVersion >= 60; } bool hasAtomScope() const { return HasAtomScope; } bool hasAtomBitwise64() const { return SmVersion 
>= 32; } bool hasAtomMinMax64() const { return SmVersion >= 32; } - bool hasVote() const { return SmVersion >= 12; } - bool hasDouble() const { return SmVersion >= 13; } - bool reqPTX20() const { return SmVersion >= 20; } - bool hasF32FTZ() const { return SmVersion >= 20; } - bool hasFMAF32() const { return SmVersion >= 20; } - bool hasFMAF64() const { return SmVersion >= 13; } bool hasLDG() const { return SmVersion >= 32; } bool hasLDU() const { return ((SmVersion >= 20) && (SmVersion < 30)); } - bool hasGenericLdSt() const { return SmVersion >= 20; } inline bool hasHWROT32() const { return SmVersion >= 32; } - inline bool hasSWROT32() const { - return ((SmVersion >= 20) && (SmVersion < 32)); - } - inline bool hasROT32() const { return hasHWROT32() || hasSWROT32(); } - inline bool hasROT64() const { return SmVersion >= 20; } bool hasImageHandles() const; bool hasFP16Math() const { return SmVersion >= 53; } bool allowFP16Math() const;