Index: llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
===================================================================
--- llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -16,6 +16,7 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/Support/AtomicOrdering.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -81,10 +82,12 @@
 
   switch (N->getOpcode()) {
   case ISD::LOAD:
+  case ISD::ATOMIC_LOAD:
     if (tryLoad(N))
       return;
     break;
   case ISD::STORE:
+  case ISD::ATOMIC_STORE:
     if (tryStore(N))
       return;
     break;
@@ -834,17 +837,27 @@
 
 bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   SDLoc dl(N);
-  LoadSDNode *LD = cast<LoadSDNode>(N);
+  MemSDNode *LD = cast<MemSDNode>(N);
+  assert(LD->readMem() && "Expected load");
+  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
   EVT LoadedVT = LD->getMemoryVT();
   SDNode *NVPTXLD = nullptr;
 
   // do not support pre/post inc/dec
-  if (LD->isIndexed())
+  if (PlainLoad && PlainLoad->isIndexed())
     return false;
 
   if (!LoadedVT.isSimple())
     return false;
 
+  AtomicOrdering Ordering = LD->getOrdering();
+  // In order to lower atomic loads with stronger guarantees we would need to
+  // use load.acquire or insert fences. However, these features were only added
+  // with PTX ISA 6.0 / sm_70.
+  // TODO: Check if we can actually use the new instructions and implement them.
+  if (isStrongerThanMonotonic(Ordering))
+    return false;
+
   // Address Space Setting
   unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
   if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
@@ -855,8 +868,9 @@
       CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
 
   // Volatile Setting
-  // - .volatile is only availalble for .global and .shared
-  bool isVolatile = LD->isVolatile();
+  // - .volatile is only available for .global and .shared
+  // - .volatile has the same memory synchronization semantics as .relaxed.sys
+  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
   if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
       CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
       CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
@@ -882,7 +896,7 @@
     fromTypeWidth = 32;
   }
 
-  if ((LD->getExtensionType() == ISD::SEXTLOAD))
+  if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
     fromType = NVPTX::PTXLdStInstCode::Signed;
   else if (ScalarVT.isFloatingPoint())
     // f16 uses .b16 as its storage type.
@@ -1691,25 +1705,38 @@
 
 bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
   SDLoc dl(N);
-  StoreSDNode *ST = cast<StoreSDNode>(N);
+  MemSDNode *ST = cast<MemSDNode>(N);
+  assert(ST->writeMem() && "Expected store");
+  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
+  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
+  assert((PlainStore || AtomicStore) && "Expected store");
   EVT StoreVT = ST->getMemoryVT();
   SDNode *NVPTXST = nullptr;
 
   // do not support pre/post inc/dec
-  if (ST->isIndexed())
+  if (PlainStore && PlainStore->isIndexed())
     return false;
 
   if (!StoreVT.isSimple())
     return false;
 
+  AtomicOrdering Ordering = ST->getOrdering();
+  // In order to lower atomic stores with stronger guarantees we would need to
+  // use store.release or insert fences. However, these features were only added
+  // with PTX ISA 6.0 / sm_70.
+  // TODO: Check if we can actually use the new instructions and implement them.
+  if (isStrongerThanMonotonic(Ordering))
+    return false;
+
   // Address Space Setting
   unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
   unsigned int PointerSize =
       CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
 
   // Volatile Setting
-  // - .volatile is only availalble for .global and .shared
-  bool isVolatile = ST->isVolatile();
+  // - .volatile is only available for .global and .shared
+  // - .volatile has the same memory synchronization semantics as .relaxed.sys
+  bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
   if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
       CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
       CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
@@ -1739,41 +1766,53 @@
     toType = NVPTX::PTXLdStInstCode::Unsigned;
 
   // Create the machine instruction DAG
-  SDValue Chain = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-  SDValue N2 = N->getOperand(2);
+  SDValue Chain = ST->getChain();
+  SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
+  SDValue BasePtr = ST->getBasePtr();
   SDValue Addr;
   SDValue Offset, Base;
   Optional<unsigned> Opcode;
-  MVT::SimpleValueType SourceVT = N1.getNode()->getSimpleValueType(0).SimpleTy;
+  MVT::SimpleValueType SourceVT =
+      Value.getNode()->getSimpleValueType(0).SimpleTy;
 
-  if (SelectDirectAddr(N2, Addr)) {
+  if (SelectDirectAddr(BasePtr, Addr)) {
     Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
                              NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
                              NVPTX::ST_f16_avar, NVPTX::ST_f16x2_avar,
                              NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
     if (!Opcode)
       return false;
-    SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
-                      getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl),
-                      getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Addr,
-                      Chain };
+    SDValue Ops[] = {Value,
+                     getI32Imm(isVolatile, dl),
+                     getI32Imm(CodeAddrSpace, dl),
+                     getI32Imm(vecType, dl),
+                     getI32Imm(toType, dl),
+                     getI32Imm(toTypeWidth, dl),
+                     Addr,
+                     Chain};
     NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
-  } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
-                               : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
+  } else if (PointerSize == 64
+                 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
+                 : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
     Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
                              NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
                              NVPTX::ST_f16_asi, NVPTX::ST_f16x2_asi,
                              NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
     if (!Opcode)
       return false;
-    SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
-                      getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl),
-                      getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base,
-                      Offset, Chain };
+    SDValue Ops[] = {Value,
+                     getI32Imm(isVolatile, dl),
+                     getI32Imm(CodeAddrSpace, dl),
+                     getI32Imm(vecType, dl),
+                     getI32Imm(toType, dl),
+                     getI32Imm(toTypeWidth, dl),
+                     Base,
+                     Offset,
+                     Chain};
     NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
-  } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
-                               : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
+  } else if (PointerSize == 64
+                 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
+                 : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
     if (PointerSize == 64)
       Opcode = pickOpcodeForVT(
           SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
@@ -1787,10 +1826,15 @@
                                NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
     if (!Opcode)
       return false;
-    SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
-                      getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl),
-                      getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base,
-                      Offset, Chain };
+    SDValue Ops[] = {Value,
+                     getI32Imm(isVolatile, dl),
+                     getI32Imm(CodeAddrSpace, dl),
+                     getI32Imm(vecType, dl),
+                     getI32Imm(toType, dl),
+                     getI32Imm(toTypeWidth, dl),
+                     Base,
+                     Offset,
+                     Chain};
     NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
   } else {
     if (PointerSize == 64)
@@ -1806,10 +1850,14 @@
                                NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
     if (!Opcode)
       return false;
-    SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
-                      getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl),
-                      getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), N2,
-                      Chain };
+    SDValue Ops[] = {Value,
+                     getI32Imm(isVolatile, dl),
+                     getI32Imm(CodeAddrSpace, dl),
+                     getI32Imm(vecType, dl),
+                     getI32Imm(toType, dl),
+                     getI32Imm(toTypeWidth, dl),
+                     BasePtr,
+                     Chain};
     NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
   }
 
Index: llvm/trunk/test/CodeGen/NVPTX/load-store.ll
===================================================================
--- llvm/trunk/test/CodeGen/NVPTX/load-store.ll
+++ llvm/trunk/test/CodeGen/NVPTX/load-store.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+; CHECK-LABEL: plain
+define void @plain(i8* %a, i16* %b, i32* %c, i64* %d) local_unnamed_addr {
+  ; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load i8, i8* %a
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store i8 %a.add, i8* %a
+
+  ; CHECK: ld.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load i16, i16* %b
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store i16 %b.add, i16* %b
+
+  ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load i32, i32* %c
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store i32 %c.add, i32* %c
+
+  ; CHECK: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load i64, i64* %d
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store i64 %d.add, i64* %d
+
+  ret void
+}
+
+; CHECK-LABEL: volatile
+define void @volatile(i8* %a, i16* %b, i32* %c, i64* %d) local_unnamed_addr {
+  ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load volatile i8, i8* %a
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store volatile i8 %a.add, i8* %a
+
+  ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load volatile i16, i16* %b
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store volatile i16 %b.add, i16* %b
+
+  ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load volatile i32, i32* %c
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store volatile i32 %c.add, i32* %c
+
+  ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load volatile i64, i64* %d
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store volatile i64 %d.add, i64* %d
+
+  ret void
+}
+
+; CHECK-LABEL: monotonic
+define void @monotonic(i8* %a, i16* %b, i32* %c, i64* %d) local_unnamed_addr {
+  ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, i8* %a monotonic, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, i8* %a monotonic, align 1
+
+  ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, i16* %b monotonic, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, i16* %b monotonic, align 2
+
+  ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, i32* %c monotonic, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, i32* %c monotonic, align 4
+
+  ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, i64* %d monotonic, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, i64* %d monotonic, align 8
+
+  ret void
+}
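
The test above only exercises orderings up to monotonic, which is exactly the
boundary that the isStrongerThanMonotonic() checks in tryLoad()/tryStore()
enforce. For reference, a minimal IR sketch of that boundary (not part of the
patch; the function name @acquire_example is hypothetical):

; A monotonic access is the strongest ordering this patch lowers; it is emitted
; as plain ld.volatile/st.volatile, as checked in @monotonic above. For any
; stronger ordering, tryLoad()/tryStore() return false, since a correct
; lowering would need ld.acquire/st.release or fences, which only exist as of
; PTX ISA 6.0 / sm_70.
define i32 @acquire_example(i32* %p) {
  %v = load atomic i32, i32* %p monotonic, align 4 ; handled: ld.volatile.u32
  %w = load atomic i32, i32* %p acquire, align 4   ; rejected by this fast path
  %s = add i32 %v, %w
  store atomic i32 %s, i32* %p monotonic, align 4  ; handled: st.volatile.u32
  ret i32 %s
}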