Index: lib/Target/NVPTX/NVPTXISelDAGToDAG.h
===================================================================
--- lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -93,6 +93,7 @@
   bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const;
 
+  unsigned GetConvertOpcode(MVT DestTy, MVT SrcTy, bool IsSigned);
 };
 
 } // end namespace llvm
 
Index: lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -2062,61 +2062,33 @@
   //
   // i32,ch = load t0, t7, undef:i64
   //
-  // Since we load an i8 value, the matching logic above will have selected an
-  // LDG instruction that reads i8 and stores it in an i16 register (NVPTX does
-  // not expose 8-bit registers):
-  //
-  // i16,ch = INT_PTX_LDG_GLOBAL_i8areg64 t7, t0
-  //
-  // To get the correct type in this case, truncate back to i8 and then extend
-  // to the original load type.
-  EVT OrigType = N->getValueType(0);
-  LoadSDNode *LDSD = dyn_cast<LoadSDNode>(N);
-  if (LDSD && EltVT == MVT::i8 && OrigType.getScalarSizeInBits() >= 32) {
-    unsigned CvtOpc = 0;
-
-    switch (LDSD->getExtensionType()) {
-    default:
-      llvm_unreachable("An extension is required for i8 loads");
-      break;
-    case ISD::SEXTLOAD:
-      switch (OrigType.getSimpleVT().SimpleTy) {
-      default:
-        llvm_unreachable("Unhandled integer load type");
-        break;
-      case MVT::i32:
-        CvtOpc = NVPTX::CVT_s32_s8;
-        break;
-      case MVT::i64:
-        CvtOpc = NVPTX::CVT_s64_s8;
-        break;
-      }
-      break;
-    case ISD::EXTLOAD:
-    case ISD::ZEXTLOAD:
-      switch (OrigType.getSimpleVT().SimpleTy) {
-      default:
-        llvm_unreachable("Unhandled integer load type");
-        break;
-      case MVT::i32:
-        CvtOpc = NVPTX::CVT_u32_u8;
-        break;
-      case MVT::i64:
-        CvtOpc = NVPTX::CVT_u64_u8;
-        break;
-      }
-      break;
-    }
+  // In this case, the matching logic above will select a load for the original
+  // memory type (here, i8), so our types will not match (the node needs to
+  // return an i32). Our LDG/LDU nodes do not support the concept of
+  // sign-/zero-extension, so emulate it here by adding an explicit CVT
+  // instruction. Ptxas should clean up any redundancies here.
 
-    // For each output value, truncate to i8 (since the upper 8 bits are
-    // undefined) and then extend to the desired type.
+  EVT OrigType = N->getValueType(0);
+  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
+
+  if (OrigType != EltVT && LdNode) {
+    // We have an extending-load. The instruction we selected operates on the
+    // smaller type, but the SDNode we are replacing has the larger type. We
+    // need to emit a CVT to make the types match.
+    bool IsSigned = LdNode->getExtensionType() == ISD::SEXTLOAD;
+    unsigned CvtOpc = GetConvertOpcode(OrigType.getSimpleVT(),
+                                       EltVT.getSimpleVT(), IsSigned);
+
+    // For each output value, apply the manual sign/zero-extension and make
+    // sure all users of the load go through that CVT.
     for (unsigned i = 0; i != NumElts; ++i) {
       SDValue Res(LD, i);
       SDValue OrigVal(N, i);
 
       SDNode *CvtNode = CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
-          CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32));
+          CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
+                                    DL, MVT::i32));
       ReplaceUses(OrigVal, SDValue(CvtNode, 0));
     }
   }
@@ -5199,3 +5171,82 @@
   }
   return true;
 }
+
+/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
+/// conversion from \p SrcTy to \p DestTy.
+unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
+                                             bool IsSigned) {
+  unsigned CvtOpc = 0;
+
+  switch (SrcTy.SimpleTy) {
+  default:
+    llvm_unreachable("Unhandled source type");
+    break;
+  case MVT::i8:
+    switch (DestTy.SimpleTy) {
+    default:
+      llvm_unreachable("Unhandled dest type");
+      break;
+    case MVT::i16:
+      CvtOpc = IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
+      break;
+    case MVT::i32:
+      CvtOpc = IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
+      break;
+    case MVT::i64:
+      CvtOpc = IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
+      break;
+    }
+    break;
+  case MVT::i16:
+    switch (DestTy.SimpleTy) {
+    default:
+      llvm_unreachable("Unhandled dest type");
+      break;
+    case MVT::i8:
+      CvtOpc = IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
+      break;
+    case MVT::i32:
+      CvtOpc = IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
+      break;
+    case MVT::i64:
+      CvtOpc = IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
+      break;
+    }
+    break;
+  case MVT::i32:
+    switch (DestTy.SimpleTy) {
+    default:
+      llvm_unreachable("Unhandled dest type");
+      break;
+    case MVT::i8:
+      CvtOpc = IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
+      break;
+    case MVT::i16:
+      CvtOpc = IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
+      break;
+    case MVT::i64:
+      CvtOpc = IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
+      break;
+    }
+    break;
+  case MVT::i64:
+    switch (DestTy.SimpleTy) {
+    default:
+      llvm_unreachable("Unhandled dest type");
+      break;
+    case MVT::i8:
+      CvtOpc = IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
+      break;
+    case MVT::i16:
+      CvtOpc = IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
+      break;
+    case MVT::i32:
+      CvtOpc = IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
+      break;
+    }
+    break;
+  }
+
+  return CvtOpc;
+}
Index: lib/Target/NVPTX/NVPTXInstrInfo.td
===================================================================
--- lib/Target/NVPTX/NVPTXInstrInfo.td
+++ lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -377,6 +377,8 @@
 }
 
 // Generate cvts from all types to all types.
+defm CVT_s8  : CVT_FROM_ALL<"s8", Int16Regs>;
+defm CVT_u8  : CVT_FROM_ALL<"u8", Int16Regs>;
 defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
 defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
 defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;
Index: test/CodeGen/NVPTX/bug26185-2.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NVPTX/bug26185-2.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s
+
+; Verify that we correctly emit code for extending ldg/ldu. We do not expose
+; extending variants in the backend, but the ldg/ldu selection code may pick
+; extending loads as candidates. We do want to support this, so make sure we
+; emit the necessary cvt.* instructions to implement the extension and let
+; ptxas emit the real extending loads.
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: spam
+define ptx_kernel void @spam(i8 addrspace(1)* noalias nocapture readonly %arg, i8 addrspace(1)* noalias nocapture %arg1, i64 %arg2, i64 %arg3) #0 {
+bb:
+  %tmp = bitcast i8 addrspace(1)* %arg to i16 addrspace(1)*
+  %tmp4 = bitcast i8 addrspace(1)* %arg1 to i64 addrspace(1)*
+  %tmp5 = add nsw i64 %arg3, 8
+  %tmp6 = getelementptr i16, i16 addrspace(1)* %tmp, i64 %tmp5
+; CHECK: ld.global.nc.u16
+  %tmp7 = load i16, i16 addrspace(1)* %tmp6, align 2
+; CHECK: cvt.s32.s16
+  %tmp8 = sext i16 %tmp7 to i64
+  %tmp9 = mul nsw i64 %tmp8, %tmp8
+  %tmp10 = load i64, i64 addrspace(1)* %tmp4, align 8
+  %tmp11 = add nsw i64 %tmp9, %tmp10
+  store i64 %tmp11, i64 addrspace(1)* %tmp4, align 8
+  ret void
+}
+
+attributes #0 = { norecurse nounwind "polly.skip.fn" }
+
+!nvvm.annotations = !{!0}
+
+!0 = !{void (i8 addrspace(1)*, i8 addrspace(1)*, i64, i64)* @spam, !"maxntidx", i64 1, !"maxntidy", i64 1, !"maxntidz", i64 1}
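
Illustrative note (not part of the patch): GetConvertOpcode is a pure table
lookup from (SrcTy, DestTy, signedness) to the CVT_<dest>_<src> opcode,
mirroring PTX's cvt.<dest>.<src> mnemonics; a sign-extending i16 load consumed
as i32 yields cvt.s32.s16, while zero-/any-extending loads use the u-forms.
Below is a minimal standalone C++ sketch of the same mapping, with a
hypothetical Ty enum standing in for LLVM's MVT:

    #include <cassert>
    #include <cstdio>
    #include <string>

    // Hypothetical stand-in for MVT::i8..i64; the real code switches on
    // MVT::SimpleValueType instead.
    enum class Ty { I8, I16, I32, I64 };

    // Returns the s8/u16-style type suffix used in PTX cvt mnemonics.
    static const char *suffix(Ty T, bool Signed) {
      switch (T) {
      case Ty::I8:  return Signed ? "s8"  : "u8";
      case Ty::I16: return Signed ? "s16" : "u16";
      case Ty::I32: return Signed ? "s32" : "u32";
      case Ty::I64: return Signed ? "s64" : "u64";
      }
      assert(false && "unhandled type");
      return "";
    }

    // Mirrors the shape of GetConvertOpcode: select the conversion from
    // SrcTy to DestTy, signed or unsigned on both sides depending on the
    // extension kind of the load being emulated.
    static std::string cvtMnemonic(Ty DestTy, Ty SrcTy, bool IsSigned) {
      assert(DestTy != SrcTy && "no conversion needed");
      return std::string("cvt.") + suffix(DestTy, IsSigned) + "." +
             suffix(SrcTy, IsSigned);
    }

    int main() {
      // SEXTLOAD i16 consumed as i32 -> "cvt.s32.s16"
      std::printf("%s\n", cvtMnemonic(Ty::I32, Ty::I16, true).c_str());
      // ZEXTLOAD/EXTLOAD i8 consumed as i64 -> "cvt.u64.u8"
      std::printf("%s\n", cvtMnemonic(Ty::I64, Ty::I8, false).c_str());
    }

On the TableGen side, the widening conversions out of i8 (CVT_s16_s8,
CVT_s32_s8, ...) already existed via the CVT_s16/CVT_s32/... multiclasses; the
new CVT_s8/CVT_u8 rows add the conversions *to* 8-bit, so every pair the
helper can return resolves to a real instruction.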