Index: llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp =================================================================== --- llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1286,7 +1286,7 @@ MemSDNode *Mem; bool IsLDG = true; - // If this is an LDG intrinsic, the address is the third operand. Its its an + // If this is an LDG intrinsic, the address is the third operand. If its an // LDG/LDU SD node (from custom vector handling), then its the second operand if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) { Op1 = N->getOperand(2); @@ -1317,10 +1317,23 @@ SDValue Base, Offset, Addr; EVT EltVT = Mem->getMemoryVT(); + unsigned NumElts = 1; if (EltVT.isVector()) { + NumElts = EltVT.getVectorNumElements(); EltVT = EltVT.getVectorElementType(); } + // Build the "promoted" result VTList for the load. If we are really loading + // i8s, then the return type will be promoted to i16 since we do not expose + // 8-bit registers in NVPTX. + EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT; + SmallVector InstVTs; + for (unsigned i = 0; i != NumElts; ++i) { + InstVTs.push_back(NodeVT); + } + InstVTs.push_back(MVT::Other); + SDVTList InstVTList = CurDAG->getVTList(InstVTs); + if (SelectDirectAddr(Op1, Addr)) { switch (N->getOpcode()) { default: @@ -1461,7 +1474,7 @@ } SDValue Ops[] = { Addr, Chain }; - LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops); + LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops); } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset) : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) { if (TM.is64Bit()) { @@ -1750,7 +1763,7 @@ SDValue Ops[] = { Base, Offset, Chain }; - LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops); + LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops); } else { if (TM.is64Bit()) { switch (N->getOpcode()) { @@ -2037,13 +2050,77 @@ } SDValue Ops[] = { Op1, Chain }; - LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops); + LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops); } MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); MemRefs0[0] = Mem->getMemOperand(); cast(LD)->setMemRefs(MemRefs0, MemRefs0 + 1); + // For automatic generation of LDG (through SelectLoad[Vector], not the + // intrinsics), we may have an extending load like: + // + // i32,ch = load t0, t7, undef:i64 + // + // Since we load an i8 value, the matching logic above will have selected an + // LDG instruction that reads i8 and stores it in an i16 register (NVPTX does + // not expose 8-bit registers): + // + // i16,ch = INT_PTX_LDG_GLOBAL_i8areg64 t7, t0 + // + // To get the correct type in this case, truncate back to i8 and then extend + // to the original load type. + EVT OrigType = N->getValueType(0); + LoadSDNode *LDSD = dyn_cast(N); + if (LDSD && EltVT == MVT::i8 && OrigType.getScalarSizeInBits() >= 32) { + unsigned CvtOpc = 0; + + switch (LDSD->getExtensionType()) { + default: + llvm_unreachable("An extension is required for i8 loads"); + break; + case ISD::SEXTLOAD: + switch (OrigType.getSimpleVT().SimpleTy) { + default: + llvm_unreachable("Unhandled integer load type"); + break; + case MVT::i32: + CvtOpc = NVPTX::CVT_s32_s8; + break; + case MVT::i64: + CvtOpc = NVPTX::CVT_s64_s8; + break; + } + break; + case ISD::EXTLOAD: + case ISD::ZEXTLOAD: + switch (OrigType.getSimpleVT().SimpleTy) { + default: + llvm_unreachable("Unhandled integer load type"); + break; + case MVT::i32: + CvtOpc = NVPTX::CVT_u32_u8; + break; + case MVT::i64: + CvtOpc = NVPTX::CVT_u64_u8; + break; + } + break; + } + + // For each output value, truncate to i8 (since the upper 8 bits are + // undefined) and then extend to the desired type. + for (unsigned i = 0; i != NumElts; ++i) { + SDValue Res(LD, i); + SDValue OrigVal(N, i); + + SDNode *CvtNode = + CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res, + CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32)); + ReplaceUses(OrigVal, SDValue(CvtNode, 0)); + } + } + return LD; } Index: llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td =================================================================== --- llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td +++ llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -319,6 +319,16 @@ // takes a CvtMode immediate that defines the conversion mode to use. It can // be CvtNONE to omit a conversion mode. multiclass CVT_FROM_ALL { + def _s8 : + NVPTXInst<(outs RC:$dst), + (ins Int16Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", + FromName, ".s8\t$dst, $src;"), []>; + def _u8 : + NVPTXInst<(outs RC:$dst), + (ins Int16Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", + FromName, ".u8\t$dst, $src;"), []>; def _s16 : NVPTXInst<(outs RC:$dst), (ins Int16Regs:$src, CvtMode:$mode), Index: llvm/trunk/test/CodeGen/NVPTX/bug26185.ll =================================================================== --- llvm/trunk/test/CodeGen/NVPTX/bug26185.ll +++ llvm/trunk/test/CodeGen/NVPTX/bug26185.ll @@ -0,0 +1,57 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck %s + +; Verify that we correctly emit code for i8 ldg/ldu. We do not expose 8-bit +; registers in the backend, so these loads need special handling. + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-unknown-unknown" + +; CHECK-LABEL: ex_zext +define void @ex_zext(i8* noalias readonly %data, i32* %res) { +entry: +; CHECK: ld.global.nc.u8 + %val = load i8, i8* %data +; CHECK: cvt.u32.u8 + %valext = zext i8 %val to i32 + store i32 %valext, i32* %res + ret void +} + +; CHECK-LABEL: ex_sext +define void @ex_sext(i8* noalias readonly %data, i32* %res) { +entry: +; CHECK: ld.global.nc.u8 + %val = load i8, i8* %data +; CHECK: cvt.s32.s8 + %valext = sext i8 %val to i32 + store i32 %valext, i32* %res + ret void +} + +; CHECK-LABEL: ex_zext_v2 +define void @ex_zext_v2(<2 x i8>* noalias readonly %data, <2 x i32>* %res) { +entry: +; CHECK: ld.global.nc.v2.u8 + %val = load <2 x i8>, <2 x i8>* %data +; CHECK: cvt.u32.u16 + %valext = zext <2 x i8> %val to <2 x i32> + store <2 x i32> %valext, <2 x i32>* %res + ret void +} + +; CHECK-LABEL: ex_sext_v2 +define void @ex_sext_v2(<2 x i8>* noalias readonly %data, <2 x i32>* %res) { +entry: +; CHECK: ld.global.nc.v2.u8 + %val = load <2 x i8>, <2 x i8>* %data +; CHECK: cvt.s32.s8 + %valext = sext <2 x i8> %val to <2 x i32> + store <2 x i32> %valext, <2 x i32>* %res + ret void +} + +!nvvm.annotations = !{!0,!1,!2,!3} +!0 = !{void (i8*, i32*)* @ex_zext, !"kernel", i32 1} +!1 = !{void (i8*, i32*)* @ex_sext, !"kernel", i32 1} +!2 = !{void (<2 x i8>*, <2 x i32>*)* @ex_zext_v2, !"kernel", i32 1} +!3 = !{void (<2 x i8>*, <2 x i32>*)* @ex_sext_v2, !"kernel", i32 1}