diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -97,7 +97,7 @@
   bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const;

-  static unsigned GetConvertOpcode(MVT DestTy, MVT SrcTy, bool IsSigned);
+  static unsigned GetConvertOpcode(MVT DestTy, MVT SrcTy, LoadSDNode *N);
 };

 } // end namespace llvm

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1599,13 +1599,13 @@
   EVT OrigType = N->getValueType(0);
   LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);

-  if (OrigType != EltVT && LdNode) {
+  if (OrigType != EltVT &&
+      (LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) {
     // We have an extending-load. The instruction we selected operates on the
     // smaller type, but the SDNode we are replacing has the larger type. We
     // need to emit a CVT to make the types match.
-    bool IsSigned = LdNode->getExtensionType() == ISD::SEXTLOAD;
-    unsigned CvtOpc = GetConvertOpcode(OrigType.getSimpleVT(),
-                                       EltVT.getSimpleVT(), IsSigned);
+    unsigned CvtOpc =
+        GetConvertOpcode(OrigType.getSimpleVT(), EltVT.getSimpleVT(), LdNode);

     // For each output value, apply the manual sign/zero-extension and make sure
     // all users of the load go through that CVT.
@@ -3601,7 +3601,8 @@
 /// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
 /// conversion from \p SrcTy to \p DestTy.
 unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
-                                             bool IsSigned) {
+                                             LoadSDNode *LdNode) {
+  bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD;
   switch (SrcTy.SimpleTy) {
   default:
     llvm_unreachable("Unhandled source type");
@@ -3649,5 +3650,14 @@
     case MVT::i32:
       return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
     }
+  case MVT::f16:
+    switch (DestTy.SimpleTy) {
+    default:
+      llvm_unreachable("Unhandled dest type");
+    case MVT::f32:
+      return NVPTX::CVT_f32_f16;
+    case MVT::f64:
+      return NVPTX::CVT_f64_f16;
+    }
   }
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5146,7 +5146,8 @@

   Align Alignment = LD->getAlign();
   auto &TD = DAG.getDataLayout();
-  Align PrefAlign = TD.getPrefTypeAlign(ResVT.getTypeForEVT(*DAG.getContext()));
+  Align PrefAlign =
+      TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
   if (Alignment < PrefAlign) {
     // This load is not sufficiently aligned, so bail out and let this vector
     // load be scalarized. Note that we may still be able to emit smaller
diff --git a/llvm/test/CodeGen/NVPTX/vector-loads.ll b/llvm/test/CodeGen/NVPTX/vector-loads.ll
--- a/llvm/test/CodeGen/NVPTX/vector-loads.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-loads.ll
@@ -97,5 +97,58 @@
   ret void
 }

+; CHECK-LABEL: extv8f16_global_a16(
+define void @extv8f16_global_a16(ptr addrspace(1) noalias readonly align 16 %dst, ptr addrspace(1) noalias readonly align 16 %src) #0 {
+; CHECK: ld.global.v4.b16 {%f
+; CHECK: ld.global.v4.b16 {%f
+  %v = load <8 x half>, ptr addrspace(1) %src, align 16
+  %ext = fpext <8 x half> %v to <8 x float>
+; CHECK: st.global.v4.f32
+; CHECK: st.global.v4.f32
+  store <8 x float> %ext, ptr addrspace(1) %dst, align 16
+  ret void
+}
+
+; CHECK-LABEL: extv8f16_global_a4(
+define void @extv8f16_global_a4(ptr addrspace(1) noalias readonly align 16 %dst, ptr addrspace(1) noalias readonly align 16 %src) #0 {
+; CHECK: ld.global.v2.b16 {%f
+; CHECK: ld.global.v2.b16 {%f
+; CHECK: ld.global.v2.b16 {%f
+; CHECK: ld.global.v2.b16 {%f
+  %v = load <8 x half>, ptr addrspace(1) %src, align 4
+  %ext = fpext <8 x half> %v to <8 x float>
+; CHECK: st.global.v4.f32
+; CHECK: st.global.v4.f32
+  store <8 x float> %ext, ptr addrspace(1) %dst, align 16
+  ret void
+}
+
+
+; CHECK-LABEL: extv8f16_generic_a16(
+define void @extv8f16_generic_a16(ptr noalias readonly align 16 %dst, ptr noalias readonly align 16 %src) #0 {
+; CHECK: ld.v4.b16 {%f
+; CHECK: ld.v4.b16 {%f
+  %v = load <8 x half>, ptr %src, align 16
+  %ext = fpext <8 x half> %v to <8 x float>
+; CHECK: st.v4.f32
+; CHECK: st.v4.f32
+  store <8 x float> %ext, ptr %dst, align 16
+  ret void
+}
+
+; CHECK-LABEL: extv8f16_generic_a4(
+define void @extv8f16_generic_a4(ptr noalias readonly align 16 %dst, ptr noalias readonly align 16 %src) #0 {
+; CHECK: ld.v2.b16 {%f
+; CHECK: ld.v2.b16 {%f
+; CHECK: ld.v2.b16 {%f
+; CHECK: ld.v2.b16 {%f
+  %v = load <8 x half>, ptr %src, align 4
+  %ext = fpext <8 x half> %v to <8 x float>
+; CHECK: st.v4.f32
+; CHECK: st.v4.f32
+  store <8 x float> %ext, ptr %dst, align 16
+  ret void
+}
+
 !1 = !{i32 0, i32 64}