Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -465,7 +465,7 @@ return Changed; } -bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) { +bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) { if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && canWidenScalarExtLoad(I)) { @@ -475,7 +475,23 @@ Type *I32Ty = Builder.getInt32Ty(); Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace()); Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT); - Value *WidenLoad = Builder.CreateLoad(BitCast); + LoadInst *WidenLoad = Builder.CreateLoad(BitCast); + WidenLoad->copyMetadata(I); + + // If we have range metadata, we need to convert the type. + if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) { + ConstantInt *Lower = + mdconst::extract<ConstantInt>(Range->getOperand(0)); + + Metadata *LowAndHigh[] = { + ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))), + // Don't make assumptions about the high bits. 
+ ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0xffffffff)) + }; + + WidenLoad->setMetadata(LLVMContext::MD_range, + MDNode::get(Mod->getContext(), LowAndHigh)); + } int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType()); Type *IntNTy = Builder.getIntNTy(TySize); Index: test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll =================================================================== --- test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll +++ test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll @@ -189,4 +189,50 @@ ret void } +; OPT-LABEL: @constant_load_i16_align4_range( +; OPT: load i32, i32 addrspace(4)* %1, !range !0 +define amdgpu_kernel void @constant_load_i16_align4_range(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 { + %ld = load i16, i16 addrspace(4)* %in, align 4, !range !0 + %ext = sext i16 %ld to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; OPT-LABEL: @constant_load_i16_align4_range_max( +; OPT: load i32, i32 addrspace(4)* %1, !range !0 +define amdgpu_kernel void @constant_load_i16_align4_range_max(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 { + %ld = load i16, i16 addrspace(4)* %in, align 4, !range !1 + %ext = sext i16 %ld to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; OPT-LABEL: @constant_load_i16_align4_complex_range( +; OPT: load i32, i32 addrspace(4)* %1, !range !1 +define amdgpu_kernel void @constant_load_i16_align4_complex_range(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 { + %ld = load i16, i16 addrspace(4)* %in, align 4, !range !2 + %ext = sext i16 %ld to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + + +; OPT-LABEL: @constant_load_i16_align4_invariant +; OPT: load i32, i32 addrspace(4)* %1, !invariant.load !2 +define amdgpu_kernel void @constant_load_i16_align4_invariant(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 { + %ld = load i16, i16 addrspace(4)* %in, align 4, !invariant.load !3 + %ext = sext i16 %ld to i32 + store i32 %ext, i32 addrspace(1)* %out 
+ ret void +} + attributes #0 = { nounwind } + +; OPT: !0 = !{i32 5, i32 -1} +; OPT: !1 = !{i32 8, i32 -1} +; OPT: !2 = !{} + +!0 = !{i16 5, i16 500} +!1 = !{i16 5, i16 -1} +!2 = !{i16 8, i16 12, i16 42, i16 99} +!3 = !{}