Patch summary (free text ahead of the first "Index:" header; not part of any hunk).

SIISelLowering.cpp, hunk at line 8586:
  The call TLI.ShrinkDemandedConstant(Src, Demanded, TLO) is now attempted
  only when Src->hasOneUse() is true. TLI.SimplifyDemandedBits(Src, Demanded,
  Known, TLO) remains the other side of the "||" and is therefore evaluated
  whenever the guarded left-hand side is false. If either side returns true,
  DCI.CommitTargetLoweringOpt(TLO) is called, exactly as before.
  NOTE(review): presumably the guard exists because shrinking the constant
  operand of Src would also change the value seen by Src's other users;
  confirm against the review thread for this change.

cvt_f32_ubyte.ll, hunk at line 281:
  Adds the kernel @cvt_ubyte0_or_multiuse. In it, %or (load | -2147483647)
  has two users: "and i32 %or, 255" feeding uitofp, and a bitcast to float
  feeding fadd. The GCN check lines require that
    * v_or_b32_e32 keeps the full immediate 0x80000001 (the i32 bit pattern
      of -2147483647),
    * v_cvt_f32_ubyte0_e32 reads the OR result, and
    * v_add_f32_e32 combines the OR result with the converted value before
      the result is stored with buffer_store_dword.
  NOTE(review): this hunk adds no declaration for
  @llvm.amdgcn.workitem.id.x; presumably one already exists earlier in the
  test file. Verify.

NOTE(review): the leading indentation of the hunk lines below could not be
confirmed from this copy of the patch; check it against the target files
before applying.

Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8586,7 +8586,7 @@
   TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                         !DCI.isBeforeLegalizeOps());
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) ||
+  if ((Src->hasOneUse() && TLI.ShrinkDemandedConstant(Src, Demanded, TLO)) ||
       TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
     DCI.CommitTargetLoweringOpt(TLO);
   }
Index: test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
===================================================================
--- test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -281,3 +281,23 @@
   store float %cvt, float addrspace(1)* %out
   ret void
 }
+
+; GCN-LABEL: {{^}}cvt_ubyte0_or_multiuse:
+; GCN: {{buffer|flat}}_load_dword [[LOADREG:v[0-9]+]],
+; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], 0x80000001, [[LOADREG]]
+; GCN-DAG: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[OR]]
+; GCN: v_add_f32_e32 [[RES:v[0-9]+]], [[OR]], [[CONV]]
+; GCN: buffer_store_dword [[RES]],
+define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float addrspace(1)* %out) {
+bb:
+  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %lid
+  %load = load i32, i32 addrspace(1)* %gep
+  %or = or i32 %load, -2147483647
+  %and = and i32 %or, 255
+  %uitofp = uitofp i32 %and to float
+  %cast = bitcast i32 %or to float
+  %add = fadd float %cast, %uitofp
+  store float %add, float addrspace(1)* %out
+  ret void
+}