Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp =================================================================== --- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -1226,8 +1226,11 @@ MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg()); MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg()); - if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 || - !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64) + if (!BaseLoDef || !BaseHiDef || + (BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e32 && + BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64) || + (BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e32 && + BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)) return; const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0); Index: test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll =================================================================== --- test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -75,15 +75,15 @@ ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off entry: @@ -201,12 +201,12 @@ ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072 -; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-3072 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-1024 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 entry: %call = tail call i64 @_Z13get_global_idj(i32 0) %conv = and i64 %call, 255 @@ -483,3 +483,75 @@ store i64 %add, i64 addrspace(1)* %buffer_head, align 8 ret void } + +define amdgpu_kernel void @clmem_copy_simplified(i8 addrspace(1)* nocapture %arg) { +; GFX9-LABEL: clmem_copy_simplified: +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], [[BASE:v\[[0-9]+:[0-9]+\]]], off offset:-4096 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], [[BASE]], off offset:-2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], [[BASE]], off +bb: + %tmp = tail call i64 @_Z13get_global_idj(i32 0) + %tmp1 = and i64 %tmp, 255 + %tmp2 = shl i64 %tmp, 17 + %tmp3 = and i64 %tmp2, 4261412864 + %tmp4 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 %tmp3 + %tmp5 = bitcast i8 addrspace(1)* %tmp4 to i64 addrspace(1)* + %tmp6 = getelementptr inbounds i64, i64 addrspace(1)* %tmp5, i64 %tmp1 + %tmp7 = getelementptr inbounds i8, i8 addrspace(1)* %tmp4, i64 16777216 + %tmp8 = bitcast i8 addrspace(1)* %tmp7 to i64 addrspace(1)* + %tmp9 = getelementptr inbounds i64, i64 addrspace(1)* %tmp8, i64 %tmp1 + br label %bb13 + +bb10: ; preds = %bb15 + %tmp11 = add nsw i32 %tmp14, -1 + %tmp12 = icmp eq i32 %tmp14, 0 + br i1 %tmp12, label %bb98, label %bb13 + +bb13: ; preds = %bb10, %bb + %tmp14 = phi i32 [ 127, %bb ], [ %tmp11, %bb10 ] + br label %bb15 + +bb15: ; preds = %bb15, %bb13 + %tmp16 = phi i32 [ 0, %bb13 ], [ %tmp96, %bb15 ] + %tmp17 = zext i32 %tmp16 to i64 + %tmp18 = getelementptr inbounds i64, i64 addrspace(1)* %tmp6, i64 %tmp17 + %tmp19 = load i64, i64 addrspace(1)* %tmp18, align 8 + %tmp20 = getelementptr inbounds i64, i64 addrspace(1)* %tmp9, i64 %tmp17 + store i64 %tmp19, i64 addrspace(1)* %tmp20, align 8 + %tmp21 = or i32 %tmp16, 256 + %tmp22 = zext i32 %tmp21 to i64 + %tmp23 = getelementptr inbounds i64, i64 addrspace(1)* %tmp6, i64 %tmp22 + %tmp24 = load i64, i64 addrspace(1)* %tmp23, align 8 + %tmp25 = getelementptr inbounds i64, i64 addrspace(1)* %tmp9, i64 %tmp22 + store i64 %tmp24, i64 addrspace(1)* %tmp25, align 8 + %tmp26 = or i32 %tmp16, 512 + %tmp27 = zext i32 %tmp26 to i64 + %tmp28 = getelementptr inbounds i64, i64 addrspace(1)* %tmp6, i64 %tmp27 + %tmp29 = load i64, i64 addrspace(1)* %tmp28, align 8 + %tmp30 = getelementptr inbounds i64, i64 addrspace(1)* %tmp9, i64 %tmp27 + store i64 %tmp29, i64 addrspace(1)* %tmp30, align 8 + %tmp31 = or i32 %tmp16, 768 + %tmp32 = zext i32 %tmp31 to i64 + %tmp33 = getelementptr inbounds i64, i64 addrspace(1)* %tmp6, i64 %tmp32 + %tmp34 = load i64, i64 addrspace(1)* %tmp33, align 8 + %tmp35 = getelementptr inbounds i64, i64 addrspace(1)* %tmp9, i64 %tmp32 + store i64 %tmp34, i64 addrspace(1)* %tmp35, align 8 + %tmp36 = or i32 %tmp16, 1024 + %tmp37 = zext i32 %tmp36 to i64 + %tmp38 = getelementptr inbounds i64, i64 addrspace(1)* %tmp6, i64 %tmp37 + %tmp39 = load i64, i64 addrspace(1)* %tmp38, align 8 + %tmp40 = getelementptr inbounds i64, i64 addrspace(1)* %tmp9, i64 %tmp37 + store i64 %tmp39, i64 addrspace(1)* %tmp40, align 8 + %tmp41 = or i32 %tmp16, 1280 + %tmp42 = zext i32 %tmp41 to i64 + %tmp43 = getelementptr inbounds i64, i64 addrspace(1)* %tmp6, i64 %tmp42 + %tmp44 = load i64, i64 addrspace(1)* %tmp43, align 8 + %tmp45 = getelementptr inbounds i64, i64 addrspace(1)* %tmp9, i64 %tmp42 + store i64 %tmp44, i64 addrspace(1)* %tmp45, align 8 + %tmp96 = add nuw nsw i32 %tmp16, 4096 + %tmp97 = icmp ult i32 %tmp96, 2097152 + br i1 %tmp97, label %bb15, label %bb10 + +bb98: ; preds = %bb10 + ret void +}