Index: llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1709,6 +1709,9 @@
     if (!I->isCopy() && !I->isRegSequence())
       return false;
     Register DstReg = I->getOperand(0).getReg();
+    // Physical registers may have more than one defining instruction
+    if (DstReg.isPhysical())
+      return false;
     if (TRI->isAGPR(*MRI, DstReg))
       continue;
     MoveRegs.push_back(DstReg);
Index: llvm/test/CodeGen/AMDGPU/fold-vgpr-phyreg.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/fold-vgpr-phyreg.mir
@@ -0,0 +1,44 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -o - %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=si-fold-operands -verify-machineinstrs | FileCheck %s
+
+---
+name: lshl_add_u64_gep
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+    ; CHECK-LABEL: name: lshl_add_u64_gep
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+    ; CHECK-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 2, [[REG_SEQUENCE]], implicit $exec
+    ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[V_LSHLREV_B64_e64_]].sub0, 0, implicit $exec
+    ; CHECK-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY2]], [[V_LSHLREV_B64_e64_]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD killed [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr
+    ; CHECK-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+    %0:vgpr_32 = COPY $vgpr3
+    %1:vgpr_32 = COPY $vgpr2
+    %2:vgpr_32 = COPY $vgpr1
+    %3:vgpr_32 = COPY $vgpr0
+    %4:vreg_64_align2 = REG_SEQUENCE %1, %subreg.sub0, %0, %subreg.sub1
+    %5:sreg_32 = S_MOV_B32 2
+    %6:vreg_64_align2 = V_LSHLREV_B64_e64 killed %5, %4, implicit $exec
+    %7:vgpr_32 = COPY %3
+    %8:vgpr_32 = COPY %6.sub0
+    %9:vgpr_32 = COPY %2
+    %10:vgpr_32 = COPY %6.sub1
+    %11:vgpr_32, %12:sreg_64_xexec = V_ADD_CO_U32_e64 %7, %8, 0, implicit $exec
+    %13:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 %9, %10, killed %12, 0, implicit $exec
+    %15:vreg_64_align2 = REG_SEQUENCE %11, %subreg.sub0, %13, %subreg.sub1
+    %16:vgpr_32 = FLAT_LOAD_DWORD killed %15, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = COPY %16
+    SI_RETURN implicit $vgpr0
+
+...
Index: llvm/test/CodeGen/AMDGPU/swdev373493.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/swdev373493.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -o - %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs | FileCheck %s
+
+@global = external protected addrspace(4) externally_initialized global [4096 x i64], align 16
+
+define hidden fastcc void @bar(i32 %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6) unnamed_addr align 2 {
+; CHECK-LABEL: bar:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v15, v12
+; CHECK-NEXT:    v_mov_b32_e32 v14, v11
+; CHECK-NEXT:    v_mov_b32_e32 v13, v10
+; CHECK-NEXT:    v_mov_b32_e32 v12, v9
+; CHECK-NEXT:    v_mov_b32_e32 v11, v8
+; CHECK-NEXT:    v_mov_b32_e32 v10, v7
+; CHECK-NEXT:    v_mov_b32_e32 v9, v6
+; CHECK-NEXT:    v_mov_b32_e32 v8, v5
+; CHECK-NEXT:    v_mov_b32_e32 v7, v4
+; CHECK-NEXT:    v_mov_b32_e32 v6, v3
+; CHECK-NEXT:    s_cmp_lt_i32 s4, 3
+; CHECK-NEXT:    s_cbranch_scc0 .LBB0_3
+; CHECK-NEXT:  ; %bb.1: ; %LeafBlock
+; CHECK-NEXT:    s_cbranch_scc1 .LBB0_5
+; CHECK-NEXT:  ; %bb.2: ; %bb7
+; CHECK-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
+; CHECK-NEXT:    s_getpc_b64 s[16:17]
+; CHECK-NEXT:    s_add_u32 s16, s16, global@rel32@lo+1948
+; CHECK-NEXT:    s_addc_u32 s17, s17, global@rel32@hi+1956
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s16
+; CHECK-NEXT:    v_mov_b32_e32 v1, s17
+; CHECK-NEXT:    s_getpc_b64 s[18:19]
+; CHECK-NEXT:    s_add_u32 s18, s18, eggs@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s19, s19, eggs@rel32@hi+12
+; CHECK-NEXT:    s_setpc_b64 s[18:19]
+; CHECK-NEXT:  .LBB0_3: ; %LeafBlock1
+; CHECK-NEXT:    s_cbranch_scc0 .LBB0_5
+; CHECK-NEXT:  ; %bb.4: ; %bb8
+; CHECK-NEXT:    v_mov_b32_e32 v0, v1
+; CHECK-NEXT:    v_mov_b32_e32 v1, v2
+; CHECK-NEXT:    v_mov_b32_e32 v2, v6
+; CHECK-NEXT:    v_mov_b32_e32 v3, v7
+; CHECK-NEXT:    v_mov_b32_e32 v4, v8
+; CHECK-NEXT:    v_mov_b32_e32 v5, v9
+; CHECK-NEXT:    v_mov_b32_e32 v6, v10
+; CHECK-NEXT:    v_mov_b32_e32 v7, v11
+; CHECK-NEXT:    v_mov_b32_e32 v8, v12
+; CHECK-NEXT:    v_mov_b32_e32 v9, v13
+; CHECK-NEXT:    v_mov_b32_e32 v10, v14
+; CHECK-NEXT:    v_mov_b32_e32 v11, v15
+; CHECK-NEXT:    s_getpc_b64 s[16:17]
+; CHECK-NEXT:    s_add_u32 s16, s16, quux@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s17, s17, quux@rel32@hi+12
+; CHECK-NEXT:    s_setpc_b64 s[16:17]
+; CHECK-NEXT:  .LBB0_5: ; %bb9
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+bb:
+  switch i32 undef, label %bb9 [
+    i32 3, label %bb8
+    i32 1, label %bb7
+  ]
+
+bb7: ; preds = %bb
+  %tmp = load ptr, ptr undef, align 8
+  tail call fastcc void @eggs(ptr noundef addrspacecast (ptr addrspace(4) getelementptr inbounds ([4096 x i64], ptr addrspace(4) @global, i64 0, i64 243) to ptr), ptr %tmp, ptr undef, ptr noundef nonnull align 8 dereferenceable(24) %arg2, ptr noundef %arg3, ptr noundef %arg4, ptr noundef %arg5)
+  br label %bb9
+
+bb8: ; preds = %bb
+  tail call fastcc void @quux(ptr noundef nonnull align 8 dereferenceable(24) %arg1, ptr noundef nonnull align 8 dereferenceable(24) %arg2, ptr noundef %arg3, ptr noundef %arg4, ptr noundef %arg5, ptr noundef nonnull align 8 dereferenceable(8) %arg6)
+  br label %bb9
+
+bb9: ; preds = %bb8, %bb7, %bb
+  ret void
+}
+
+declare dso_local fastcc void @eggs(ptr, ptr, ptr, ptr, ptr, ptr, ptr) unnamed_addr align 2
+
+declare dso_local fastcc void @quux(ptr, ptr, ptr, ptr, ptr, ptr) unnamed_addr align 2