Index: llvm/lib/Target/AMDGPU/VOP3Instructions.td =================================================================== --- llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -570,9 +570,20 @@ } return true; - }] -> { + }]> { let PredicateCodeUsesOperands = 1; + + // The divergence predicate is irrelevant in GlobalISel, as we have + // proper register bank checks. We also force all VOP instruction + // operands to VGPR, so we should not need to check the constant bus + // restriction. + // + // FIXME: With unlucky SGPR operands, we could penalize code by + // blocking folding SGPR->VGPR copies later. + // FIXME: There's no register bank verifier + // FIXME: Should add a way for the emitter to recognize this is a + // trivially true predicate to eliminate the check. + let GISelPredicateCode = [{return true;}]; } let SubtargetPredicate = isGFX9Plus in { @@ -614,7 +625,7 @@ class ThreeOp_i32_Pats : GCNPat < // This matches (op2 (op1 i32:$src0, i32:$src1), i32:$src2) with conditions. 
(ThreeOpFrag i32:$src0, i32:$src1, i32:$src2), - (inst i32:$src0, i32:$src1, i32:$src2) + (inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2) >; def : ThreeOp_i32_Pats; Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir @@ -21,16 +21,15 @@ ; GFX6: %7:vgpr_32, dead %12:sreg_64_xexec = V_ADD_I32_e64 [[COPY2]], [[S_ADD_U32_]], 0, implicit $exec ; GFX6: %8:vgpr_32, dead %11:sreg_64_xexec = V_ADD_I32_e64 [[S_ADD_U32_]], %7, 0, implicit $exec ; GFX6: %9:vgpr_32, dead %10:sreg_64_xexec = V_ADD_I32_e64 %8, [[COPY2]], 0, implicit $exec - ; GFX6: S_ENDPGM 0, implicit %9 + ; GFX6: S_ENDPGM 0, implicit %8, implicit %9 ; GFX9-LABEL: name: add_s32 ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], [[COPY1]], implicit-def $scc - ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY2]], [[S_ADD_U32_]], 0, implicit $exec - ; GFX9: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[S_ADD_U32_]], [[V_ADD_U32_e64_]], 0, implicit $exec - ; GFX9: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_1]], [[COPY2]], 0, implicit $exec - ; GFX9: S_ENDPGM 0, implicit [[V_ADD_U32_e64_2]] + ; GFX9: [[V_ADD3_U32_:%[0-9]+]]:vgpr_32 = V_ADD3_U32 [[COPY2]], [[S_ADD_U32_]], [[S_ADD_U32_]], implicit $exec + ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD3_U32_]], [[COPY2]], 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_ADD3_U32_]], implicit [[V_ADD_U32_e64_]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 %2:vgpr(s32) = COPY $vgpr0 @@ -50,7 +49,7 @@ ; add vv %9:vgpr(s32) = G_ADD %8, %2 - S_ENDPGM 0, implicit %9 + S_ENDPGM 0, implicit %8, implicit %9 ... 
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-add3.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-add3.mir @@ -0,0 +1,132 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s + +--- + +name: add_s32_sgpr_sgpr_sgpr +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2 + ; GFX8-LABEL: name: add_s32_sgpr_sgpr_sgpr + ; GFX8: liveins: $sgpr0, $sgpr1, $sgpr2 + ; GFX8: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX8: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], [[COPY1]], implicit-def $scc + ; GFX8: [[S_ADD_U32_1:%[0-9]+]]:sreg_32 = S_ADD_U32 [[S_ADD_U32_]], [[COPY2]], implicit-def $scc + ; GFX8: S_ENDPGM 0, implicit [[S_ADD_U32_1]] + ; GFX9-LABEL: name: add_s32_sgpr_sgpr_sgpr + ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], [[COPY1]], implicit-def $scc + ; GFX9: [[S_ADD_U32_1:%[0-9]+]]:sreg_32 = S_ADD_U32 [[S_ADD_U32_]], [[COPY2]], implicit-def $scc + ; GFX9: S_ENDPGM 0, implicit [[S_ADD_U32_1]] + ; GFX10-LABEL: name: add_s32_sgpr_sgpr_sgpr + ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; 
GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX10: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX10: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], [[COPY1]], implicit-def $scc + ; GFX10: [[S_ADD_U32_1:%[0-9]+]]:sreg_32 = S_ADD_U32 [[S_ADD_U32_]], [[COPY2]], implicit-def $scc + ; GFX10: S_ENDPGM 0, implicit [[S_ADD_U32_1]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s32) = COPY $sgpr2 + %3:sgpr(s32) = G_ADD %0, %1 + %4:sgpr(s32) = G_ADD %3, %2 + S_ENDPGM 0, implicit %4 +... + +--- + +name: add_s32_vgpr_vgpr_vgpr +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX8-LABEL: name: add_s32_vgpr_vgpr_vgpr + ; GFX8: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8: %3:vgpr_32, dead %6:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[COPY1]], 0, implicit $exec + ; GFX8: %4:vgpr_32, dead %5:sreg_64_xexec = V_ADD_I32_e64 %3, [[COPY2]], 0, implicit $exec + ; GFX8: S_ENDPGM 0, implicit %4 + ; GFX9-LABEL: name: add_s32_vgpr_vgpr_vgpr + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[V_ADD3_U32_:%[0-9]+]]:vgpr_32 = V_ADD3_U32 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_ADD3_U32_]] + ; GFX10-LABEL: name: add_s32_vgpr_vgpr_vgpr + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10: [[V_ADD3_U32_:%[0-9]+]]:vgpr_32 = V_ADD3_U32 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_ADD3_U32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY 
$vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_ADD %0, %1 + %4:vgpr(s32) = G_ADD %3, %2 + S_ENDPGM 0, implicit %4 +... + +--- + +name: add_s32_vgpr_vgpr_vgpr_multi_use +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX8-LABEL: name: add_s32_vgpr_vgpr_vgpr_multi_use + ; GFX8: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8: %3:vgpr_32, dead %6:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[COPY1]], 0, implicit $exec + ; GFX8: %4:vgpr_32, dead %5:sreg_64_xexec = V_ADD_I32_e64 %3, [[COPY2]], 0, implicit $exec + ; GFX8: S_ENDPGM 0, implicit %4, implicit %3 + ; GFX9-LABEL: name: add_s32_vgpr_vgpr_vgpr_multi_use + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec + ; GFX9: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[COPY2]], 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_]] + ; GFX10-LABEL: name: add_s32_vgpr_vgpr_vgpr_multi_use + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec + ; GFX10: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[COPY2]], 0, implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = 
G_ADD %0, %1 + %4:vgpr(s32) = G_ADD %3, %2 + S_ENDPGM 0, implicit %4, implicit %3 +... + Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-or3.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-or3.mir @@ -0,0 +1,132 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s + +--- + +name: or_s32_sgpr_sgpr_sgpr +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2 + ; GFX8-LABEL: name: or_s32_sgpr_sgpr_sgpr + ; GFX8: liveins: $sgpr0, $sgpr1, $sgpr2 + ; GFX8: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX8: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY]], [[COPY1]], implicit-def dead $scc + ; GFX8: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_OR_B32_]], [[COPY2]], implicit-def dead $scc + ; GFX8: S_ENDPGM 0, implicit [[S_OR_B32_1]] + ; GFX9-LABEL: name: or_s32_sgpr_sgpr_sgpr + ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX9: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY]], [[COPY1]], implicit-def dead $scc + ; GFX9: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_OR_B32_]], [[COPY2]], implicit-def dead $scc + ; GFX9: S_ENDPGM 0, implicit [[S_OR_B32_1]] + ; GFX10-LABEL: name: or_s32_sgpr_sgpr_sgpr + ; GFX10: liveins: $sgpr0, $sgpr1, 
$sgpr2 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX10: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX10: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY]], [[COPY1]], implicit-def dead $scc + ; GFX10: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_OR_B32_]], [[COPY2]], implicit-def dead $scc + ; GFX10: S_ENDPGM 0, implicit [[S_OR_B32_1]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s32) = COPY $sgpr2 + %3:sgpr(s32) = G_OR %0, %1 + %4:sgpr(s32) = G_OR %3, %2 + S_ENDPGM 0, implicit %4 +... + +--- + +name: or_s32_vgpr_vgpr_vgpr +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX8-LABEL: name: or_s32_vgpr_vgpr_vgpr + ; GFX8: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_]], [[COPY2]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_OR_B32_e64_1]] + ; GFX9-LABEL: name: or_s32_vgpr_vgpr_vgpr + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[V_OR3_B32_:%[0-9]+]]:vgpr_32 = V_OR3_B32 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_OR3_B32_]] + ; GFX10-LABEL: name: or_s32_vgpr_vgpr_vgpr + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10: [[V_OR3_B32_:%[0-9]+]]:vgpr_32 = V_OR3_B32 [[COPY]], [[COPY1]], [[COPY2]], 
implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_OR3_B32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_OR %0, %1 + %4:vgpr(s32) = G_OR %3, %2 + S_ENDPGM 0, implicit %4 +... + +--- + +name: or_s32_vgpr_vgpr_vgpr_multi_use +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX8-LABEL: name: or_s32_vgpr_vgpr_vgpr_multi_use + ; GFX8: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_]], [[COPY2]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_OR_B32_e64_1]], implicit [[V_OR_B32_e64_]] + ; GFX9-LABEL: name: or_s32_vgpr_vgpr_vgpr_multi_use + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_]], [[COPY2]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_OR_B32_e64_1]], implicit [[V_OR_B32_e64_]] + ; GFX10-LABEL: name: or_s32_vgpr_vgpr_vgpr_multi_use + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX10: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_]], [[COPY2]], implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_OR_B32_e64_1]], implicit 
[[V_OR_B32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_OR %0, %1 + %4:vgpr(s32) = G_OR %3, %2 + S_ENDPGM 0, implicit %4, implicit %3 +... + Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-xor3.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-xor3.mir @@ -0,0 +1,188 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s + +--- + +name: xor_s32_sgpr_sgpr_sgpr +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2 + ; GFX8-LABEL: name: xor_s32_sgpr_sgpr_sgpr + ; GFX8: liveins: $sgpr0, $sgpr1, $sgpr2 + ; GFX8: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX8: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[COPY]], [[COPY1]], implicit-def dead $scc + ; GFX8: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 [[S_XOR_B32_]], [[COPY2]], implicit-def dead $scc + ; GFX8: S_ENDPGM 0, implicit [[S_XOR_B32_1]] + ; GFX9-LABEL: name: xor_s32_sgpr_sgpr_sgpr + ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX9: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[COPY]], [[COPY1]], implicit-def dead $scc + ; GFX9: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 [[S_XOR_B32_]], [[COPY2]], implicit-def dead 
$scc + ; GFX9: S_ENDPGM 0, implicit [[S_XOR_B32_1]] + ; GFX10-LABEL: name: xor_s32_sgpr_sgpr_sgpr + ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX10: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX10: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[COPY]], [[COPY1]], implicit-def dead $scc + ; GFX10: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 [[S_XOR_B32_]], [[COPY2]], implicit-def dead $scc + ; GFX10: S_ENDPGM 0, implicit [[S_XOR_B32_1]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s32) = COPY $sgpr2 + %3:sgpr(s32) = G_XOR %0, %1 + %4:sgpr(s32) = G_XOR %3, %2 + S_ENDPGM 0, implicit %4 +... + +--- + +name: xor_s32_vgpr_vgpr_vgpr +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX8-LABEL: name: xor_s32_vgpr_vgpr_vgpr + ; GFX8: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_XOR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[V_XOR_B32_e64_]], [[COPY2]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_XOR_B32_e64_1]] + ; GFX9-LABEL: name: xor_s32_vgpr_vgpr_vgpr + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_XOR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[V_XOR_B32_e64_]], [[COPY2]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_XOR_B32_e64_1]] + ; GFX10-LABEL: name: xor_s32_vgpr_vgpr_vgpr + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10: 
$vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10: [[V_XOR3_B32_:%[0-9]+]]:vgpr_32 = V_XOR3_B32 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_XOR3_B32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_XOR %0, %1 + %4:vgpr(s32) = G_XOR %3, %2 + S_ENDPGM 0, implicit %4 +... + +# Mixed SGPR and VGPR, with full copy from scalar xor to VGPR, as +# should actually be produced by RegBankSelect + +--- + +name: xor_s32_sgpr_sgpr_vgpr_copy +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $vgpr0 + + ; GFX8-LABEL: name: xor_s32_sgpr_sgpr_vgpr_copy + ; GFX8: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX8: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY $vgpr0 + ; GFX8: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[COPY]], [[COPY1]], implicit-def dead $scc + ; GFX8: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_XOR_B32_]] + ; GFX8: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[COPY3]], [[COPY2]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_XOR_B32_e64_]] + ; GFX9-LABEL: name: xor_s32_sgpr_sgpr_vgpr_copy + ; GFX9: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY $vgpr0 + ; GFX9: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[COPY]], [[COPY1]], implicit-def dead $scc + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_XOR_B32_]] + ; GFX9: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[COPY3]], [[COPY2]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_XOR_B32_e64_]] + ; GFX10-LABEL: name: xor_s32_sgpr_sgpr_vgpr_copy + ; GFX10: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX10: $vcc_hi = IMPLICIT_DEF 
+ ; GFX10: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX10: [[COPY2:%[0-9]+]]:sreg_32 = COPY $vgpr0 + ; GFX10: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[COPY]], [[COPY1]], implicit-def dead $scc + ; GFX10: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_XOR_B32_]] + ; GFX10: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[COPY3]], [[COPY2]], implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_XOR_B32_e64_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s32) = COPY $vgpr0 + %3:sgpr(s32) = G_XOR %0, %1 + %4:vgpr(s32) = COPY %3 + %5:vgpr(s32) = G_XOR %4, %2 + S_ENDPGM 0, implicit %5 +... + +--- + +name: xor_s32_sgpr_sgpr_vgpr_copy_commute +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $vgpr0 + + ; GFX8-LABEL: name: xor_s32_sgpr_sgpr_vgpr_copy_commute + ; GFX8: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX8: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY $vgpr0 + ; GFX8: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[COPY]], [[COPY1]], implicit-def dead $scc + ; GFX8: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_XOR_B32_]] + ; GFX8: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[COPY2]], [[COPY3]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_XOR_B32_e64_]] + ; GFX9-LABEL: name: xor_s32_sgpr_sgpr_vgpr_copy_commute + ; GFX9: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY $vgpr0 + ; GFX9: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[COPY]], [[COPY1]], implicit-def dead $scc + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_XOR_B32_]] + ; GFX9: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[COPY2]], [[COPY3]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_XOR_B32_e64_]] + ; GFX10-LABEL: name: xor_s32_sgpr_sgpr_vgpr_copy_commute + 
; GFX10: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX10: [[COPY2:%[0-9]+]]:sreg_32 = COPY $vgpr0 + ; GFX10: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[COPY]], [[COPY1]], implicit-def dead $scc + ; GFX10: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_XOR_B32_]] + ; GFX10: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[COPY2]], [[COPY3]], implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_XOR_B32_e64_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s32) = COPY $vgpr0 + %3:sgpr(s32) = G_XOR %0, %1 + %4:vgpr(s32) = COPY %3 + %5:vgpr(s32) = G_XOR %2, %4 + S_ENDPGM 0, implicit %5 +... Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-xor3.xfail.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-xor3.xfail.mir @@ -0,0 +1,22 @@ +# RUN: not llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s + +# ERR: *** Bad machine code: VOP* instruction violates constant bus restriction *** + +--- + +name: xor_s32_sgpr_sgpr_vgpr +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $vgpr0 + + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s32) = COPY $vgpr0 + %3:sgpr(s32) = G_XOR %0, %1 + %4:vgpr(s32) = G_XOR %3, %2 + S_ENDPGM 0, implicit %4 +... 
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -428,19 +428,18 @@ ; GFX9-NEXT: v_mul_lo_u32 v4, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v3, v1, v3 +; GFX9-NEXT: v_mov_b32_e32 v5, 42 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, v3 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v0, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v2, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, 42 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc -; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v4 glc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: flat_atomic_inc v3, v[0:1], v5 glc +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: global_store_dword v[0:1], v3, off ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id @@ -497,19 +496,18 @@ ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, 4, v1 ; GFX9-NEXT: v_mul_hi_u32 v3, 4, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, 4, v1 ; GFX9-NEXT: v_mul_lo_u32 v0, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_mov_b32_e32 v5, 42 
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 42 -; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v5 glc ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id @@ -1012,26 +1010,25 @@ ; ; GFX9-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, 8, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, 8, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, 8, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, 8, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, 8, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, 8, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v3, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, 42 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_add3_u32 v4, v2, v3, v4 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 40, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 
+; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm @@ -1089,23 +1086,22 @@ ; ; GFX9-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, 8, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, 8, v0 -; GFX9-NEXT: v_mul_lo_u32 v0, 8, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, 8, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, 8, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, 8, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: v_mov_b32_e32 v6, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_add3_u32 v2, v2, v3, v4 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s0, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v2, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 40, v3 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id @@ -1351,19 +1347,18 @@ ; GFX9-NEXT: v_mul_lo_u32 v4, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v3, v1, v3 +; GFX9-NEXT: v_mov_b32_e32 v5, 42 +; 
GFX9-NEXT: v_add3_u32 v2, v2, v1, v3 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v0, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v2, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, 42 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc -; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v4 glc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: flat_atomic_inc v3, v[0:1], v5 glc +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_store_dword v[2:3], v0 +; GFX9-NEXT: flat_store_dword v[0:1], v3 ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32* %ptr, i32 %id @@ -1420,19 +1415,18 @@ ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, 4, v1 ; GFX9-NEXT: v_mul_hi_u32 v3, 4, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, 4, v1 ; GFX9-NEXT: v_mul_lo_u32 v0, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_mov_b32_e32 v5, 42 +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 42 -; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v5 glc ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32* %ptr, i32 %id @@ -1754,26 +1748,25 @@ ; ; GFX9-LABEL: 
flat_atomic_inc_ret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, 8, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, 8, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, 8, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, 8, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, 8, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, 8, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v3, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, 42 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_add3_u32 v4, v2, v3, v4 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 40, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm @@ -1831,23 +1824,22 @@ ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0 -; GFX9-NEXT: 
v_mul_lo_u32 v1, 8, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, 8, v0 -; GFX9-NEXT: v_mul_lo_u32 v0, 8, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, 8, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, 8, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, 8, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: v_mov_b32_e32 v6, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_add3_u32 v2, v2, v3, v4 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s0, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v2, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 40, v3 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64* %ptr, i32 %id Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll @@ -30,13 +30,12 @@ ; GFX9-LABEL: is_private_vgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, 8, v1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0 ; GFX9-NEXT: v_mul_hi_u32 v3, 8, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, 8, v1 ; GFX9-NEXT: v_mul_lo_u32 v0, 8, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) 
; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll @@ -30,13 +30,12 @@ ; GFX9-LABEL: is_local_vgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, 8, v1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0 ; GFX9-NEXT: v_mul_hi_u32 v3, 8, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, 8, v1 ; GFX9-NEXT: v_mul_lo_u32 v0, 8, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -64,18 +64,17 @@ ; GFX10-LABEL: update_dpp64_test: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX10-NEXT: v_mul_lo_u32 v2, 0, v0 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mul_lo_u32 v2, 0, v0 ; GFX10-NEXT: v_mul_hi_u32 v3, 8, v0 ; GFX10-NEXT: v_mul_lo_u32 v0, 8, v0 ; GFX10-NEXT: v_mul_lo_u32 v1, 8, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v6, vcc_lo, s0, v0 +; GFX10-NEXT: v_add3_u32 v1, v2, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v5, s3 ; GFX10-NEXT: v_mov_b32_e32 v4, s2 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s1, v1, vcc_lo ; GFX10-NEXT: global_load_dwordx2 v[2:3], 
v[6:7], off ; GFX10-NEXT: s_waitcnt vmcnt(0)