Index: llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -168,7 +168,9 @@
   }
 
   auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
-                         OrigMI.getDebugLoc(), TII->get(DPPOp));
+                         OrigMI.getDebugLoc(), TII->get(DPPOp))
+                     .setMIFlags(OrigMI.getFlags());
+
   bool Fail = false;
   do {
     auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst);
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3260,7 +3260,8 @@
                                            unsigned Op32) const {
   MachineBasicBlock *MBB = MI.getParent();;
   MachineInstrBuilder Inst32 =
-    BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32));
+      BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32))
+          .setMIFlags(MI.getFlags());
 
   // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
   // For VOPC instructions, this is replaced by an implicit def of vcc.
Index: llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -922,18 +922,24 @@
     if (I->modifiesRegister(AMDGPU::VCC, TRI))
       return;
   }
+
   // Make the two new e32 instruction variants.
   // Replace MI with V_{SUB|ADD}_I32_e32
-  auto NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc));
-  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst));
-  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
-  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1));
+  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
+    .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
+    .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
+    .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
+    .setMIFlags(MI.getFlags());
+
   MI.eraseFromParent();
+
   // Replace MISucc with V_{SUBB|ADDC}_U32_e32
-  auto NewInst = BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc));
-  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst));
-  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0));
-  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1));
+  BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc))
+    .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst))
+    .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0))
+    .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1))
+    .setMIFlags(MISucc.getFlags());
+
   MISucc.eraseFromParent();
 }
 
@@ -1010,7 +1016,8 @@
 
   // Create SDWA version of instruction MI and initialize its operands
   MachineInstrBuilder SDWAInst =
-    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);
+      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
+          .setMIFlags(MI.getFlags());
 
   // Copy dst, if it is present in original then should also be present in SDWA
   MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
Index: llvm/test/CodeGen/AMDGPU/dpp_combine.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/dpp_combine.mir
+++ llvm/test/CodeGen/AMDGPU/dpp_combine.mir
@@ -293,6 +293,8 @@
     %19:vgpr_32 = V_ADD_I32_e32 5, %18, implicit-def $vcc, implicit $exec
 ...
 
+---
+
 # check for floating point modifiers
 # GCN-LABEL: name: add_f32_e64
 # GCN: %3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
@@ -810,3 +812,24 @@
     %4:sreg_64_xexec = IMPLICIT_DEF
     %5:vgpr_32 = V_CNDMASK_B32_e64 0, %3, 0, %1, %4, implicit $exec
 ...
+
+---
+
+# Make sure flags aren't dropped
+# GCN-LABEL: name: flags_add_f32_e64
+# GCN: %4:vgpr_32 = nnan nofpexcept V_ADD_F32_dpp %2, 0, %1, 0, %0, 1, 15, 15, 1, implicit $mode, implicit $exec
+name: flags_add_f32_e64
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = IMPLICIT_DEF
+
+    %3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
+    %4:vgpr_32 = nofpexcept nnan V_ADD_F32_e64 0, %3, 0, %0, 0, 0, implicit $mode, implicit $exec
+    S_ENDPGM 0, implicit %4
+
+...
Index: llvm/test/CodeGen/AMDGPU/sdwa-ops.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/sdwa-ops.mir
+++ llvm/test/CodeGen/AMDGPU/sdwa-ops.mir
@@ -3,8 +3,8 @@
 
 # test for 3 consecutive _sdwa's
 # GFX9-LABEL: name: test1_add_co_sdwa
-# GFX9: V_ADD_I32_sdwa
-# GFX9-NEXT: V_ADDC_U32_e32
+# GFX9: = nsw V_ADD_I32_sdwa
+# GFX9-NEXT: = nuw V_ADDC_U32_e32
 # GFX9: V_ADD_I32_sdwa
 # GFX9-NEXT: V_ADDC_U32_e32
 # GFX9: V_ADD_I32_sdwa
@@ -26,8 +26,8 @@
     %22:sreg_32_xm0 = S_MOV_B32 255
     %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec
     %30:vreg_64 = COPY $sgpr0_sgpr1
-    %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, 0, implicit $exec
-    %64:vgpr_32, dead %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %65, 0, implicit $exec
+    %63:vgpr_32, %65:sreg_64_xexec = nsw V_ADD_I32_e64 %30.sub0, %23, 0, implicit $exec
+    %64:vgpr_32, dead %66:sreg_64_xexec = nuw V_ADDC_U32_e64 %30.sub1, %0, killed %65, 0, implicit $exec
     %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1
     GLOBAL_STORE_DWORDX2_SADDR %30, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
 
Index: llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
+++ llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
@@ -361,7 +361,6 @@
 
 # GFX9: $vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %{{[0-9]+}}, 1, implicit-def $exec, implicit $mode, implicit $exec
 
-
 name: vopc_instructions
 tracksRegLiveness: true
 registers:
@@ -445,3 +444,27 @@
     FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     $sgpr30_sgpr31 = COPY %2
     S_SETPC_B64_return $sgpr30_sgpr31
+...
+
+# GCN-LABEL: name: preserve_flags
+# GCN: = nnan nofpexcept V_ADD_F32_sdwa 0, %4, 0, %4, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec
+
+---
+name: preserve_flags
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:sreg_32 = S_MOV_B32 65535
+    %2:vgpr_32 = V_LSHRREV_B32_e64 16, %0, implicit $exec
+    %3:vgpr_32 = V_AND_B32_e32 %1, %2, implicit $exec
+    %4:vgpr_32 = V_LSHLREV_B32_e64 16, %3, implicit $exec
+    %5:vgpr_32 = V_LSHRREV_B32_e64 16, %4, implicit $exec
+    %6:vgpr_32 = V_BFE_U32 %4, 8, 8, implicit $exec
+    %7:vgpr_32 = nnan nofpexcept V_ADD_F32_e32 %5, %6, implicit $mode, implicit $exec
+    S_ENDPGM 0, implicit %7
+
+...
Index: llvm/test/CodeGen/AMDGPU/shrink-instructions-flags.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/shrink-instructions-flags.mir
@@ -0,0 +1,24 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc --mtriple=amdgcn--amdhsa -mcpu=fiji -run-pass=si-shrink-instructions %s -o - | FileCheck %s
+
+# Make sure flags are preserved when shrinking instructions
+---
+
+name: shrink_fadd_f32_flags
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: shrink_fadd_f32_flags
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK: %2:vgpr_32 = nnan nofpexcept V_ADD_F32_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
+    ; CHECK: S_NOP 0
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr0
+    %2:vgpr_32 = nofpexcept nnan V_ADD_F32_e64 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec
+    S_NOP 0
+
+...
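
Note: every source hunk above applies the same idiom when a pass rebuilds an instruction: forward the original MachineInstr's flags (nnan, nsw, nofpexcept, ...) to the replacement through MachineInstrBuilder::setMIFlags, so later passes still see them. The snippet below is only an illustrative sketch of that pattern, not part of the patch; the helper name buildReplacement is hypothetical.

// Illustrative sketch only (not part of the patch): the flag-forwarding
// idiom applied in GCNDPPCombine, SIInstrInfo::buildShrunkInst and
// SIPeepholeSDWA above.
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/MC/MCInstrDesc.h"

using namespace llvm;

// Hypothetical helper: start a replacement for OrigMI using the new opcode
// description NewDesc, inserted at OrigMI's position with its debug location,
// and copy OrigMI's MI flags (nnan, nsw, nofpexcept, ...) onto it.
static MachineInstrBuilder buildReplacement(MachineInstr &OrigMI,
                                            const MCInstrDesc &NewDesc) {
  return BuildMI(*OrigMI.getParent(), OrigMI, OrigMI.getDebugLoc(), NewDesc)
      .setMIFlags(OrigMI.getFlags());
}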