Index: llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -335,21 +335,13 @@
   bool ExecModified = MI.getOperand(3).getImm() != 0;
   MachineBasicBlock::iterator Start = MBB.begin();
 
-  // We are running before TwoAddressInstructions, and si_else's operands are
-  // tied. In order to correctly tie the registers, split this into a copy of
-  // the src like it does.
-  Register CopyReg = MRI->createVirtualRegister(BoolRC);
-  MachineInstr *CopyExec =
-      BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg)
-          .add(MI.getOperand(1)); // Saved EXEC
-
   // This must be inserted before phis and any spill code inserted before the
   // else.
   Register SaveReg = ExecModified ?
     MRI->createVirtualRegister(BoolRC) : DstReg;
   MachineInstr *OrSaveExec =
     BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg)
-    .addReg(CopyReg);
+        .add(MI.getOperand(1)); // Saved EXEC
 
   MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
 
@@ -386,16 +378,13 @@
   LIS->RemoveMachineInstrFromMaps(MI);
   MI.eraseFromParent();
 
-  LIS->InsertMachineInstrInMaps(*CopyExec);
   LIS->InsertMachineInstrInMaps(*OrSaveExec);
 
   LIS->InsertMachineInstrInMaps(*Xor);
   LIS->InsertMachineInstrInMaps(*Branch);
 
-  // src reg is tied to dst reg.
   LIS->removeInterval(DstReg);
   LIS->createAndComputeVirtRegInterval(DstReg);
-  LIS->createAndComputeVirtRegInterval(CopyReg);
 
   if (ExecModified)
     LIS->createAndComputeVirtRegInterval(SaveReg);
Index: llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
+++ llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
@@ -484,21 +484,20 @@
     ; GCN: S_BRANCH %bb.2
    ; GCN: bb.2:
     ; GCN: successors: %bb.3(0x40000000), %bb.6(0x40000000)
-    ; GCN: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_XOR_B64_]]
-    ; GCN: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 [[COPY1]], implicit-def $exec, implicit-def $scc, implicit $exec
+    ; GCN: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 [[S_XOR_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
     ; GCN: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
     ; GCN: S_CBRANCH_EXECZ %bb.6, implicit $exec
     ; GCN: bb.3:
     ; GCN: successors: %bb.3(0x40000000), %bb.4(0x40000000)
-    ; GCN: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
-    ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], undef %4:sreg_64, implicit-def dead $scc
+    ; GCN: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
+    ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %4:sreg_64, implicit-def dead $scc
     ; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_1]]
     ; GCN: S_CBRANCH_EXECZ %bb.3, implicit $exec
     ; GCN: bb.4:
     ; GCN: successors: %bb.5(0x80000000)
     ; GCN: bb.5:
     ; GCN: successors: %bb.6(0x80000000)
-    ; GCN: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
+    ; GCN: $exec = S_OR_B64 $exec, [[COPY1]], implicit-def $scc
     ; GCN: bb.6:
     ; GCN: $exec = S_OR_B64 $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
     ; GCN: S_ENDPGM 0
Index: llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -198,23 +198,23 @@
 ; VMEM: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC]], 0
 ; VMEM: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC]], 1
 
-; GCN: s_or_saveexec_b64 s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}
+; GCN: s_or_saveexec_b64 s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC:[0-9]+]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC:[0-9]+]]{{\]}}, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}
 
 ; Regular spill value restored after exec modification
 ; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload
 
 ; Spill saved exec
-; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_S_RELOAD_SAVEEXEC_LO]], [[FLOW_SAVEEXEC_LO_LANE:[0-9]+]]
-; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_S_RELOAD_SAVEEXEC_HI]], [[FLOW_SAVEEXEC_HI_LANE:[0-9]+]]
+; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC]], [[FLOW_SAVEEXEC_LO_LANE:[0-9]+]]
+; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC]], [[FLOW_SAVEEXEC_HI_LANE:[0-9]+]]
 
-; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_LO]], 0
-; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC]], s[[FLOW_S_RELOAD_SAVEEXEC_HI]], 1
+; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC]], 0
+; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC]], s[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC]], 1
 ; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 
 ; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], 0 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 
-; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}
+; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC]]{{\]}}
 ; GCN-NEXT: s_cbranch_execz [[ENDIF:BB[0-9]+_[0-9]+]]
Index: llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir
+++ llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir
@@ -100,13 +100,12 @@
     ; CHECK: bb.0:
     ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
     ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr4_sgpr5
-    ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY %2
-    ; CHECK: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_SAVEEXEC_B64 [[COPY]], implicit-def $exec, implicit-def $scc, implicit $exec
-    ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
-    ; CHECK: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5
-    ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, [[COPY1]], implicit $exec
+    ; CHECK: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5
+    ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, [[COPY]], implicit $exec
     ; CHECK: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
-    ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[COPY2]], implicit $exec
+    ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[COPY1]], implicit $exec
     ; CHECK: S_CBRANCH_EXECZ %bb.1, implicit $exec
     ; CHECK: S_BRANCH %bb.2
     ; CHECK: bb.1:
Index: llvm/test/CodeGen/PowerPC/fma-combine.ll
===================================================================
--- llvm/test/CodeGen/PowerPC/fma-combine.ll
+++ llvm/test/CodeGen/PowerPC/fma-combine.ll
@@ -239,4 +239,26 @@
   %fma1 = call reassoc nsz double @llvm.fma.f64(double %fma, double %y, double %add)
   ret double %fma1
 }
+
+define double @fma_flag_propagation(double %a) {
+; CHECK-FAST-LABEL: fma_flag_propagation:
+; CHECK-FAST:       # %bb.0: # %entry
+; CHECK-FAST-NEXT:    xssubdp 1, 1, 1
+; CHECK-FAST-NEXT:    blr
+;
+; CHECK-FAST-NOVSX-LABEL: fma_flag_propagation:
+; CHECK-FAST-NOVSX:       # %bb.0: # %entry
+; CHECK-FAST-NOVSX-NEXT:    fsub 1, 1, 1
+; CHECK-FAST-NOVSX-NEXT:    blr
+;
+; CHECK-LABEL: fma_flag_propagation:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xssubdp 1, 1, 1
+; CHECK-NEXT:    blr
+entry:
+  %0 = fneg double %a
+  %1 = call reassoc nnan double @llvm.fma.f64(double %0, double 1.0, double %a)
+  ret double %1
+}
+
 declare double @llvm.fma.f64(double, double, double) nounwind readnone