Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -202,6 +202,7 @@
   GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
     : AMDGPUPassConfig(TM, PM) { }
   bool addPreISel() override;
+  void addMachineSSAOptimization() override;
   bool addInstSelector() override;
   void addFastRegAlloc(FunctionPass *RegAllocPass) override;
   void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
@@ -319,11 +320,24 @@
   return false;
 }

+void GCNPassConfig::addMachineSSAOptimization() {
+  TargetPassConfig::addMachineSSAOptimization();
+
+  // We want to fold operands after PeepholeOptimizer has run (or as part of
+  // it), because it will eliminate extra copies, making it easier to fold the
+  // real source operand. We want to eliminate dead instructions after, so that
+  // we see fewer uses of the copies. We then need to clean up the dead
+  // instructions left over after the operands are folded as well.
+  //
+  // XXX - Can we get away without running DeadMachineInstructionElim again?
+  addPass(&SIFoldOperandsID);
+  addPass(&DeadMachineInstructionElimID);
+}
+
 bool GCNPassConfig::addInstSelector() {
   AMDGPUPassConfig::addInstSelector();
   addPass(createSILowerI1CopiesPass());
   addPass(&SIFixSGPRCopiesID);
-  addPass(createSIFoldOperandsPass());
   return false;
 }
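Not part of the patch, but a rough way for reviewers to see where the fold pass now sits is to dump the pre-RA pipeline and check that SI Fold Operands comes after the peephole optimizer and is followed by another dead-instruction cleanup. This is a sketch only: the printed pass names are taken from the passes' descriptions and may differ by LLVM revision, and the probe function name is made up.

; RUN: llc -march=amdgcn -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=ORDER %s
; ORDER: Peephole Optimizations
; ORDER: SI Fold Operands
; ORDER: Remove dead machine instructions
define void @pipeline_probe() {
  ret void
}
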
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1058,6 +1058,8 @@
   MI.RemoveOperand(Src0ModIdx);
 }

+// TODO: Maybe this should be removed and everything custom folded in
+// SIFoldOperands?
 bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
                                 unsigned Reg, MachineRegisterInfo *MRI) const {
   if (!MRI->hasOneNonDBGUse(Reg))
@@ -1073,6 +1075,14 @@
       return false;
     }

+    const MachineOperand &ImmOp = DefMI->getOperand(1);
+
+    // If this is a free constant, there's no reason to do this.
+    // TODO: We could fold this here instead of letting SIFoldOperands do it
+    // later.
+    if (isInlineConstant(ImmOp, 4))
+      return false;
+
     MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0);
     MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1);
     MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2);
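The isInlineConstant() early-out above is what drives most of the test updates that follow: a value such as 2.0 or a small integer is free to encode as an inline constant, so FoldImmediate gains nothing by folding it, and SIFoldOperands later places it as the second source operand. A reduced illustration in the style of the imm.ll tests below (hypothetical test, not part of the patch; the check line mirrors the updated expectations):

; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s

; The inline immediate 2.0 is left for SIFoldOperands, which places it in src1.
; SI-LABEL: {{^}}fold_free_inline_imm_example:
; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 2.0
define void @fold_free_inline_imm_example(double addrspace(1)* %out, double %x) {
  %y = fadd double %x, 2.0
  store double %y, double addrspace(1)* %out
  ret void
}
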
Index: test/CodeGen/AMDGPU/commute_modifiers.ll
===================================================================
--- test/CodeGen/AMDGPU/commute_modifiers.ll
+++ test/CodeGen/AMDGPU/commute_modifiers.ll
@@ -159,7 +159,7 @@
 ; SI-LABEL: {{^}}fma_a_2.0_neg_b_f32
 ; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], |[[R2]]|
+; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, |[[R2]]|
 ; SI: buffer_store_dword [[RESULT]]
 define void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
Index: test/CodeGen/AMDGPU/fma.ll
===================================================================
--- test/CodeGen/AMDGPU/fma.ll
+++ test/CodeGen/AMDGPU/fma.ll
@@ -61,7 +61,7 @@
 }

 ; FUNC-LABEL: @fma_commute_mul_inline_imm_f32
-; SI: v_fma_f32 {{v[0-9]+}}, 2.0, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, 2.0, {{v[0-9]+}}
 define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
   %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
   %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
Index: test/CodeGen/AMDGPU/fmed3.ll
===================================================================
--- test/CodeGen/AMDGPU/fmed3.ll
+++ test/CodeGen/AMDGPU/fmed3.ll
@@ -96,8 +96,8 @@
 }

 ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f64:
-; GCN: v_max_f64 {{v\[[0-9]+:[0-9]+\]}}, 2.0, {{v\[[0-9]+:[0-9]+\]}}
-; GCN: v_min_f64 {{v\[[0-9]+:[0-9]+\]}}, 4.0, {{v\[[0-9]+:[0-9]+\]}}
+; GCN: v_max_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 2.0
+; GCN: v_min_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 4.0
 define void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
Index: test/CodeGen/AMDGPU/fneg.f64.ll
===================================================================
--- test/CodeGen/AMDGPU/fneg.f64.ll
+++ test/CodeGen/AMDGPU/fneg.f64.ll
@@ -39,7 +39,7 @@
 ; unless the target returns true for isNegFree()

 ; FUNC-LABEL: {{^}}fneg_free_f64:
-; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, 0, -{{s\[[0-9]+:[0-9]+\]$}}
+; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, -{{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
 define void @fneg_free_f64(double addrspace(1)* %out, i64 %in) {
   %bc = bitcast i64 %in to double
   %fsub = fsub double 0.0, %bc
Index: test/CodeGen/AMDGPU/fsub64.ll
===================================================================
--- test/CodeGen/AMDGPU/fsub64.ll
+++ test/CodeGen/AMDGPU/fsub64.ll
@@ -47,7 +47,7 @@
 }

 ; SI-LABEL: {{^}}s_fsub_imm_f64:
-; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], 4.0, -s\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -s\[[0-9]+:[0-9]+\]}}, 4.0
 define void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) {
   %sub = fsub double 4.0, %a
   store double %sub, double addrspace(1)* %out
@@ -55,7 +55,7 @@
 }

 ; SI-LABEL: {{^}}s_fsub_imm_inv_f64:
-; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -4.0, s\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\]}}, -4.0
 define void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) {
   %sub = fsub double %a, 4.0
   store double %sub, double addrspace(1)* %out
Index: test/CodeGen/AMDGPU/imm.ll
===================================================================
--- test/CodeGen/AMDGPU/imm.ll
+++ test/CodeGen/AMDGPU/imm.ll
@@ -322,7 +322,7 @@
 ; CHECK-LABEL: {{^}}add_inline_imm_0.0_f64:
 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 0, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0{{$}}
 ; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0.0
@@ -333,7 +333,7 @@
 ; CHECK-LABEL: {{^}}add_inline_imm_0.5_f64:
 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 0.5, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.5
 ; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0.5
@@ -344,7 +344,7 @@
 ; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f64:
 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -0.5, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -0.5
 ; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, -0.5
@@ -355,7 +355,7 @@
 ; CHECK-LABEL: {{^}}add_inline_imm_1.0_f64:
 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1.0
 ; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 1.0
@@ -366,7 +366,7 @@
 ; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f64:
 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -1.0, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1.0
 ; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, -1.0
@@ -377,7 +377,7 @@
 ; CHECK-LABEL: {{^}}add_inline_imm_2.0_f64:
 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 2.0, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2.0
 ; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 2.0
@@ -388,7 +388,7 @@
 ; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f64:
 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -2.0, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2.0
 ; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, -2.0
@@ -399,7 +399,7 @@
 ; CHECK-LABEL: {{^}}add_inline_imm_4.0_f64:
 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 4.0, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 4.0
 ; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 4.0
@@ -410,7 +410,7 @@
 ; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f64:
 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -4.0, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -4.0
 ; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, -4.0
@@ -422,7 +422,7 @@
 ; CHECK-LABEL: {{^}}add_inline_imm_1_f64:
 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1{{$}}
 ; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0x0000000000000001
@@ -433,7 +433,7 @@
 ; CHECK-LABEL: {{^}}add_inline_imm_2_f64:
 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 2, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2{{$}}
 ; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0x0000000000000002
@@ -444,7 +444,7 @@
 ; CHECK-LABEL: {{^}}add_inline_imm_16_f64:
 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 16, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 16
 ; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0x0000000000000010
@@ -455,7 +455,7 @@
 ; CHECK-LABEL: {{^}}add_inline_imm_neg_1_f64:
 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -1, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1
 ; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0xffffffffffffffff
@@ -466,7 +466,7 @@
 ; CHECK-LABEL: {{^}}add_inline_imm_neg_2_f64:
 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -2, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2
 ; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0xfffffffffffffffe
@@ -477,7 +477,7 @@
 ; CHECK-LABEL: {{^}}add_inline_imm_neg_16_f64:
 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -16, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -16
 ; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0xfffffffffffffff0
@@ -488,7 +488,7 @@
 ; CHECK-LABEL: {{^}}add_inline_imm_63_f64:
 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 63, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 63
 ; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0x000000000000003F
@@ -499,7 +499,7 @@
 ; CHECK-LABEL: {{^}}add_inline_imm_64_f64:
 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 64, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 64
 ; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0x0000000000000040
Index: test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s
 ; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=VI %s

 ; FIXME: Enable for VI.
@@ -45,7 +45,7 @@
 ; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
 ; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
 ; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
-; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VA]], [[VC]]
+; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
 define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
@@ -146,7 +146,7 @@
 ; SI: BB9_2:
 ; SI: s_or_b64 exec, exec, [[SAVE]]
-; SI: v_cmp_ne_i32_e32 vcc, 0, v0
+; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 ; SI: buffer_store_dword
 ; SI: s_endpgm
Index: test/CodeGen/AMDGPU/madmk.ll
===================================================================
--- test/CodeGen/AMDGPU/madmk.ll
+++ test/CodeGen/AMDGPU/madmk.ll
@@ -1,13 +1,17 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
 ; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; FIXME: None of these trigger madmk emission anymore. It is still
+; possible, but requires the correct registers to be used, which is
+; hard to trigger.

 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 declare float @llvm.fabs.f32(float) nounwind readnone

 ; GCN-LABEL: {{^}}madmk_f32:
 ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN: v_madmk_f32_e32 {{v[0-9]+}}, [[VA]], 0x41200000, [[VB]]
+; GCN: v_mac_f32_e32 [[VB]], 0x41200000, [[VA]]
 define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -182,7 +186,7 @@
 ; SI-LABEL: {{^}}kill_madmk_verifier_error:
 ; SI: s_xor_b64
-; SI: v_madmk_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, 0x472aee8c, {{v[0-9]+}}
+; SI: v_mac_f32_e32 {{v[0-9]+}}, 0x472aee8c, {{v[0-9]+}}
 ; SI: s_or_b64
 define void @kill_madmk_verifier_error() nounwind {
 bb:
Index: test/CodeGen/AMDGPU/mul.ll
===================================================================
--- test/CodeGen/AMDGPU/mul.ll
+++ test/CodeGen/AMDGPU/mul.ll
@@ -96,8 +96,8 @@
 }

 ; FUNC-LABEL: {{^}}v_mul64_sext_inline_imm:
-; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, 9, v{{[0-9]+}}
-; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, 9, v{{[0-9]+}}
+; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
+; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
 ; SI: s_endpgm
 define void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %val = load i32, i32 addrspace(1)* %in, align 4
Index: test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
===================================================================
--- test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
+++ test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
@@ -3,7 +3,7 @@
 ; register operands in the correct order when modifying the opcode of an
 ; instruction to V_ADD_I32_e32.

-; CHECK: %{{[0-9]+}} = V_ADD_I32_e32 %{{[0-9]+}}, %{{[0-9]+}}, implicit-def %vcc, implicit %exec
+; CHECK: %{{[0-9]+}} = V_ADD_I32_e32 killed %{{[0-9]+}}, killed %{{[0-9]+}}, implicit-def %vcc, implicit %exec
 define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
Index: test/CodeGen/AMDGPU/sint_to_fp.f64.ll
===================================================================
--- test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -10,14 +10,14 @@
   ret void
 }

-; SI-LABEL: {{^}}sint_to_fp_i1_f64:
-; SI: v_cmp_eq_i32_e64 vcc,
 ; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
 ; uses an SGPR (implicit vcc).
+
+; SI-LABEL: {{^}}sint_to_fp_i1_f64:
+; SI-DAG: v_cmp_eq_i32_e64 vcc,
 ; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, v{{[0-9]+}}
 ; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; SI: buffer_store_dwordx2 v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
-
 ; SI: s_endpgm
 define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
   %cmp = icmp eq i32 %in, 0
Index: test/CodeGen/AMDGPU/sra.ll
===================================================================
--- test/CodeGen/AMDGPU/sra.ll
+++ test/CodeGen/AMDGPU/sra.ll
@@ -230,9 +230,8 @@
 ; GCN-LABEL: {{^}}s_ashr_63_i64:
 ; GCN-DAG: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
 ; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31
-; GCN-DAG: s_mov_b32 s[[COPYSHIFT:[0-9]+]], s[[SHIFT]]
-; GCN-DAG: s_add_u32 {{s[0-9]+}}, s[[HI]], {{s[0-9]+}}
-; GCN: s_addc_u32 {{s[0-9]+}}, s[[COPYSHIFT]], {{s[0-9]+}}
+; GCN: s_add_u32 {{s[0-9]+}}, s[[HI]], {{s[0-9]+}}
+; GCN: s_addc_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}}
 define void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %result = ashr i64 %a, 63
   %add = add i64 %result, %b
Index: test/CodeGen/AMDGPU/uint_to_fp.f64.ll
===================================================================
--- test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -70,10 +70,11 @@
   ret void
 }

-; SI-LABEL: {{^}}uint_to_fp_i1_to_f64:
-; SI: v_cmp_eq_i32_e64 vcc
 ; We can't fold the SGPRs into v_cndmask_b32_e32, because it already
 ; uses an SGPR (implicit vcc).
+
+; SI-LABEL: {{^}}uint_to_fp_i1_to_f64:
+; SI-DAG: v_cmp_eq_i32_e64 vcc
 ; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, v{{[0-9]+}}
 ; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; SI: buffer_store_dwordx2 v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
Index: test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
===================================================================
--- test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s

 declare float @llvm.fma.f32(float, float, float) #1
@@ -107,7 +107,7 @@
 ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_imm_a:
 ; GCN: s_load_dword [[SGPR:s[0-9]+]]
-; GCN: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], 2.0, [[SGPR]]
 ; GCN: buffer_store_dword [[RESULT]]
 define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1
@@ -227,7 +227,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VK0:v[0-9]+]], 0x44800000
 ; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], [[SGPR1]]
-; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK0]]
+; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VS1]], [[SGPR0]], [[VK0]]
 ; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x45800000
 ; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK1]]
@@ -254,7 +254,7 @@
 ; Same zero component is re-used for half of each immediate.
 ; GCN: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], 0x40b00000
-; GCN: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, [[SGPR0]], v{{\[}}[[VZERO]]:[[VK1_SUB1]]{{\]}}
+; GCN: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[}}[[VZERO]]:[[VK1_SUB1]]{{\]}}
 ; GCN: buffer_store_dwordx2 [[RESULT0]]
 ; GCN: buffer_store_dwordx2 [[RESULT1]]
Index: test/CodeGen/AMDGPU/v_mac.ll
===================================================================
--- test/CodeGen/AMDGPU/v_mac.ll
+++ test/CodeGen/AMDGPU/v_mac.ll
@@ -24,7 +24,7 @@
 ; GCN-LABEL: {{^}}mad_inline_sgpr_inline:
 ; GCN-NOT: v_mac_f32
-; GCN: v_mad_f32 v{{[0-9]}}, 0.5, s{{[0-9]+}}, 0.5
+; GCN: v_mad_f32 v{{[0-9]}}, s{{[0-9]+}}, 0.5, 0.5
 define void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) {
 entry:
   %tmp0 = fmul float 0.5, %in