Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -755,6 +755,45 @@
   }
 };
 
+struct FixBundleLatencyMutation : ScheduleDAGMutation {
+  const SIInstrInfo *TII;
+
+  const TargetSchedModel *TSchedModel;
+
+  FixBundleLatencyMutation(const SIInstrInfo *tii) : TII(tii) {}
+
+  unsigned computeLatency(const MachineInstr &MI, unsigned Reg) const {
+    const SIRegisterInfo &TRI = TII->getRegisterInfo();
+    MachineBasicBlock::const_instr_iterator I(MI.getIterator());
+    MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
+    unsigned Lat = 0;
+    for (++I; I != E && I->isBundledWithPred(); ++I) {
+      if (!I->modifiesRegister(Reg, &TRI))
+        continue;
+      Lat = TSchedModel->computeInstrLatency(&*I);
+      break;
+    }
+    return Lat;
+  }
+
+  void apply(ScheduleDAGInstrs *DAGInstrs) override {
+    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
+    TSchedModel = DAGInstrs->getSchedModel();
+    if (!TSchedModel || DAG->SUnits.empty())
+      return;
+
+    for (SUnit &SU : DAG->SUnits) {
+      if (!SU.isInstr() || !SU.getInstr()->isBundle())
+        continue;
+      for (SDep &Dep : SU.Succs) {
+        if (Dep.getKind() == SDep::Kind::Data && Dep.getReg())
+          if (unsigned Lat = computeLatency(*SU.getInstr(), Dep.getReg()))
+            Dep.setLatency(Lat);
+      }
+    }
+  }
+};
+
 struct FillMFMAShadowMutation : ScheduleDAGMutation {
   const SIInstrInfo *TII;
 
@@ -881,6 +920,7 @@
 
 void GCNSubtarget::getPostRAMutations(
     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
+  Mutations.push_back(std::make_unique<FixBundleLatencyMutation>(&InstrInfo));
   Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo));
   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
 }
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1042,6 +1042,10 @@
                                       int FrameIndex,
                                       LiveIntervals *LIS = nullptr,
                                       VirtRegMap *VRM = nullptr) const override;
+
+  unsigned getInstrLatency(const InstrItineraryData *ItinData,
+                           const MachineInstr &MI,
+                           unsigned *PredCost) const override;
 };
 
 /// \brief Returns true if a reg:subreg pair P has a TRC class
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6625,3 +6625,20 @@
 
   return nullptr;
 }
+
+unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                      const MachineInstr &MI,
+                                      unsigned *PredCost) const {
+  if (MI.isBundle()) {
+    MachineBasicBlock::const_instr_iterator I(MI.getIterator());
+    MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
+    unsigned Lat = 0, Count = 0;
+    for (++I; I != E && I->isBundledWithPred(); ++I) {
+      ++Count;
+      Lat = std::max(Lat, getInstrLatency(ItinData, *I, PredCost));
+    }
+    return Lat + Count - 1;
+  }
+
+  return AMDGPUGenInstrInfo::getInstrLatency(ItinData, MI, PredCost);
+}
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP,GFX10 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP,GFX10 %s
 
 ; Make sure the op is emitted bundled with a waitcnt with and without the retry loop, and the bundle is not removed by ExpandPostRAPseudos.
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=postrapseudos -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=MIR %s
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s
 
 ; Minimum offset
 ; GCN-LABEL: {{^}}gws_init_offset0:
Index: llvm/test/CodeGen/AMDGPU/min.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/min.ll
+++ llvm/test/CodeGen/AMDGPU/min.ll
@@ -404,7 +404,7 @@
 ; FUNC-LABEL: @v_test_umin_ult_i16_multi_use
 ; GCN-NOT: v_min
 ; GCN: v_cmp_lt_u32
-; GCN-NEXT: v_cndmask_b32
+; GCN: v_cndmask_b32
 ; GCN-NOT: v_min
 ; GCN: s_endpgm
 
Index: llvm/test/CodeGen/AMDGPU/misched-killflags.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/misched-killflags.mir
+++ llvm/test/CodeGen/AMDGPU/misched-killflags.mir
@@ -35,12 +35,12 @@
 # CHECK-DAG: $sgpr8 = S_MOV_B32 3
 # CHECK-DAG: $sgpr33 = S_MOV_B32 $sgpr7
 # CHECK: $vgpr0 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr8_sgpr9_sgpr10_sgpr11
-# CHECK: $sgpr32 = S_MOV_B32 $sgpr33
 # CHECK: BUNDLE implicit-def $sgpr6_sgpr7, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $scc {
 # CHECK:   $sgpr6_sgpr7 = S_GETPC_B64
 # CHECK:   $sgpr6 = S_ADD_U32 internal $sgpr6, 0, implicit-def $scc
 # CHECK:   $sgpr7 = S_ADDC_U32 internal $sgpr7, 0, implicit-def $scc, implicit internal $scc
 # CHECK: }
+# CHECK: $sgpr32 = S_MOV_B32 $sgpr33
 # CHECK: $sgpr4 = S_MOV_B32 killed $sgpr33
 # CHECK: $vgpr1 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
 # CHECK: $vgpr2 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
Index: llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -199,10 +199,10 @@
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s34 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_writelane_b32 v35, s36, 0
-; GFX9-NEXT: v_writelane_b32 v35, s37, 1
 ; GFX9-NEXT: s_getpc_b64 s[4:5]
 ; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
 ; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+4
+; GFX9-NEXT: v_writelane_b32 v35, s37, 1
 ; GFX9-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0
 ; GFX9-NEXT: v_mov_b32_e32 v32, v1
 ; GFX9-NEXT: v_mov_b32_e32 v33, v0
Index: llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -205,7 +205,7 @@
 ; GCN-NOT: and
 ; GCN-NOT: lshr
 ; GCN: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]],
-; GCN-NEXT: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
+; GCN: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
 ; GCN-NEXT: buffer_store_dword v[[HI]]
 define amdgpu_kernel void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
 entry:
Index: llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
+++ llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
@@ -204,8 +204,8 @@
 ; GCN: ds_read_u16 [[SCALAR1:v[0-9]+]]
 
 ; FIXME: Remove and
-; GCN: v_and_b32_e32 [[SCALAR0]], 0xffff, [[SCALAR0]]
-; GCN: v_xor_b32_e32 [[SCALAR1]], 0x8000, [[SCALAR1]]
+; GCN-DAG: v_and_b32_e32 [[SCALAR0]], 0xffff, [[SCALAR0]]
+; GCN-DAG: v_xor_b32_e32 [[SCALAR1]], 0x8000, [[SCALAR1]]
 ; GCN: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[SCALAR1]], 16, [[SCALAR0]]
 
 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]]{{$}}
@@ -660,7 +660,7 @@
 ; GCN-NOT: _or
 
 ; GCN: v_pk_add_f16 [[FADD:v[0-9]+]]
-; GCN-NEXT: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
 define amdgpu_kernel void @mix_elt_types_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
Index: llvm/test/CodeGen/AMDGPU/scratch-simple.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/scratch-simple.ll
+++ llvm/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -22,9 +22,9 @@
 ; GFX9-DAG: s_mov_b32 s7, 0xe00000
 ; GFX10_W32-DAG: s_mov_b32 s7, 0x31c16000
 ; GFX10_W64-DAG: s_mov_b32 s7, 0x31e16000
-; GCN-NOT: s_mov_b32 s0
 ; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
 ; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
+; GCN-NOT: s_mov_b32 s0
 ; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], 0x200, [[CLAMP_IDX]]
 ; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], 0x400, [[CLAMP_IDX]]
 
Index: llvm/test/CodeGen/AMDGPU/selectcc-opt.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/selectcc-opt.ll
+++ llvm/test/CodeGen/AMDGPU/selectcc-opt.ll
@@ -69,7 +69,7 @@
 
 ; FUNC-LABEL: {{^}}selectcc_bool:
 ; SI: v_cmp_ne_u32
-; SI-NEXT: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
 ; SI-NOT: cmp
 ; SI-NOT: cndmask
 define amdgpu_kernel void @selectcc_bool(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
Index: llvm/test/CodeGen/AMDGPU/setcc-opt.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/setcc-opt.ll
+++ llvm/test/CodeGen/AMDGPU/setcc-opt.ll
@@ -5,7 +5,7 @@
 ; FUNC-LABEL: {{^}}sext_bool_icmp_eq_0:
 ; GCN-NOT: v_cmp
 ; GCN: v_cmp_ne_u32_e32 vcc,
-; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN-NEXT:buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
 
@@ -22,7 +22,7 @@
 ; FUNC-LABEL: {{^}}sext_bool_icmp_ne_0:
 ; GCN-NOT: v_cmp
 ; GCN: v_cmp_ne_u32_e32 vcc,
-; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
 
@@ -39,7 +39,7 @@
 ; FUNC-LABEL: {{^}}sext_bool_icmp_eq_neg1:
 ; GCN-NOT: v_cmp
 ; GCN: v_cmp_eq_u32_e32 vcc,
-; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
@@ -53,7 +53,7 @@
 ; FUNC-LABEL: {{^}}sext_bool_icmp_ne_neg1:
 ; GCN-NOT: v_cmp
 ; GCN: v_cmp_eq_u32_e32 vcc,
-; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
@@ -67,7 +67,7 @@
 ; FUNC-LABEL: {{^}}zext_bool_icmp_eq_0:
 ; GCN-NOT: v_cmp
 ; GCN: v_cmp_ne_u32_e32 vcc,
-; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
@@ -81,7 +81,7 @@
 ; FUNC-LABEL: {{^}}zext_bool_icmp_ne_0:
 ; GCN-NOT: v_cmp
 ; GCN: v_cmp_ne_u32_e32 vcc,
-; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
@@ -95,7 +95,7 @@
 ; FUNC-LABEL: {{^}}zext_bool_icmp_eq_1:
 ; GCN-NOT: v_cmp
 ; GCN: v_cmp_eq_u32_e32 vcc,
-; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
@@ -109,7 +109,7 @@
 ; FUNC-LABEL: {{^}}zext_bool_icmp_ne_1:
 ; GCN-NOT: v_cmp
 ; GCN: v_cmp_eq_u32_e32 vcc,
-; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 define amdgpu_kernel void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
Index: llvm/test/CodeGen/AMDGPU/sint_to_fp.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/sint_to_fp.ll
+++ llvm/test/CodeGen/AMDGPU/sint_to_fp.ll
@@ -78,7 +78,7 @@
 
 ; FUNC-LABEL: {{^}}s_sint_to_fp_i1_f32:
 ; SI: v_cmp_eq_u32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]]
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
 define amdgpu_kernel void @s_sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) #0 {
Index: llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
+++ llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
@@ -59,7 +59,7 @@
 ; GCN-LABEL: {{^}}max_10_vgprs_used_9a:
 ; GFX908-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
 ; GFX908-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
-; GFX908: v_accvgpr_write_b32 a9, v{{[0-9]}}
+; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}}
 ; GFX908: buffer_store_dword v{{[0-9]}},
 ; GFX908-NOT: buffer_
 ; GFX908: v_accvgpr_read_b32 v{{[0-9]}}, a9
Index: llvm/test/CodeGen/AMDGPU/sub.i16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/sub.i16.ll
+++ llvm/test/CodeGen/AMDGPU/sub.i16.ll
@@ -108,7 +108,7 @@
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
-; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
+; VI: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: buffer_store_dword [[SEXT]]
 define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -129,7 +129,7 @@
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
 ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
-; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+; VI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
 ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
 define amdgpu_kernel void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
Index: llvm/test/CodeGen/AMDGPU/uint_to_fp.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/uint_to_fp.ll
+++ llvm/test/CodeGen/AMDGPU/uint_to_fp.ll
@@ -78,7 +78,7 @@
 
 ; FUNC-LABEL: {{^}}s_uint_to_fp_i1_to_f32:
 ; SI: v_cmp_eq_u32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]]
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
 define amdgpu_kernel void @s_uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) #0 {
Index: llvm/test/CodeGen/AMDGPU/wave32.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/wave32.ll
+++ llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -491,10 +491,10 @@
 }
 
 ; GCN-LABEL: {{^}}test_br_cc_f16:
-; GFX1032: v_cmp_nlt_f16_e32 vcc_lo,
-; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
-; GFX1064: v_cmp_nlt_f16_e32 vcc,
-; GFX1064-NEXT: s_and_b64 vcc, exec, vcc{{$}}
+; GFX1032: v_cmp_nlt_f16_e32 vcc_lo,
+; GFX1032: s_and_b32 vcc_lo, exec_lo, vcc_lo
+; GFX1064: v_cmp_nlt_f16_e32 vcc,
+; GFX1064: s_and_b64 vcc, exec, vcc{{$}}
 ; GCN-NEXT: s_cbranch_vccnz
 define amdgpu_kernel void @test_br_cc_f16(
     half addrspace(1)* %r,
Index: llvm/test/CodeGen/AMDGPU/zero_extend.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/zero_extend.ll
+++ llvm/test/CodeGen/AMDGPU/zero_extend.ll
@@ -36,9 +36,9 @@
 }
 
 ; GCN-LABEL: {{^}}s_cmp_zext_i1_to_i64:
-; GCN: s_mov_b32 s{{[0-9]+}}, 0
-; GCN: v_cmp_eq_u32
-; GCN: v_cndmask_b32
+; GCN-DAG: s_mov_b32 s{{[0-9]+}}, 0
+; GCN-DAG: v_cmp_eq_u32
+; GCN: v_cndmask_b32
 define amdgpu_kernel void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %cmp = icmp eq i32 %a, %b
   %ext = zext i1 %cmp to i64