Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -883,6 +883,10 @@ /// subtarget's specifications, or does not meet number of waves per execution /// unit requirement. unsigned getMaxNumVGPRs(const MachineFunction &MF) const; + + void getPostRAMutations( + std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) + const override; }; } // end namespace llvm Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -524,3 +524,57 @@ return MaxNumVGPRs - getReservedNumVGPRs(MF); } + +struct MemOpClusterMutation : ScheduleDAGMutation { + const SIInstrInfo *TII; + + MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {} + + void apply(ScheduleDAGInstrs *DAGInstrs) override { + ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); + + SUnit *SUa = nullptr; + // Search for two consecutive memory operations and link them + // to prevent scheduler from moving them apart. + // In DAG pre-process SUnits are in the original order of + // the instructions before scheduling. 
+ for (SUnit &SU : DAG->SUnits) { + MachineInstr &MI2 = *SU.getInstr(); + if (!MI2.mayLoad() && !MI2.mayStore()) { + SUa = nullptr; + continue; + } + if (!SUa) { + SUa = &SU; + continue; + } + + MachineInstr &MI1 = *SUa->getInstr(); + if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) || + (TII->isFLAT(MI1) && TII->isFLAT(MI2)) || + (TII->isSMRD(MI1) && TII->isSMRD(MI2)) || + (TII->isDS(MI1) && TII->isDS(MI2))) { + SU.addPredBarrier(SUa); + + for (const SDep &SI : SU.Preds) { + if (SI.getSUnit() != SUa) + SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial)); + } + + if (&SU != &DAG->ExitSU) { + for (const SDep &SI : SUa->Succs) { + if (SI.getSUnit() != &SU) + SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial)); + } + } + } + + SUa = &SU; + } + } +}; + +void SISubtarget::getPostRAMutations( + std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { + Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo)); +} Index: llvm/trunk/test/CodeGen/AMDGPU/and.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/and.ll +++ llvm/trunk/test/CodeGen/AMDGPU/and.ll @@ -219,10 +219,10 @@ } ; FUNC-LABEL: {{^}}s_and_multi_use_inline_imm_i64: +; SI: s_load_dwordx2 ; SI: s_load_dword [[A:s[0-9]+]] ; SI: s_load_dword [[B:s[0-9]+]] ; SI: s_load_dwordx2 -; SI: s_load_dwordx2 ; SI-NOT: and ; SI: s_lshl_b32 [[A]], [[A]], 1 ; SI: s_lshl_b32 [[B]], [[B]], 1 Index: llvm/trunk/test/CodeGen/AMDGPU/ashr.v2i16.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/ashr.v2i16.ll +++ llvm/trunk/test/CodeGen/AMDGPU/ashr.v2i16.ll @@ -11,10 +11,10 @@ ; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; CI: v_ashrrev_i32_e32 -; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} -; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, 
v{{[0-9]+}} -; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; CI-DAG: v_ashrrev_i32_e32 +; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} +; CI-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; CI: v_or_b32_e32 define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = ashr <2 x i16> %lhs, %rhs Index: llvm/trunk/test/CodeGen/AMDGPU/br_cc.f16.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/br_cc.f16.ll +++ llvm/trunk/test/CodeGen/AMDGPU/br_cc.f16.ll @@ -5,19 +5,19 @@ ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_nlt_f32_e32 vcc, v[[B_F32]], v[[A_F32]] +; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] ; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] ; GCN: s_cbranch_vccnz ; GCN: one{{$}} -; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[B_F32]] +; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_short ; GCN: s_endpgm ; GCN: two{{$}} -; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[A_F32]] +; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]] ; GCN: buffer_store_short v[[B_F16]] ; GCN: s_endpgm define amdgpu_kernel void @br_cc_f16( Index: llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll +++ llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll @@ -400,9 +400,9 @@ ; GCN-DAG: buffer_load_dwordx4 v[24:27], off ; GCN-DAG: buffer_load_dwordx4 v[28:31], off -; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32 offset:4{{$}} ; 
GCN: s_waitcnt -; GCN-NEXT: s_swappc_b64 +; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32 offset:4{{$}} +; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { %ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef @@ -452,15 +452,15 @@ ; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], s33 offset:8 ; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], s33 offset:12 -; HSA: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]] offset:4 -; HSA: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:8 +; HSA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]] offset:4 +; HSA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:8 ; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], s33 offset:8 ; MESA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], s33 offset:12 -; MESA: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]] offset:4 -; MESA: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:8 +; MESA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]] offset:4 +; MESA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:8 ; GCN-NEXT: s_swappc_b64 ; GCN-NOT: [[SP]] @@ -487,8 +487,8 @@ ; GCN-DAG: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:12 ; GCN-NOT: s_add_u32 [[SP]] -; GCN: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4 -; GCN: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:8 +; GCN-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4 +; GCN-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:8 ; GCN-NEXT: s_swappc_b64 ; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:16 ; GCN-DAG: buffer_load_dword 
[[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:20 Index: llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -327,8 +327,8 @@ ; Requires loading and storing to stack slot. ; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x: ; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; GCN: s_add_u32 s32, s32, 0x400{{$}} +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8{{$}} Index: llvm/trunk/test/CodeGen/AMDGPU/cluster-flat-loads-postra.mir =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/cluster-flat-loads-postra.mir +++ llvm/trunk/test/CodeGen/AMDGPU/cluster-flat-loads-postra.mir @@ -0,0 +1,31 @@ +# RUN: llc -march=amdgcn -mcpu=tonga -run-pass post-RA-sched -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s + +# GCN: FLAT_LOAD_DWORD +# GCN-NEXT: FLAT_LOAD_DWORD +# GCN: FLAT_STORE_DWORD +# GCN-NEXT: FLAT_STORE_DWORD + +--- +name: cluster_loads_post_ra +tracksRegLiveness: true +registers: +liveins: + - { reg: '%vgpr0' } +body: | + bb.0: + liveins: %vgpr0 + + %vgpr0_vgpr1 = IMPLICIT_DEF + %vgpr4_vgpr5 = IMPLICIT_DEF + %vgpr0 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4) + %vgpr4 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4) + %vgpr2 = IMPLICIT_DEF + %vgpr3 = IMPLICIT_DEF + %vgpr6 = IMPLICIT_DEF + %vgpr0 = V_ADD_I32_e32 16, %vgpr2, implicit-def %vcc, implicit %exec + %vgpr1 = V_ADDC_U32_e32 %vgpr3, killed %vgpr6, implicit-def dead %vcc, implicit %vcc, implicit %exec + FLAT_STORE_DWORD %vgpr2_vgpr3, killed %vgpr0, 0, 0, 0, implicit 
%exec, implicit %flat_scr :: (store 4) + FLAT_STORE_DWORD %vgpr0_vgpr1, killed %vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4) + S_ENDPGM + +... Index: llvm/trunk/test/CodeGen/AMDGPU/fcopysign.f16.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ llvm/trunk/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -12,15 +12,15 @@ declare i32 @llvm.amdgcn.workitem.id.x() ; GCN-LABEL: {{^}}test_copysign_f16: -; SI: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]] ; SI: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]] +; SI: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]] ; SI: s_brev_b32 s[[CONST:[0-9]+]], -2 ; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]] ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]] ; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_F32]] ; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]] -; GFX89: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]] ; GFX89: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]] +; GFX89: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]] ; GFX89: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff ; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN]] ; GCN: buffer_store_short v[[OUT]] Index: llvm/trunk/test/CodeGen/AMDGPU/fcopysign.f64.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/fcopysign.f64.ll +++ llvm/trunk/test/CodeGen/AMDGPU/fcopysign.f64.ll @@ -24,7 +24,8 @@ } ; FUNC-LABEL: {{^}}test_copysign_f64_f32: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} +; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN-DAG: s_load_dword s[[SSIGN:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}} ; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2{{$}} ; GCN-DAG: v_mov_b32_e32 
v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]] Index: llvm/trunk/test/CodeGen/AMDGPU/frame-index-amdgiz.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/frame-index-amdgiz.ll +++ llvm/trunk/test/CodeGen/AMDGPU/frame-index-amdgiz.ll @@ -12,8 +12,8 @@ define amdgpu_kernel void @f(i32 addrspace(1)* nocapture %a, i32 %i, i32 %j) local_unnamed_addr #0 { entry: -; CHECK: s_load_dword s2, s[0:1], 0xb ; CHECK: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; CHECK: s_load_dword s2, s[0:1], 0xb ; CHECK: s_load_dword s0, s[0:1], 0xc ; CHECK: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; CHECK: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 @@ -27,9 +27,9 @@ ; CHECK: s_lshl_b32 s0, s0, 2 ; CHECK: buffer_store_dword v2, v1, s[8:11], s3 offen ; CHECK: v_add_i32_e32 v0, vcc, s0, v0 -; CHECK: buffer_load_dword v0, v0, s[8:11], s3 offen ; CHECK: s_mov_b32 s7, 0xf000 ; CHECK: s_mov_b32 s6, -1 +; CHECK: buffer_load_dword v0, v0, s[8:11], s3 offen ; CHECK: s_waitcnt vmcnt(0) ; CHECK: buffer_store_dword v0, off, s[4:7], 0 ; CHECK: s_endpgm Index: llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -421,11 +421,11 @@ } ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: +; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7 -; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]] @@ -450,11 +450,11 @@ } ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr: +; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] ; GCN-DAG: 
v_mov_b32_e32 [[K:v[0-9]+]], 0x1234 -; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]] Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -95,8 +95,9 @@ } ; GCN-LABEL: {{^}}fmuladd_v2f16 +; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] @@ -124,11 +125,11 @@ ; VI-FLUSH-NOT: v_and_b32 ; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[R_F16_HI]] -; VI-DENORM: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; VI-DENORM: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; VI-DENORM: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]] -; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]] +; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] +; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]], v[[C_V2_F16]] +; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]], v[[C_F16_1]] ; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]] ; VI-DENORM-NOT: v_and_b32 ; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]] Index: llvm/trunk/test/CodeGen/AMDGPU/load-global-i32.ll =================================================================== --- 
llvm/trunk/test/CodeGen/AMDGPU/load-global-i32.ll +++ llvm/trunk/test/CodeGen/AMDGPU/load-global-i32.ll @@ -424,25 +424,25 @@ ; GCN-NOHSA: buffer_store_dwordx4 ; GCN-NOHSA: buffer_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 +; GCN-HSA-DAG: flat_store_dwordx4 define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in Index: llvm/trunk/test/CodeGen/AMDGPU/load-weird-sizes.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/load-weird-sizes.ll +++ llvm/trunk/test/CodeGen/AMDGPU/load-weird-sizes.ll @@ -5,8 +5,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}load_i24: -; SI: {{flat|buffer}}_load_ubyte -; SI: {{flat|buffer}}_load_ushort +; SI-DAG: {{flat|buffer}}_load_ubyte +; SI-DAG: 
{{flat|buffer}}_load_ushort ; SI: {{flat|buffer}}_store_dword define amdgpu_kernel void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 { %1 = load i24, i24 addrspace(1)* %in Index: llvm/trunk/test/CodeGen/AMDGPU/lshr.v2i16.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ llvm/trunk/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -10,9 +10,9 @@ ; VI: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; CIVI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16 -; CIVI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; CIVI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16 +; CIVI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = lshr <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out Index: llvm/trunk/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll +++ llvm/trunk/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll @@ -6,11 +6,11 @@ ; FIXME: We should be able to use the SGPR directly as src0 to v_add_i32 ; GCN-LABEL: {{^}}clobber_vgpr_pair_pointer_add: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} -; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}} +; GCN: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; GCN-NOT: v_mov_b32 ; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]] +; GCN: 
buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}} ; GCN-NOT: v_mov_b32 ; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]] ; GCN-NOT: v_mov_b32 Index: llvm/trunk/test/CodeGen/AMDGPU/salu-to-valu.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/salu-to-valu.ll +++ llvm/trunk/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -170,10 +170,10 @@ ; CI. ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x8: -; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}} -; GCN-NOHSA-NOT: v_add ; GCN-NOHSA: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}} ; GCN-NOHSA-NOT: v_add +; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}} +; GCN-NOHSA-NOT: v_add ; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}} ; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} Index: llvm/trunk/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll +++ llvm/trunk/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll @@ -2,16 +2,14 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI -check-prefix=GCN %s ; FUNC-LABEL: {{^}}cluster_arg_loads: -; FIXME: Due to changes in the load clustering heuristics. We no longer -; cluster all argument loads together on SI. 
-; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd ; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9 ; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd ; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe -; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 ; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24 -; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38 +; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38 define amdgpu_kernel void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind { store i32 %x, i32 addrspace(1)* %out0, align 4 store i32 %y, i32 addrspace(1)* %out1, align 4 Index: llvm/trunk/test/CodeGen/AMDGPU/select-vectors.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/select-vectors.ll +++ llvm/trunk/test/CodeGen/AMDGPU/select-vectors.ll @@ -215,9 +215,9 @@ ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]] ; GCN-DAG: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} -; GCN: v_cndmask_b32_e32 -; GCN: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]] -; GCN: v_cndmask_b32_e32 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]] +; GCN-DAG: v_cndmask_b32_e32 ; GCN: buffer_store_dwordx2 define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 Index: llvm/trunk/test/CodeGen/AMDGPU/select.f16.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/select.f16.ll +++ llvm/trunk/test/CodeGen/AMDGPU/select.f16.ll @@ -8,9 
+8,9 @@ ; GCN: buffer_load_ushort v[[D_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] -; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]] +; SI-DAG: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] +; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]] ; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] ; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] @@ -39,9 +39,9 @@ ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[D_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_lt_f32_e32 vcc, 0.5, v[[B_F32]] -; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] -; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]] +; SI-DAG: v_cmp_lt_f32_e32 vcc, 0.5, v[[B_F32]] +; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]] ; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] ; VI: v_cmp_lt_f16_e32 vcc, 0.5, v[[B_F16]] @@ -68,9 +68,9 @@ ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[D_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cmp_gt_f32_e32 vcc, 0.5, v[[A_F32]] -; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] -; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]] +; SI-DAG: v_cmp_gt_f32_e32 vcc, 0.5, v[[A_F32]] +; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]] ; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] Index: llvm/trunk/test/CodeGen/AMDGPU/shl.v2i16.ll 
=================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/shl.v2i16.ll +++ llvm/trunk/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -12,10 +12,10 @@ ; VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; CI: v_lshlrev_b32_e32 -; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} -; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; CI-DAG: v_lshlrev_b32_e32 +; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} +; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; CI: v_or_b32_e32 define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = shl <2 x i16> %lhs, %rhs Index: llvm/trunk/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ llvm/trunk/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -63,11 +63,11 @@ ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 ; CI: buffer_store_dword -; GFX9: global_store_dword -; GFX9: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 +; GFX9-DAG: global_store_dword +; GFX9-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 ; GFX9: s_barrier -; GFX9: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 -; GFX9: global_store_dword +; GFX9-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 +; GFX9-DAG: global_store_dword define amdgpu_kernel void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 @@ -256,11 +256,12 @@ ; CI: 
v_mov_b32 ; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CI-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}} ; CI: v_add_i32 ; CI: v_add_i32 +; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}} + ; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}} ; CI-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}} Index: llvm/trunk/test/CodeGen/AMDGPU/v_cndmask.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/v_cndmask.ll +++ llvm/trunk/test/CodeGen/AMDGPU/v_cndmask.ll @@ -122,7 +122,7 @@ ; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_vgprZ_f32: ; GCN-DAG: {{buffer|flat}}_load_dword [[Z:v[0-9]+]] ; GCN-DAG: s_load_dword [[X:s[0-9]+]] -; GCN: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0 +; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0 ; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 1.0, [[Z]], [[COND]] define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 Index: llvm/trunk/test/CodeGen/AMDGPU/wqm.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/wqm.ll +++ llvm/trunk/test/CodeGen/AMDGPU/wqm.ll @@ -473,8 +473,8 @@ ;CHECK: image_sample ;CHECK: s_and_b64 exec, exec, [[ORIG]] ;CHECK: image_sample -;CHECK: v_cmp -;CHECK: store +;CHECK-DAG: v_cmp +;CHECK-DAG: store define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) { main_body: %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, 
i1 false, i1 false, i1 false) #0