Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -104,6 +104,9 @@
       new ScheduleDAGMILive(C, make_unique<GenericScheduler>(C));
   const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(DAG->TII);
+
+  DAG->addMutation(createCopyConstrainDAGMutation(TII, DAG->TRI));
+
   if (TII->enableClusterLoads())
     DAG->addMutation(createLoadClusterDAGMutation(TII, DAG->TRI));
Index: test/CodeGen/AMDGPU/kernel-args.ll
===================================================================
--- test/CodeGen/AMDGPU/kernel-args.ll
+++ test/CodeGen/AMDGPU/kernel-args.ll
@@ -11,10 +11,10 @@
 ; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
 ; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
 ; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
-; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
-; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
+; HSA-VI-DAG: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
+; HSA-VI-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
 ; FIXME: Should be using s_load_dword
-; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
+; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]
 define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
 entry:
@@ -29,8 +29,8 @@
 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
 ; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
 ; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
-; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
-; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
+; HSA-VI-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
+; HSA-VI-DAG: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
 ; FIXME: Should be using s_load_dword
 ; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
@@ -47,8 +47,8 @@
 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
 ; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
 ; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
-; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
-; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
+; HSA-VI-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
+; HSA-VI-DAG: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
 ; FIXME: Should be using s_load_dword
 ; HSA-VI: flat_load_sbyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
@@ -66,8 +66,8 @@
 ; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
 ; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
 ; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
-; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
-; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
+; HSA-VI-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
+; HSA-VI-DAG: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
 ; FIXME: Should be using s_load_dword
 ; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
@@ -84,8 +84,8 @@
 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
 ; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
 ; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
-; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
-; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
+; HSA-VI-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
+; HSA-VI-DAG: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
 ; FIXME: Should be using s_load_dword
 ; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
@@ -102,8 +102,8 @@
 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
 ; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
 ; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
-; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
-; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
+; HSA-VI-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
+; HSA-VI-DAG: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
 ; FIXME: Should be using s_load_dword
 ; HSA-VI: flat_load_sshort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
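
Reviewer note (not part of the patch): createCopyConstrainDAGMutation installs the scheduler's CopyConstrain mutation from lib/CodeGen/MachineScheduler.cpp, which adds weak DAG edges that keep copies to and from physical registers next to their single use or def, so the register allocator can coalesce the copies away. Weak edges only bias the scheduler, so independent instructions such as the paired v_mov_b32_e32 above may legally come out in either order, hence the switch from HSA-VI to HSA-VI-DAG checks. Below is a compilable sketch of the hook the first hunk touches; the function name is hypothetical, and the GenericScheduler strategy plus the AMDGPU-internal SIInstrInfo.h include are assumptions about the surrounding code, not part of this change.

#include "SIInstrInfo.h"                    // assumed: built inside lib/Target/AMDGPU
#include "llvm/ADT/STLExtras.h"             // llvm::make_unique
#include "llvm/CodeGen/MachineScheduler.h"  // scheduler + mutation factories

using namespace llvm;

// Hypothetical name; the body mirrors the AMDGPUTargetMachine.cpp hunk above.
static ScheduleDAGInstrs *createGCNSchedulerSketch(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
      new ScheduleDAGMILive(C, make_unique<GenericScheduler>(C));
  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(DAG->TII);
  // Keep physreg copies adjacent to their use/def (weak edges only).
  DAG->addMutation(createCopyConstrainDAGMutation(TII, DAG->TRI));
  // Cluster adjacent loads when the subtarget profits from it.
  if (TII->enableClusterLoads())
    DAG->addMutation(createLoadClusterDAGMutation(TII, DAG->TRI));
  return DAG;
}
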
Index: test/CodeGen/AMDGPU/or.ll
===================================================================
--- test/CodeGen/AMDGPU/or.ll
+++ test/CodeGen/AMDGPU/or.ll
@@ -96,10 +96,10 @@
 ; SI-NOT: or_b32
 ; SI: s_or_b32 s[[VAL_LO]], s[[VAL_LO]], 63
 ; SI-NOT: or_b32
-; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[VAL_LO]]
-; SI-NOT: or_b32
 ; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]]
 ; SI-NOT: or_b32
+; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[VAL_LO]]
+; SI-NOT: or_b32
 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
 define void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
   %or = or i64 %a, 63
Index: test/CodeGen/AMDGPU/sad.ll
===================================================================
--- test/CodeGen/AMDGPU/sad.ll
+++ test/CodeGen/AMDGPU/sad.ll
@@ -134,8 +134,8 @@
 ; GCN-LABEL: {{^}}v_sad_u32_multi_use_select_pat2:
 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 define void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %sub0 = sub i32 %a, %b
Index: test/CodeGen/AMDGPU/select-vectors.ll
===================================================================
--- test/CodeGen/AMDGPU/select-vectors.ll
+++ test/CodeGen/AMDGPU/select-vectors.ll
@@ -93,13 +93,13 @@
 ; SI-DAG: s_load_dwordx2 s{{\[}}[[ALO:[0-9]+]]:[[AHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
 ; SI-DAG: s_load_dwordx2 s{{\[}}[[BLO:[0-9]+]]:[[BHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
-; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]]
-; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]]
 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]]
+; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
+; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]]
 ; SI-DAG: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
+; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]]
 ; SI: v_cndmask_b32_e32
-; SI: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
 ; SI: v_cndmask_b32_e32
 ; SI: buffer_store_dwordx2
 define void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind {
Index: test/CodeGen/AMDGPU/trunc.ll
===================================================================
--- test/CodeGen/AMDGPU/trunc.ll
+++ test/CodeGen/AMDGPU/trunc.ll
@@ -35,11 +35,12 @@
 ; SI: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
 ; SI: s_lshl_b64 s{{\[}}[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_SREG]]:{{[0-9]+\]}}, 2
 ; SI: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]],
-; SI: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
 ; SI: s_addc_u32
-; SI: buffer_store_dword v[[LO_VREG]],
-; SI: v_mov_b32_e32
+; SI: v_mov_b32_e32 v[[LO_VREG0:[0-9]+]], s[[LO_SREG2]]
+; SI: v_mov_b32_e32 v[[LO_VREG1:[0-9]+]], s[[LO_SREG2]]
 ; SI: v_mov_b32_e32
+; SI: buffer_store_dword v[[LO_VREG1]],
+; SI: buffer_store_dwordx2 v{{\[}}[[LO_VREG0]]:
 define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
   %aa = add i64 %a, 234 ; Prevent shrinking store.
   %b = shl i64 %aa, 2
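
Reviewer note (not part of the patch): both factories used above return implementations of the small ScheduleDAGMutation interface; ScheduleDAGMI::postprocessDAG() runs every registered mutation in registration order, after buildSchedGraph() and before the first node is picked, so the copy-constrain mutation added in this patch runs before the load-cluster one. A hypothetical no-op subclass, only to show the contract:

#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/CodeGen/ScheduleDAGMutation.h"

namespace {
// Hypothetical example; the in-tree CopyConstrain and LoadClusterMutation
// classes in lib/CodeGen/MachineScheduler.cpp implement this interface.
struct NoOpMutation : llvm::ScheduleDAGMutation {
  // Called once per scheduling region, after the dependence DAG is built.
  void apply(llvm::ScheduleDAGInstrs *DAG) override {
    // A real mutation walks DAG->SUnits and adds edges, for example
    //   DAG->addEdge(&Succ, llvm::SDep(&Pred, llvm::SDep::Weak));
    // a mutation may add dependencies but must not drop required ones.
    (void)DAG; // no-op on purpose
  }
};
} // end anonymous namespace
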
Index: test/CodeGen/AMDGPU/xor.ll
===================================================================
--- test/CodeGen/AMDGPU/xor.ll
+++ test/CodeGen/AMDGPU/xor.ll
@@ -206,10 +206,11 @@
 ; SI-NOT: xor_b32
 ; SI: s_xor_b32 s[[VAL_LO]], s[[VAL_LO]], 63
 ; SI-NOT: xor_b32
-; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[VAL_LO]]
-; SI-NOT: xor_b32
 ; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]]
 ; SI-NOT: xor_b32
+; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[VAL_LO]]
+
+; SI-NOT: xor_b32
 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
 define void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
   %or = xor i64 %a, 63
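
Reviewer note (not part of the patch): on the test updates as a whole, FileCheck's -DAG variants (HSA-VI-DAG, SI-DAG, GCN-DAG) allow a group of adjacent directives to match in any relative order between the surrounding plain CHECK lines, which is exactly the freedom the new mutation exercises when it reorders independent moves. The order-sensitive anchors that matter for correctness, such as the flat_load and buffer_store lines consuming the moved values, are left as plain checks. Each updated file can be re-run individually through LLVM's lit runner to confirm the new patterns.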