Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -382,17 +382,32 @@ FlatUsed = true; continue; + case AMDGPU::TBA: + case AMDGPU::TBA_LO: + case AMDGPU::TBA_HI: + case AMDGPU::TMA: + case AMDGPU::TMA_LO: + case AMDGPU::TMA_HI: + llvm_unreachable("Trap Handler registers should not be used"); + continue; + default: break; } if (AMDGPU::SReg_32RegClass.contains(reg)) { + if (AMDGPU::TTMP_32RegClass.contains(reg)) { + llvm_unreachable("Trap Handler registers should not be used"); + } isSGPR = true; width = 1; } else if (AMDGPU::VGPR_32RegClass.contains(reg)) { isSGPR = false; width = 1; } else if (AMDGPU::SReg_64RegClass.contains(reg)) { + if (AMDGPU::TTMP_64RegClass.contains(reg)) { + llvm_unreachable("Trap Handler registers should not be used"); + } isSGPR = true; width = 2; } else if (AMDGPU::VReg_64RegClass.contains(reg)) { Index: lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp =================================================================== --- lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -551,8 +551,10 @@ } -static int getRegClass(bool IsVgpr, unsigned RegWidth) { - if (IsVgpr) { +enum RegisterKind { IS_VGPR, IS_SGPR, IS_TTMP }; + +static int getRegClass(RegisterKind Is, unsigned RegWidth) { + if (Is == IS_VGPR) { switch (RegWidth) { default: return -1; case 1: return AMDGPU::VGPR_32RegClassID; @@ -562,16 +564,23 @@ case 8: return AMDGPU::VReg_256RegClassID; case 16: return AMDGPU::VReg_512RegClassID; } + } else if (Is == IS_TTMP) { + switch (RegWidth) { + default: return -1; + case 1: return AMDGPU::TTMP_32RegClassID; + case 2: return AMDGPU::TTMP_64RegClassID; + } + } else if (Is == IS_SGPR) { + switch (RegWidth) { + default: return -1; + case 1: return AMDGPU::SGPR_32RegClassID; + case 2: return AMDGPU::SGPR_64RegClassID; + case 4: return AMDGPU::SReg_128RegClassID; + case 8: return AMDGPU::SReg_256RegClassID; + case 16: return AMDGPU::SReg_512RegClassID; + } } - - switch (RegWidth) { - default: return -1; - case 1: return AMDGPU::SGPR_32RegClassID; - case 2: return AMDGPU::SGPR_64RegClassID; - case 4: return AMDGPU::SReg_128RegClassID; - case 8: return AMDGPU::SReg_256RegClassID; - case 16: return AMDGPU::SReg_512RegClassID; - } + return -1; } static unsigned getRegForName(StringRef RegName) { @@ -588,6 +597,10 @@ .Case("vcc_hi", AMDGPU::VCC_HI) .Case("exec_lo", AMDGPU::EXEC_LO) .Case("exec_hi", AMDGPU::EXEC_HI) + .Case("tma_lo", AMDGPU::TMA_LO) + .Case("tma_hi", AMDGPU::TMA_HI) + .Case("tba_lo", AMDGPU::TBA_LO) + .Case("tba_hi", AMDGPU::TBA_HI) .Default(0); } @@ -605,21 +618,21 @@ return !subtargetHasRegister(*TRI, RegNo); } - // Match vgprs and sgprs - if (RegName[0] != 's' && RegName[0] != 'v') + // Match vgprs, sgprs and ttmps + if (RegName[0] != 's' && RegName[0] != 'v' && !RegName.startswith("ttmp")) return true; - bool IsVgpr = RegName[0] == 'v'; + const RegisterKind Is = RegName[0] == 'v' ? IS_VGPR : RegName[0] == 's' ? IS_SGPR : IS_TTMP; unsigned RegWidth; unsigned RegIndexInClass; - if (RegName.size() > 1) { - // We have a 32-bit register + if (RegName.size() > (Is == IS_TTMP ? strlen("ttmp") : 1) ) { + // We have a single 32-bit register. Syntax: vXX RegWidth = 1; - if (RegName.substr(1).getAsInteger(10, RegIndexInClass)) + if (RegName.substr(Is == IS_TTMP ? 
strlen("ttmp") : 1).getAsInteger(10, RegIndexInClass)) return true; Parser.Lex(); } else { - // We have a register greater than 32-bits. + // We have a register greater than 32-bits (a range of single registers). Syntax: v[XX:YY] int64_t RegLo, RegHi; Parser.Lex(); @@ -642,11 +655,11 @@ Parser.Lex(); RegWidth = (RegHi - RegLo) + 1; - if (IsVgpr) { + if (Is == IS_VGPR) { // VGPR registers aren't aligned. RegIndexInClass = RegLo; } else { - // SGPR registers are aligned. Max alignment is 4 dwords. + // SGPR and TTMP registers must be are aligned. Max required alignment is 4 dwords. unsigned Size = std::min(RegWidth, 4u); if (RegLo % Size != 0) return true; @@ -655,7 +668,7 @@ } } - int RCID = getRegClass(IsVgpr, RegWidth); + int RCID = getRegClass(Is, RegWidth); if (RCID == -1) return true; Index: lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp =================================================================== --- lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -18,6 +18,8 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include + using namespace llvm; void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, @@ -179,6 +181,18 @@ case AMDGPU::VCC_HI: O << "vcc_hi"; return; + case AMDGPU::TBA_LO: + O << "tba_lo"; + return; + case AMDGPU::TBA_HI: + O << "tba_hi"; + return; + case AMDGPU::TMA_LO: + O << "tma_lo"; + return; + case AMDGPU::TMA_HI: + O << "tma_hi"; + return; case AMDGPU::EXEC_LO: O << "exec_lo"; return; @@ -195,41 +209,44 @@ break; } - char Type; + std::string Type; unsigned NumRegs; if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(reg)) { - Type = 'v'; + Type = "v"; NumRegs = 1; } else if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(reg)) { - Type = 's'; + Type = "s"; NumRegs = 1; } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(reg)) { - Type = 'v'; + Type = "v"; + NumRegs = 2; + } else if (MRI.getRegClass(AMDGPU::SGPR_64RegClassID).contains(reg)) { + Type = "s"; NumRegs = 2; - } else if (MRI.getRegClass(AMDGPU::SReg_64RegClassID).contains(reg)) { - Type = 's'; + } else if (MRI.getRegClass(AMDGPU::TTMP_64RegClassID).contains(reg)) { + Type = "ttmp"; NumRegs = 2; } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(reg)) { - Type = 'v'; + Type = "v"; NumRegs = 4; } else if (MRI.getRegClass(AMDGPU::SReg_128RegClassID).contains(reg)) { - Type = 's'; + Type = "s"; NumRegs = 4; } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(reg)) { - Type = 'v'; + Type = "v"; NumRegs = 3; } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(reg)) { - Type = 'v'; + Type = "v"; NumRegs = 8; } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(reg)) { - Type = 's'; + Type = "s"; NumRegs = 8; } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(reg)) { - Type = 'v'; + Type = "v"; NumRegs = 16; } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(reg)) { - Type = 's'; + Type = "s"; NumRegs = 16; } else { O << getRegisterName(reg); @@ -239,6 +256,8 @@ // The low 8 bits of the encoding value is the register index, for both VGPRs // and SGPRs. unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1); + if (Type == "ttmp") + RegIdx -= 112; // Trap temps start at offset 112. TODO: Get this from tablegen. 
if (NumRegs == 1) { O << Type << RegIdx; return; Index: lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.cpp +++ lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -90,6 +90,16 @@ reserveRegisterTuples(Reserved, AMDGPU::EXEC); reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); + // Reserve Trap Handler registers - support is not implemented in Codegen. + reserveRegisterTuples(Reserved, AMDGPU::TBA); + reserveRegisterTuples(Reserved, AMDGPU::TMA); + reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1); + reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3); + reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5); + reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7); + reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9); + reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11); + // Reserve the last 2 registers so we will always have at least 2 more that // will physically contain VCC. reserveRegisterTuples(Reserved, AMDGPU::SGPR102_SGPR103); @@ -573,7 +583,21 @@ switch(Channel) { case 0: return AMDGPU::VCC_LO; case 1: return AMDGPU::VCC_HI; - default: llvm_unreachable("Invalid SubIdx for VCC"); + default: llvm_unreachable("Invalid SubIdx for VCC"); break; } + + case AMDGPU::TBA: + switch(Channel) { + case 0: return AMDGPU::TBA_LO; + case 1: return AMDGPU::TBA_HI; + default: llvm_unreachable("Invalid SubIdx for TBA"); break; + } + + case AMDGPU::TMA: + switch(Channel) { + case 0: return AMDGPU::TMA_LO; + case 1: return AMDGPU::TMA_HI; + default: llvm_unreachable("Invalid SubIdx for TMA"); break; } case AMDGPU::FLAT_SCR: Index: lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.td +++ lib/Target/AMDGPU/SIRegisterInfo.td @@ -44,6 +44,40 @@ def SCC : SIReg<"scc", 253>; def M0 : SIReg <"m0", 124>; +// Trap handler registers +def TBA_LO : SIReg<"tba_lo", 108>; +def TBA_HI : SIReg<"tba_hi", 109>; + +def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>, + DwarfRegAlias<TBA_LO> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 108; +} + +def TMA_LO : SIReg<"tma_lo", 110>; +def TMA_HI : SIReg<"tma_hi", 111>; + +def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>, + DwarfRegAlias<TMA_LO> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 110; +} + +def TTMP0 : SIReg <"ttmp0", 112>; +def TTMP1 : SIReg <"ttmp1", 113>; +def TTMP2 : SIReg <"ttmp2", 114>; +def TTMP3 : SIReg <"ttmp3", 115>; +def TTMP4 : SIReg <"ttmp4", 116>; +def TTMP5 : SIReg <"ttmp5", 117>; +def TTMP6 : SIReg <"ttmp6", 118>; +def TTMP7 : SIReg <"ttmp7", 119>; +def TTMP8 : SIReg <"ttmp8", 120>; +def TTMP9 : SIReg <"ttmp9", 121>; +def TTMP10 : SIReg <"ttmp10", 122>; +def TTMP11 : SIReg <"ttmp11", 123>; + multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> { def _ci : SIReg<n, ci_e>; def _vi : SIReg<n, vi_e>; @@ -135,6 +169,24 @@ (add (decimate (shl SGPR_32, 14), 4)), (add (decimate (shl SGPR_32, 15), 4))]>; +// Trap handler TMP 32-bit registers +def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32], 32, + (add (sequence "TTMP%u", 0, 11))> { + let isAllocatable = 0; +} + +// Trap handler TMP 64-bit registers +def TTMP_64Regs : RegisterTuples<[sub0, sub1], + [(add (decimate TTMP_32, 2)), + (add (decimate (shl TTMP_32, 1), 2))]>; + +// Trap handler TMP 128-bit registers +def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3], + [(add (decimate TTMP_32, 4)), + (add (decimate (shl TTMP_32, 1), 4)), + (add (decimate (shl TTMP_32, 2), 4)),
+ (add (decimate (shl TTMP_32, 3), 4))]>; + // VGPR 32-bit registers def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add (sequence "VGPR%u", 0, 255))>; @@ -199,13 +251,18 @@ // Register class for all scalar registers (SGPRs + Special Registers) def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32, - (add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI) + (add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI, + TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI) >; def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)>; +def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add TTMP_64Regs)> { + let isAllocatable = 0; +} + def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32, - (add SGPR_64, VCC, EXEC, FLAT_SCR) + (add SGPR_64, VCC, EXEC, FLAT_SCR, TTMP_64) >; def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128)> { Index: test/CodeGen/AMDGPU/and.ll =================================================================== --- test/CodeGen/AMDGPU/and.ll +++ test/CodeGen/AMDGPU/and.ll @@ -256,8 +256,8 @@ } ; FUNC-LABEL: {{^}}v_and_multi_use_constant_i64: -; SI: buffer_load_dwordx2 v{{\[}}[[LO0:[0-9]+]]:[[HI0:[0-9]+]]{{\]}} -; SI: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO0:[0-9]+]]:[[HI0:[0-9]+]]{{\]}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}} ; SI-DAG: s_mov_b32 [[KLO:s[0-9]+]], 0xab19b207{{$}} ; SI-DAG: s_movk_i32 [[KHI:s[0-9]+]], 0x11e{{$}} ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO0]] @@ -483,8 +483,8 @@ ; low 32-bits, which is not a valid 64-bit inline immmediate. ; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64: -; SI: s_load_dwordx2 -; SI: s_load_dword s +; SI-DAG: s_load_dwordx2 +; SI-DAG: s_load_dword s ; SI-NOT: and ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0 ; SI-NOT: and Index: test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll =================================================================== --- test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll +++ test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll @@ -3,11 +3,11 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset: -; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 -; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 +; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] ; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 @@ -23,10 +23,10 @@ ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset: ; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7 ; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0 -; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -; VI: s_load_dword 
[[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SICI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd +; VI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]] ; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]] Index: test/CodeGen/AMDGPU/bswap.ll =================================================================== --- test/CodeGen/AMDGPU/bswap.ll +++ test/CodeGen/AMDGPU/bswap.ll @@ -10,7 +10,7 @@ declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone ; FUNC-LABEL: @test_bswap_i32 -; SI: buffer_load_dword [[VAL:v[0-9]+]] +; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]] ; SI-DAG: v_alignbit_b32 [[TMP0:v[0-9]+]], [[VAL]], [[VAL]], 8 ; SI-DAG: v_alignbit_b32 [[TMP1:v[0-9]+]], [[VAL]], [[VAL]], 24 ; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xff00ff Index: test/CodeGen/AMDGPU/ctlz.ll =================================================================== --- test/CodeGen/AMDGPU/ctlz.ll +++ test/CodeGen/AMDGPU/ctlz.ll @@ -35,8 +35,10 @@ ; FUNC-LABEL: {{^}}v_ctlz_i32: ; SI: buffer_load_dword [[VAL:v[0-9]+]], +; +; FIXME: v_ffbh_u32 does not look correct for i32 code ; SI-DAG: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]] -; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[CTLZ]] +; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[VAL]] ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], [[CTLZ]], 32, vcc ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm @@ -99,7 +101,7 @@ ; FUNC-LABEL: {{^}}v_ctlz_i8: ; SI: buffer_load_ubyte [[VAL:v[0-9]+]], ; SI-DAG: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] -; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[CTLZ]] +; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[VAL]] ; SI-DAG: v_cndmask_b32_e64 [[CORRECTED_FFBH:v[0-9]+]], [[FFBH]], 32, vcc ; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 0xffffffe8, [[CORRECTED_FFBH]] ; SI: buffer_store_byte [[RESULT]], @@ -137,14 +139,21 @@ ; FUNC-LABEL: {{^}}v_ctlz_i64: ; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; SI-DAG: v_cmp_eq_i32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]] ; SI-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]] ; SI-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]] ; SI-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]] -; SI-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], [[CMPHI]] +; SI-DAG: v_cmp_eq_i32_e{{32|64}} [[CMPHI:s\[[0-9]+:[0-9]+\]|vcc]], 0, v[[HI]] +; +; NOTE: When CMPHI != vcc, src3 of the next instruction is not verified. +; NOTE: Reason: FileCheck does not support variables within regex. +; NOTE: Alternatively, the VI prefix could be used for tonga, but that would +; NOTE: require duplicating almost all SI checks except this one, or moving +; NOTE: this test to a separate .ll file. Both look like overkill.
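To make the checked sequence concrete, here is a C++ sketch of the lowering v_ctlz_i64 expects, with the GCC/Clang __builtin_clz standing in for v_ffbh_u32 (an assumption; the zero-input guards below exist only because __builtin_clz(0) is undefined in C++, and their results are never selected):

#include <cstdint>

// ctlz of a 64-bit value assembled from two 32-bit ffbh results:
// use ffbh(hi), or ffbh(lo) + 32 when hi == 0, and force 64 when x == 0.
uint32_t ctlz64(uint32_t lo, uint32_t hi) {
  uint32_t fromLo = (lo ? __builtin_clz(lo) : 32) + 32; // v_add_i32 32, ffbh(lo)
  uint32_t fromHi = hi ? __builtin_clz(hi) : 32;        // v_ffbh_u32 on hi
  uint32_t r = (hi == 0) ? fromLo : fromHi;             // first v_cndmask (CMPHI)
  return ((lo | hi) == 0) ? 64 : r;                     // v_or + second v_cndmask
}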
+; +; SI-DAG: v_cndmask_b32_e{{64|32}} v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]] ; SI-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[LO]], v[[HI]] ; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[OR]] -; SI-DAG: v_cndmask_b32_e64 v[[CLTZ_LO:[0-9]+]], v[[CTLZ:[0-9]+]], 64, vcc +; SI-DAG: v_cndmask_b32_e64 v[[CLTZ_LO:[0-9]+]], v[[CTLZ]], 64, vcc ; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}} ; SI: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}} define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { Index: test/CodeGen/AMDGPU/cvt_f32_ubyte.ll =================================================================== --- test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -32,10 +32,11 @@ ; SI-LABEL: {{^}}load_v3i8_to_v3f32: ; SI-NOT: bfe ; SI-NOT: v_cvt_f32_ubyte3_e32 -; SI-DAG: v_cvt_f32_ubyte2_e32 -; SI-DAG: v_cvt_f32_ubyte1_e32 -; SI-DAG: v_cvt_f32_ubyte0_e32 -; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, +; SI-DAG: v_cvt_f32_ubyte2_e32 v[[WORD2:[0-9]+]] +; SI-DAG: v_cvt_f32_ubyte1_e32 v[[WORD1:[0-9]+]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v[[WORD0:[0-9]+]] +; SI: buffer_store_dword v[[WORD2]], +; SI: buffer_store_dwordx2 v{{\[}}[[WORD0]]:[[WORD1]]{{\]}}, define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 %cvt = uitofp <3 x i8> %load to <3 x float> Index: test/CodeGen/AMDGPU/ds_read2_superreg.ll =================================================================== --- test/CodeGen/AMDGPU/ds_read2_superreg.ll +++ test/CodeGen/AMDGPU/ds_read2_superreg.ll @@ -120,10 +120,10 @@ ; FIXME: Extra moves shuffling superregister ; CI-LABEL: {{^}}simple_read2_v8f32_superreg: -; CI: ds_read2_b64 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT7:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}} +; CI: ds_read2_b64 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT7:[0-9]+]]{{\]}}, v{{[0-9]+}} {{offset0:1 offset1:3|offset0:3 offset1:2}}{{$}} ; CI: v_mov_b32 ; CI: v_mov_b32 -; CI: ds_read2_b64 v{{\[}}[[REG_ELT6:[0-9]+]]:[[REG_ELT5:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2{{$}} +; CI: ds_read2_b64 v{{\[}}[[REG_ELT6:[0-9]+]]:[[REG_ELT5:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:{{2|1}}{{$}} ; CI: v_mov_b32 ; CI: v_mov_b32 ; CI: buffer_store_dwordx4 @@ -140,14 +140,14 @@ ; FIXME: Extra moves shuffling superregister ; CI-LABEL: {{^}}simple_read2_v16f32_superreg: -; CI: ds_read2_b64 v{{\[}}[[REG_ELT11:[0-9]+]]:[[REG_ELT15:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}} +; CI: ds_read2_b64 v{{\[}}[[REG_ELT11:[0-9]+]]:[[REG_ELT15:[0-9]+]]{{\]}}, v{{[0-9]+}} {{offset0:1 offset1:3|offset0:3 offset1:5}}{{$}} ; CI: v_mov_b32 ; CI: v_mov_b32 -; CI: ds_read2_b64 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:7{{$}} -; CI: ds_read2_b64 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:6 offset1:4{{$}} +; CI: ds_read2_b64 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} {{offset0:5 offset1:7|offset0:7 offset1:2}}{{$}} +; CI: ds_read2_b64 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:6 offset1:4{{$}} ; CI: v_mov_b32 ; CI: v_mov_b32 Index:
test/CodeGen/AMDGPU/ds_write2.ll =================================================================== --- test/CodeGen/AMDGPU/ds_write2.ll +++ test/CodeGen/AMDGPU/ds_write2.ll @@ -79,29 +79,6 @@ ret void } -; 2 data subregisters from different super registers. -; SI-LABEL: @simple_write2_two_val_subreg2_mixed_f32 -; SI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}} -; SI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}} -; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 -; SI: s_endpgm -define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 - %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i - %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1 - %val0 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8 - %val1 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8 - %val0.0 = extractelement <2 x float> %val0, i32 0 - %val1.1 = extractelement <2 x float> %val1, i32 1 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val0.0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val1.1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - ; SI-LABEL: @simple_write2_two_val_subreg2_f32 ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} @@ -121,25 +98,6 @@ ret void } -; SI-LABEL: @simple_write2_two_val_subreg4_f32 -; SI-DAG: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 -; SI: s_endpgm -define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 - %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i - %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16 - %val0 = extractelement <4 x float> %val, i32 0 - %val1 = extractelement <4 x float> %val, i32 3 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - ; SI-LABEL: @simple_write2_two_val_max_offset_f32 ; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 @@ -178,66 +136,6 @@ ret void } -; SI-LABEL: @simple_write2_two_val_f32_x2 -; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8 -; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 -; SI: s_endpgm -define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { - %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 - %in0.gep = getelementptr float, 
float addrspace(1)* %in0, i32 %tid.x - %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x - %val0 = load float, float addrspace(1)* %in0.gep, align 4 - %val1 = load float, float addrspace(1)* %in1.gep, align 4 - - %idx.0 = add nsw i32 %tid.x, 0 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 - store float %val0, float addrspace(3)* %arrayidx0, align 4 - - %idx.1 = add nsw i32 %tid.x, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 - store float %val1, float addrspace(3)* %arrayidx1, align 4 - - %idx.2 = add nsw i32 %tid.x, 11 - %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 - store float %val0, float addrspace(3)* %arrayidx2, align 4 - - %idx.3 = add nsw i32 %tid.x, 27 - %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 - store float %val1, float addrspace(3)* %arrayidx3, align 4 - - ret void -} - -; SI-LABEL: @simple_write2_two_val_f32_x2_nonzero_base -; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8 -; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 -; SI: s_endpgm -define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { - %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 - %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x - %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x - %val0 = load float, float addrspace(1)* %in0.gep, align 4 - %val1 = load float, float addrspace(1)* %in1.gep, align 4 - - %idx.0 = add nsw i32 %tid.x, 3 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 - store float %val0, float addrspace(3)* %arrayidx0, align 4 - - %idx.1 = add nsw i32 %tid.x, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 - store float %val1, float addrspace(3)* %arrayidx1, align 4 - - %idx.2 = add nsw i32 %tid.x, 11 - %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 - store float %val0, float addrspace(3)* %arrayidx2, align 4 - - %idx.3 = add nsw i32 %tid.x, 27 - %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 - store float %val1, float addrspace(3)* %arrayidx3, align 4 - - ret void -} - ; SI-LABEL: @write2_ptr_subreg_arg_two_val_f32 ; SI-NOT: ds_write2_b32 ; SI: ds_write_b32 @@ -265,23 +163,6 @@ ret void } -; SI-LABEL: @simple_write2_one_val_f64 -; SI: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], -; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} -; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8 -; SI: s_endpgm -define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 - %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i - %val = load double, double addrspace(1)* %in.gep, align 8 - %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i - store double %val, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x - store double %val, double 
addrspace(3)* %arrayidx1, align 8 - ret void -} - ; SI-LABEL: @misaligned_simple_write2_one_val_f64 ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} Index: test/CodeGen/AMDGPU/ds_write2_fixme.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/ds_write2_fixme.ll @@ -0,0 +1,144 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefix=SI %s +; XFAIL: * +; FIXME: PR000 +; FIXME: The "000" is to be updated once this change is submitted to trunk and an actual PR is created. +; FIXME: After resolving the PR, please move the contents of this file to ds_write2.ll. + +@lds = addrspace(3) global [512 x float] undef, align 4 +@lds.f64 = addrspace(3) global [512 x double] undef, align 8 + +; 2 data subregisters from different super registers. +; SI-LABEL: @simple_write2_two_val_subreg2_mixed_f32 +; SI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}} +; SI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}} +; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 +; SI: s_endpgm +define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i + %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1 + %val0 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8 + %val1 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8 + %val0.0 = extractelement <2 x float> %val0, i32 0 + %val1.1 = extractelement <2 x float> %val1, i32 1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val0.0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val1.1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_subreg4_f32 +; SI-DAG: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 +; SI: s_endpgm +define void
@simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { + %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x + %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x + %val0 = load float, float addrspace(1)* %in0.gep, align 4 + %val1 = load float, float addrspace(1)* %in1.gep, align 4 + + %idx.0 = add nsw i32 %tid.x, 0 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 + store float %val0, float addrspace(3)* %arrayidx0, align 4 + + %idx.1 = add nsw i32 %tid.x, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 + store float %val1, float addrspace(3)* %arrayidx1, align 4 + + %idx.2 = add nsw i32 %tid.x, 11 + %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 + store float %val0, float addrspace(3)* %arrayidx2, align 4 + + %idx.3 = add nsw i32 %tid.x, 27 + %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 + store float %val1, float addrspace(3)* %arrayidx3, align 4 + + ret void +} + +; SI-LABEL: @simple_write2_two_val_f32_x2_nonzero_base +; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8 +; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 +; SI: s_endpgm +define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { + %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x + %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x + %val0 = load float, float addrspace(1)* %in0.gep, align 4 + %val1 = load float, float addrspace(1)* %in1.gep, align 4 + + %idx.0 = add nsw i32 %tid.x, 3 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 + store float %val0, float addrspace(3)* %arrayidx0, align 4 + + %idx.1 = add nsw i32 %tid.x, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 + store float %val1, float addrspace(3)* %arrayidx1, align 4 + + %idx.2 = add nsw i32 %tid.x, 11 + %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 + store float %val0, float addrspace(3)* %arrayidx2, align 4 + + %idx.3 = add nsw i32 %tid.x, 27 + %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 + store float %val1, float addrspace(3)* %arrayidx3, align 4 + + ret void +} + +; SI-LABEL: @simple_write2_one_val_f64 +; SI: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], +; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} +; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8 +; SI: s_endpgm +define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i + %val = load double, double addrspace(1)* %in.gep, align 8 + %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i + store double %val, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x double], [512 x 
double] addrspace(3)* @lds.f64, i32 0, i32 %add.x + store double %val, double addrspace(3)* %arrayidx1, align 8 + ret void +} + + +; Function Attrs: nounwind readnone +declare i32 @llvm.amdgcn.workgroup.id.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.amdgcn.workgroup.id.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.amdgcn.workitem.id.y() #1 + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { convergent nounwind } Index: test/CodeGen/AMDGPU/ds_write2st64.ll =================================================================== --- test/CodeGen/AMDGPU/ds_write2st64.ll +++ test/CodeGen/AMDGPU/ds_write2st64.ll @@ -43,7 +43,8 @@ ; SI-LABEL: @simple_write2st64_two_val_max_offset_f32 ; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI-DAG: v_lshlrev_b32_e32 [[VPTR_0:v[0-9]+]], 2, v{{[0-9]+}} +; SI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, s{{[0-9]+}}, [[VPTR_0]] ; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 ; SI: s_endpgm define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 { Index: test/CodeGen/AMDGPU/fcopysign.f32.ll =================================================================== --- test/CodeGen/AMDGPU/fcopysign.f32.ll +++ test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -9,8 +9,8 @@ ; Try to identify arg based on higher address. 
; FUNC-LABEL: {{^}}test_copysign_f32: -; SI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0xb -; SI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0xc +; SI-DAG: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0xb +; SI-DAG: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0xc ; VI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0x2c ; VI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0x30 ; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]] Index: test/CodeGen/AMDGPU/insert_vector_elt.ll =================================================================== --- test/CodeGen/AMDGPU/insert_vector_elt.ll +++ test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -208,10 +208,10 @@ ; SI-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}} ; SI-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 0{{$}} -; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} -; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} -; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} -; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; SI: s_mov_b32 m0, [[SCALEDIDX]] ; SI: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]] Index: test/CodeGen/AMDGPU/llvm.SI.load.dword.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.load.dword.ll +++ test/CodeGen/AMDGPU/llvm.SI.load.dword.ll @@ -7,11 +7,11 @@ ; FIXME: Out of bounds immediate offset crashes ; CHECK-LABEL: {{^}}main: -; CHECK: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 glc slc -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen glc slc -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen glc slc -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen glc slc -; CHECK: s_movk_i32 [[K:s[0-9]+]], 0x4d2 ; encoding +; CHECK-DAG: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 glc slc +; CHECK-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen glc slc +; CHECK-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen glc slc +; CHECK-DAG: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen glc slc +; CHECK-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4d2 ; encoding ; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, [[K]] idxen offen offset:65535 glc slc define void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <16 x i8>] addrspace(2)* byval %arg3, [17 x <16 x i8>] addrspace(2)* inreg %arg4, [17 x <16 x i8>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) #0 { Index: test/CodeGen/AMDGPU/llvm.amdgcn.class.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.class.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.class.ll @@ -12,7 +12,7 @@ ; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[VB]] ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1 @@ -27,7 +27,7 @@ ; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 
|[[SA]]|, [[VB]] ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { %a.fabs = call float @llvm.fabs.f32(float %a) #1 @@ -43,7 +43,7 @@ ; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]] ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { %a.fneg = fsub float -0.0, %a @@ -59,7 +59,7 @@ ; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]] ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { %a.fabs = call float @llvm.fabs.f32(float %a) #1 @@ -74,7 +74,7 @@ ; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb ; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 1{{$}} ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] -; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 { %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1 @@ -87,7 +87,7 @@ ; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb ; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 64{{$}} ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] -; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 { %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) #1 @@ -102,7 +102,7 @@ ; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}} ; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]] ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 { %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1023) #1 @@ -116,7 +116,7 @@ ; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} ; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]] ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 { %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1 @@ -188,7 +188,7 @@ ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]] ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1 @@ -203,7 +203,7 @@ ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]] ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void 
@test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { %a.fabs = call double @llvm.fabs.f64(double %a) #1 @@ -219,7 +219,7 @@ ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]] ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { %a.fneg = fsub double -0.0, %a @@ -235,7 +235,7 @@ ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]] ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { %a.fabs = call double @llvm.fabs.f64(double %a) #1 @@ -273,7 +273,7 @@ ; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]] ; SI-NOT: vcc ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 { %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1 Index: test/CodeGen/AMDGPU/llvm.round.f64.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.round.f64.ll +++ test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -12,8 +12,8 @@ ; instructions that are necessary. ; FUNC-LABEL: {{^}}v_round_f64: -; SI: buffer_load_dwordx2 -; SI: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11 +; SI-DAG: buffer_load_dwordx2 +; SI-DAG: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11 ; SI-DAG: v_not_b32_e32 ; SI-DAG: v_not_b32_e32 Index: test/CodeGen/AMDGPU/local-atomics.ll =================================================================== --- test/CodeGen/AMDGPU/local-atomics.ll +++ test/CodeGen/AMDGPU/local-atomics.ll @@ -5,9 +5,9 @@ ; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32: ; EG: LDS_WRXCHG_RET * -; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 -; GCN: s_load_dword [[SPTR:s[0-9]+]], -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 +; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]], +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] ; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -31,9 +31,9 @@ ; XXX - Is it really necessary to load 4 into VGPR? ; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32: ; EG: LDS_ADD_RET * -; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 -; GCN: s_load_dword [[SPTR:s[0-9]+]], -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 +; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]], +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] ; GCN: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -306,8 +306,8 @@ ; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32: ; GCN: s_load_dword [[SPTR:s[0-9]+]], -; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] ; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] ; GCN: s_endpgm define void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind { @@ -327,8 +327,8 @@ ; XXX - Is it really necessary to load 4 into VGPR? 
; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32: ; GCN: s_load_dword [[SPTR:s[0-9]+]], -; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] ; GCN: ds_add_u32 [[VPTR]], [[DATA]] ; GCN: s_endpgm define void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind { Index: test/CodeGen/AMDGPU/local-atomics64.ll =================================================================== --- test/CodeGen/AMDGPU/local-atomics64.ll +++ test/CodeGen/AMDGPU/local-atomics64.ll @@ -30,9 +30,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64_offset: -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9 -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0 -; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9 +; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0 +; SI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb ; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 @@ -277,9 +277,9 @@ ; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64_offset: ; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 ; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9 -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0 -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9 +; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0 +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 ; GCN: s_endpgm define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { Index: test/CodeGen/AMDGPU/madak.ll =================================================================== --- test/CodeGen/AMDGPU/madak.ll +++ test/CodeGen/AMDGPU/madak.ll @@ -7,8 +7,10 @@ declare float @llvm.fabs.f32(float) nounwind readnone ; GCN-LABEL: {{^}}madak_f32: -; GCN: buffer_load_dword [[VA:v[0-9]+]] -; GCN: buffer_load_dword [[VB:v[0-9]+]] +; GCN-DAG: s_load_dwordx2 [[SA_LO:s\[[0-9]+:]]{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb +; GCN-DAG: s_load_dwordx2 [[SB_LO:s\[[0-9]+:]]{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xd +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, [[SA_LO]]{{[0-9]+\]}} +; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, [[SB_LO]]{{[0-9]+\]}} ; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -81,8 +83,10 @@ ; an inline immediate. 
; GCN-LABEL: {{^}}madak_inline_imm_f32: -; GCN: buffer_load_dword [[VA:v[0-9]+]] -; GCN: buffer_load_dword [[VB:v[0-9]+]] +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, s[4:7] +; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, s[8:11] ; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone Index: test/CodeGen/AMDGPU/max.ll =================================================================== --- test/CodeGen/AMDGPU/max.ll +++ test/CodeGen/AMDGPU/max.ll @@ -205,7 +205,7 @@ ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc ; SI: s_max_u32 [[MAX:s[0-9]+]], [[A]], [[B]] -; SI-NEXT: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]] +; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]] ; SI-NEXT: buffer_store_dword [[VMAX]] define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { %a.ext = zext i16 %a to i32 @@ -223,7 +223,7 @@ ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc ; SI: s_max_i32 [[MAX:s[0-9]+]], [[A]], [[B]] -; SI-NEXT: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]] +; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]] ; SI-NEXT: buffer_store_dword [[VMAX]] define void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { %a.ext = sext i16 %a to i32 Index: test/CodeGen/AMDGPU/min.ll =================================================================== --- test/CodeGen/AMDGPU/min.ll +++ test/CodeGen/AMDGPU/min.ll @@ -301,7 +301,7 @@ ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc ; SI: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]] -; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] +; SI: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] ; SI-NEXT: buffer_store_dword [[VMIN]] define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { %a.ext = zext i16 %a to i32 @@ -319,7 +319,7 @@ ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc ; SI: s_min_i32 [[MIN:s[0-9]+]], [[A]], [[B]] -; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] +; SI: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] ; SI-NEXT: buffer_store_dword [[VMIN]] define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { %a.ext = sext i16 %a to i32 Index: test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll =================================================================== --- test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll +++ test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll @@ -6,8 +6,8 @@ ; FIXME: We should be able to use the SGPR directly as src0 to v_add_i32 ; GCN-LABEL: {{^}}clobber_vgpr_pair_pointer_add: -; GCN: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} -; GCN: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}} +; GCN-DAG: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} +; 
GCN-DAG: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}} ; GCN-NOT: v_mov_b32 ; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]] Index: test/CodeGen/AMDGPU/salu-to-valu.ll =================================================================== --- test/CodeGen/AMDGPU/salu-to-valu.ll +++ test/CodeGen/AMDGPU/salu-to-valu.ll @@ -53,9 +53,11 @@ ; Test moving an SMRD instruction to the VALU ; GCN-LABEL: {{^}}smrd_valu: -; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x2ee0 -; GCN: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}} -; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}} +; GCN-NOHSA: buffer_load_dwordx2 v{{\[}}[[VSRC_LO:[0-9]+]]:[[VSRC_HI:[0-9]+]]{{\]}} +; GCN-HSA: flat_load_dwordx2 v{{\[}}[[VSRC_LO:[0-9]+]]:[[VSRC_HI:[0-9]+]]{{\]}} +; SI-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x2ee0 +; GCN-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v[[VSRC_LO]] +; GCN-DAG: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v[[VSRC_HI]] ; SI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, [[OFFSET]] ; CI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0xbb8 ; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]] Index: test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll =================================================================== --- test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll +++ test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll @@ -2,10 +2,10 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI --check-prefix=GCN %s ; FUNC-LABEL: {{^}}cluster_arg_loads: -; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9 -; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe +; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9 +; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe ; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24 ; VI-NEXT: s_nop 0 ; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c Index: test/CodeGen/AMDGPU/sdiv.ll =================================================================== --- test/CodeGen/AMDGPU/sdiv.ll +++ test/CodeGen/AMDGPU/sdiv.ll @@ -34,8 +34,8 @@ ; working. 
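The magic-number sequence checked below can be read as follows: a minimal sketch of the multiply-high expansion for a signed division by 3435 (the function name is made up; 0x98A1930B is floor(2^43 / 3435) + 1 truncated to 32 bits, with a post-shift of 11):

#include <cstdint>

// n / 3435 without a divide: multiply-high by the magic constant, add n
// back because the 32-bit magic is negative, arithmetic-shift by 11, then
// add the sign bit to round the quotient toward zero.
int32_t sdiv3435(int32_t n) {
  const int32_t M = (int32_t)0x98A1930B;         // v_mov_b32 [[MAGIC]]
  int32_t q = (int32_t)(((int64_t)M * n) >> 32); // v_mul_hi_i32
  q += n;                                        // v_add_i32
  q = q >> 11;                                   // arithmetic post-shift
  q += (int32_t)((uint32_t)q >> 31);             // v_lshrrev + add (sign fix)
  return q;
}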
; FUNC-LABEL: {{^}}slow_sdiv_i32_3435: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b +; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]], +; SI-DAG: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b ; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[MAGIC]], [[VAL]] ; SI: v_add_i32 ; SI: v_lshrrev_b32 Index: test/CodeGen/AMDGPU/setcc-opt.ll =================================================================== --- test/CodeGen/AMDGPU/setcc-opt.ll +++ test/CodeGen/AMDGPU/setcc-opt.ll @@ -6,7 +6,7 @@ ; GCN-NOT: v_cmp ; GCN: v_cmp_ne_i32_e32 vcc, ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT:buffer_store_byte [[RESULT]] +; GCN: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm ; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W @@ -23,7 +23,7 @@ ; GCN-NOT: v_cmp ; GCN: v_cmp_ne_i32_e32 vcc, ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm ; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W @@ -42,7 +42,7 @@ ; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc ; GCN-NEXT: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}} ; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1, -; GCN-NEXT: buffer_store_byte [[TMP]] +; GCN: buffer_store_byte [[TMP]] ; GCN-NEXT: s_endpgm define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b @@ -58,7 +58,7 @@ ; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc ; GCN-NEXT: v_cmp_ne_i32_e32 vcc, 1, [[TMP]]{{$}} ; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1, -; GCN-NEXT: buffer_store_byte [[TMP]] +; GCN: buffer_store_byte [[TMP]] ; GCN-NEXT: s_endpgm define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b @@ -72,7 +72,7 @@ ; GCN-NOT: v_cmp ; GCN: v_cmp_eq_i32_e32 vcc, ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm define void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b @@ -86,7 +86,7 @@ ; GCN-NOT: v_cmp ; GCN: v_cmp_eq_i32_e32 vcc, ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm define void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b @@ -100,7 +100,7 @@ ; GCN-NOT: v_cmp ; GCN: v_cmp_ne_i32_e32 vcc, ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm define void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b @@ -114,7 +114,7 @@ ; GCN-NOT: v_cmp ; GCN: v_cmp_ne_i32_e32 vcc, ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm define void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b @@ -128,7 +128,7 @@ ; GCN-NOT: v_cmp ; GCN: v_cmp_eq_i32_e32 vcc, ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm define void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b @@ 
@@ -142,7 +142,8 @@
 ; GCN-NOT: v_cmp
 ; GCN: v_cmp_eq_i32_e32 vcc,
 ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
-; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN: buffer_store_byte [[RESULT]]
+; GCN-NEXT: s_endpgm
 define void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = zext i1 %icmp0 to i32
@@ -230,7 +231,7 @@
 ; GCN: s_load_dword [[B:s[0-9]+]]
 ; GCN: v_cmp_ne_i32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -1, [[B]]
 ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP]]
-; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN: buffer_store_byte [[RESULT]]
 ; GCN: s_endpgm
 define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind {
   %b.ext = sext i8 %b to i32
Index: test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
===================================================================
--- test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -93,121 +93,6 @@
   ret void
 }
-; FUNC-LABEL: @reorder_constant_load_local_store_constant_load
-; CI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
-; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
-; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
-; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x2
-; CI: ds_write_b32
-; CI: buffer_store_dword
-define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {
-  %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
-
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
-
-  %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4
-  store i32 99, i32 addrspace(3)* %lptr, align 4
-  %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4
-
-  %add = add nsw i32 %tmp1, %tmp2
-
-  store i32 %add, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: @reorder_smrd_load_local_store_smrd_load
-; CI: s_load_dword
-; CI: s_load_dword
-; CI: s_load_dword
-; CI: ds_write_b32
-; CI: buffer_store_dword
-define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 {
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
-
-  %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4
-  store i32 99, i32 addrspace(3)* %lptr, align 4
-  %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4
-
-  %add = add nsw i32 %tmp1, %tmp2
-
-  store i32 %add, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: @reorder_global_load_local_store_global_load
-; CI: buffer_load_dword
-; CI: buffer_load_dword
-; CI: ds_write_b32
-; CI: buffer_store_dword
-define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 {
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 2
-
-  %tmp1 = load i32, i32 addrspace(1)* %ptr1, align 4
-  store i32 99, i32 addrspace(3)* %lptr, align 4
-  %tmp2 = load i32, i32 addrspace(1)* %ptr2, align 4
-
-  %add = add nsw i32 %tmp1, %tmp2
-
-  store i32 %add, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: @reorder_local_offsets
-; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
-; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404
-; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
-; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
-; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404
-; CI: buffer_store_dword
-; CI: s_endpgm
-define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 {
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 100
-  %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 101
-
-  store i32 123, i32 addrspace(3)* %ptr1, align 4
-  %tmp1 = load i32, i32 addrspace(3)* %ptr2, align 4
-  %tmp2 = load i32, i32 addrspace(3)* %ptr3, align 4
-  store i32 123, i32 addrspace(3)* %ptr2, align 4
-  %tmp3 = load i32, i32 addrspace(3)* %ptr1, align 4
-  store i32 789, i32 addrspace(3)* %ptr3, align 4
-
-  %add.0 = add nsw i32 %tmp2, %tmp1
-  %add.1 = add nsw i32 %add.0, %tmp3
-  store i32 %add.1, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: @reorder_global_offsets
-; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
-; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404
-; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
-; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
-; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
-; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404
-; CI: buffer_store_dword
-; CI: s_endpgm
-define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 100
-  %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 101
-
-  store i32 123, i32 addrspace(1)* %ptr1, align 4
-  %tmp1 = load i32, i32 addrspace(1)* %ptr2, align 4
-  %tmp2 = load i32, i32 addrspace(1)* %ptr3, align 4
-  store i32 123, i32 addrspace(1)* %ptr2, align 4
-  %tmp3 = load i32, i32 addrspace(1)* %ptr1, align 4
-  store i32 789, i32 addrspace(1)* %ptr3, align 4
-
-  %add.0 = add nsw i32 %tmp2, %tmp1
-  %add.1 = add nsw i32 %add.0, %tmp3
-  store i32 %add.1, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
 ; XFUNC-LABEL: @reorder_local_load_tbuffer_store_local_load
 ; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x4
 ; XCI: TBUFFER_STORE_FORMAT
Index: test/CodeGen/AMDGPU/si-triv-disjoint-mem-access_fixme.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/si-triv-disjoint-mem-access_fixme.ll
@@ -0,0 +1,137 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
+; XFAIL: *
+; FIXME PR000
+; FIXME: Update "000" above once this change has been submitted to trunk and the actual PR has been created.
+; FIXME: After the PR is resolved, move the contents of this file back into si-triv-disjoint-mem-access.ll.
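+;
+; NOTE: The checks below assume the machine scheduler may reorder trivially
+; disjoint memory accesses when scheduler AA is enabled (the -enable-misched
+; -enable-aa-sched-mi flags in the RUN line above); the whole file is
+; XFAILed until the FIXME above is resolved.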
+
+declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare void @llvm.amdgcn.s.barrier() #2
+
+@stored_lds_ptr = addrspace(3) global i32 addrspace(3)* undef, align 4
+@stored_constant_ptr = addrspace(3) global i32 addrspace(2)* undef, align 8
+@stored_global_ptr = addrspace(3) global i32 addrspace(1)* undef, align 8
+
+; FUNC-LABEL: @reorder_constant_load_local_store_constant_load
+; CI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x2
+; CI: ds_write_b32
+; CI: buffer_store_dword
+define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {
+  %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
+
+  %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
+
+  %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4
+  store i32 99, i32 addrspace(3)* %lptr, align 4
+  %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4
+
+  %add = add nsw i32 %tmp1, %tmp2
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @reorder_smrd_load_local_store_smrd_load
+; CI: s_load_dword
+; CI: s_load_dword
+; CI: s_load_dword
+; CI: ds_write_b32
+; CI: buffer_store_dword
+define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 {
+  %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
+
+  %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4
+  store i32 99, i32 addrspace(3)* %lptr, align 4
+  %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4
+
+  %add = add nsw i32 %tmp1, %tmp2
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @reorder_global_load_local_store_global_load
+; CI: buffer_load_dword
+; CI: buffer_load_dword
+; CI: ds_write_b32
+; CI: buffer_store_dword
+define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 {
+  %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 2
+
+  %tmp1 = load i32, i32 addrspace(1)* %ptr1, align 4
+  store i32 99, i32 addrspace(3)* %lptr, align 4
+  %tmp2 = load i32, i32 addrspace(1)* %ptr2, align 4
+
+  %add = add nsw i32 %tmp1, %tmp2
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @reorder_local_offsets
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404
+; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
+; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
+; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404
+; CI: buffer_store_dword
+; CI: s_endpgm
+define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 {
+  %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
+  %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 100
+  %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 101
+
+  store i32 123, i32 addrspace(3)* %ptr1, align 4
+  %tmp1 = load i32, i32 addrspace(3)* %ptr2, align 4
+  %tmp2 = load i32, i32 addrspace(3)* %ptr3, align 4
+  store i32 123, i32 addrspace(3)* %ptr2, align 4
+  %tmp3 = load i32, i32 addrspace(3)* %ptr1, align 4
+  store i32 789, i32 addrspace(3)* %ptr3, align 4
+
+  %add.0 = add nsw i32 %tmp2, %tmp1
+  %add.1 = add nsw i32 %add.0, %tmp3
+  store i32 %add.1, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @reorder_global_offsets
+; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
+; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404
+; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
+; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
+; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
+; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404
+; CI: buffer_store_dword
+; CI: s_endpgm
+define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
+  %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3
+  %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 100
+  %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 101
+
+  store i32 123, i32 addrspace(1)* %ptr1, align 4
+  %tmp1 = load i32, i32 addrspace(1)* %ptr2, align 4
+  %tmp2 = load i32, i32 addrspace(1)* %ptr3, align 4
+  store i32 123, i32 addrspace(1)* %ptr2, align 4
+  %tmp3 = load i32, i32 addrspace(1)* %ptr1, align 4
+  store i32 789, i32 addrspace(1)* %ptr3, align 4
+
+  %add.0 = add nsw i32 %tmp2, %tmp1
+  %add.1 = add nsw i32 %add.0, %tmp3
+  store i32 %add.1, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { "ShaderType"="1" nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #2 = { nounwind convergent }
Index: test/CodeGen/AMDGPU/sint_to_fp.i64.ll
===================================================================
--- test/CodeGen/AMDGPU/sint_to_fp.i64.ll
+++ test/CodeGen/AMDGPU/sint_to_fp.i64.ll
@@ -19,13 +19,14 @@
 ; GCN: v_ffbh_u32
 ; GCN: v_ffbh_u32
 ; GCN: v_cndmask
-; GCN: v_cndmask
+; GCN: v_and_b32_e32 v[[EQ_HI:[0-9]+]], 0xff, v{{[0-9]+}}
+; GCN-DAG: v_cndmask

-; GCN-DAG: v_cmp_eq_i64
-; GCN-DAG: v_cmp_lt_u64
+; GCN-DAG: v_cmp_eq_i64_e{{32|64}} {{.*}}v[{{[0-9]+}}:[[EQ_HI]]]
+; GCN-DAG: v_cmp_lt_u64_e64 {{.*}}v[{{[0-9]+}}:[[EQ_HI]]]

 ; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
-; GCN: v_cndmask_b32_e32 [[SIGN_SEL:v[0-9]+]],
+; GCN: v_cndmask_b32_e{{64|32}} [[SIGN_SEL:v[0-9]+]],
 ; GCN: {{buffer|flat}}_store_dword {{.*}}[[SIGN_SEL]]
 define void @v_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
Index: test/CodeGen/AMDGPU/sminmax.ll
===================================================================
--- test/CodeGen/AMDGPU/sminmax.ll
+++ test/CodeGen/AMDGPU/sminmax.ll
@@ -47,10 +47,10 @@
 ; FUNC-LABEL: {{^}}v_abs_v2i32:
 ; GCN: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
-; GCN: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]

-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]

 ; GCN: v_add_i32
 ; GCN: v_add_i32
@@ -98,14 +98,14 @@
 ; FUNC-LABEL: {{^}}v_abs_v4i32:
 ; GCN: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
-; GCN: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
-; GCN: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]]
-; GCN: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]]

-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]]

 ; GCN: v_add_i32
 ; GCN: v_add_i32
Index: test/CodeGen/AMDGPU/sra.ll
===================================================================
--- test/CodeGen/AMDGPU/sra.ll
+++ test/CodeGen/AMDGPU/sra.ll
@@ -231,7 +231,7 @@
 ; GCN-DAG: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
 ; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31
 ; GCN: s_mov_b32 s[[COPYSHIFT:[0-9]+]], s[[SHIFT]]
-; GCN: s_add_u32 {{s[0-9]+}}, s[[HI]], {{s[0-9]+}}
+; GCN: s_add_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}}
 ; GCN: s_addc_u32 {{s[0-9]+}}, s[[COPYSHIFT]], {{s[0-9]+}}
 define void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %result = ashr i64 %a, 63
Index: test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
===================================================================
--- test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
+++ test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
@@ -6,10 +6,10 @@
 ; CHECK-LABEL: foobar:
 ; CHECK: s_load_dword s2, s[0:1], 0x9
 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
-; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_cmp_eq_i32_e32 vcc, 0, v1
+; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64 [[VMBCNT:v[0-9]+]]
+; CHECK-DAG: s_waitcnt lgkmcnt(0)
+; CHECK-DAG: v_mov_b32_e32 v0, s2
+; CHECK-DAG: v_cmp_eq_i32_e32 vcc, 0, [[VMBCNT]]
 ; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
 ; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
 ; BB0_1:
Index: test/CodeGen/AMDGPU/trunc.ll
===================================================================
--- test/CodeGen/AMDGPU/trunc.ll
+++ test/CodeGen/AMDGPU/trunc.ll
@@ -35,11 +35,12 @@
 ; SI: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
 ; SI: s_lshl_b64 s{{\[}}[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_SREG]]:{{[0-9]+\]}}, 2
 ; SI: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]],
-; SI: s_addc_u32
-; SI: v_mov_b32_e32
-; SI: v_mov_b32_e32
-; SI: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
-; SI: buffer_store_dword v[[LO_VREG]],
+; SI: s_addc_u32 s[[HI_SREG2:[0-9]+]]
+; SI-DAG: v_mov_b32_e32 v[[LO_VREG2:[0-9]+]], s[[LO_SREG2]]
+; SI-DAG: v_mov_b32_e32 v[[HI_VREG2:[0-9]+]], s[[HI_SREG2]]
+; SI-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
+; SI-DAG: buffer_store_dword v[[LO_VREG]],
+; SI-DAG: buffer_store_dwordx2 v{{\[}}[[LO_VREG2]]:[[HI_VREG2]]{{\]}},
 define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
   %aa = add i64 %a, 234 ; Prevent shrinking store.
   %b = shl i64 %aa, 2
Index: test/CodeGen/AMDGPU/udivrem.ll
===================================================================
--- test/CodeGen/AMDGPU/udivrem.ll
+++ test/CodeGen/AMDGPU/udivrem.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=SIVI --check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=SIVI --check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s

 ; FUNC-LABEL: {{^}}test_udivrem:
@@ -27,30 +27,38 @@
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT

-; SI: v_rcp_iflag_f32_e32 [[RCP:v[0-9]+]]
-; SI-DAG: v_mul_hi_u32 [[RCP_HI:v[0-9]+]], [[RCP]]
-; SI-DAG: v_mul_lo_i32 [[RCP_LO:v[0-9]+]], [[RCP]]
-; SI-DAG: v_sub_i32_e32 [[NEG_RCP_LO:v[0-9]+]], vcc, 0, [[RCP_LO]]
-; SI: v_cndmask_b32_e64
-; SI: v_mul_hi_u32 [[E:v[0-9]+]], {{v[0-9]+}}, [[RCP]]
-; SI-DAG: v_add_i32_e32 [[RCP_A_E:v[0-9]+]], vcc, [[E]], [[RCP]]
-; SI-DAG: v_subrev_i32_e32 [[RCP_S_E:v[0-9]+]], vcc, [[E]], [[RCP]]
-; SI: v_cndmask_b32_e64
-; SI: v_mul_hi_u32 [[Quotient:v[0-9]+]]
-; SI: v_mul_lo_i32 [[Num_S_Remainder:v[0-9]+]]
-; SI-DAG: v_sub_i32_e32 [[Remainder:v[0-9]+]], vcc, {{[vs][0-9]+}}, [[Num_S_Remainder]]
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
-; SI: v_and_b32_e32 [[Tmp1:v[0-9]+]]
-; SI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], vcc, 1, [[Quotient]]
-; SI-DAG: v_subrev_i32_e32 [[Quotient_S_One:v[0-9]+]],
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_add_i32_e32 [[Remainder_A_Den:v[0-9]+]],
-; SI-DAG: v_subrev_i32_e32 [[Remainder_S_Den:v[0-9]+]],
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
-; SI: s_endpgm
+; SIVI: v_rcp_iflag_f32_e32 [[RCP0:v[0-9]+]]
+; SIVI-DAG: v_mul_f32_e32 [[RCP1:v[0-9]+]], 0x4f800000, [[RCP0]]
+; SIVI-DAG: v_cvt_u32_f32_e32 [[RCP:v[0-9]+]], [[RCP1]]
+; SIVI-DAG: v_mul_hi_u32 [[RCP_HI:v[0-9]+]], [[RCP]]
+; SIVI-DAG: v_mul_lo_i32 [[RCP_LO:v[0-9]+]], [[RCP]]
+; SIVI: v_cmp_eq_i32_e{{64|32}} [[ZERO_RCP_HI:s\[[0-9]+:[0-9]+\]|vcc]], 0, [[RCP_HI]]
+; SIVI-DAG: v_sub_i32_e32 [[NEG_RCP_LO:v[0-9]+]], vcc, 0, [[RCP_LO]]
+; SIVI-DAG: v_cndmask_b32_e64 [[RCP_C_LO:v[0-9]+]], [[RCP_LO]], [[NEG_RCP_LO]], [[ZERO_RCP_HI]]
+; SIVI-DAG: v_mul_hi_u32 [[E:v[0-9]+]], [[RCP_C_LO]], [[RCP]]
+; SIVI-DAG: v_add_i32_e32 [[RCP_A_E:v[0-9]+]], vcc, [[E]], [[RCP]]
+; SIVI-DAG: v_subrev_i32_e32 [[RCP_S_E:v[0-9]+]], vcc, [[E]], [[RCP]]
+; SIVI-DAG: v_cndmask_b32_e64 [[RCP_C_E:v[0-9]+]], [[RCP_S_E]], [[RCP_A_E]], [[ZERO_RCP_HI]]
+; SIVI-DAG: v_mul_hi_u32 [[Quotient:v[0-9]+]], [[RCP_C_E]], {{s[0-9]+}}
+; SIVI-DAG: v_mul_lo_i32 [[Num_S_Remainder:v[0-9]+]], [[Quotient]], {{s[0-9]+}}
+; SIVI-DAG: v_sub_i32_e32 [[Remainder:v[0-9]+]], vcc, {{s[0-9]+}}, [[Num_S_Remainder]]
+; SIVI-DAG: v_cmp_ge_u32_e{{64|32}} [[GE_S_Rem:s\[[0-9]+:[0-9]+\]|vcc]], {{s[0-9]+}}, [[Num_S_Remainder]]
+; SIVI: v_cndmask_b32_e64 [[C_GE_S_Rem:v[0-9]+]], 0, -1, [[GE_S_Rem]]
+; SIVI-DAG: v_cmp_le_u32_e{{32|64}} [[LE_Rem:s\[[0-9]+:[0-9]+\]|vcc]], {{s[0-9]+}}, [[Remainder]]
+; SIVI-DAG: v_cndmask_b32_e64 [[C_LE_Rem:v[0-9]+]], 0, -1, [[LE_Rem]]
+; SI-DAG: v_cmp_eq_i32_e{{64|32}} [[EQ0_C_GE_S_Rem:s\[[0-9]+:[0-9]+\]|vcc]], 0, [[C_GE_S_Rem]]
+; VI: v_cmp_eq_i32_e{{64|32}} [[EQ0_C_GE_S_Rem:s\[[0-9]+:[0-9]+\]|vcc]], 0, [[C_GE_S_Rem]]
+; SIVI-DAG: v_and_b32_e32 [[Tmp1:v[0-9]+]], [[C_GE_S_Rem]], [[C_LE_Rem]]
+; SIVI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], vcc, 1, [[Quotient]]
+; SIVI-DAG: v_cmp_eq_i32_e64 [[EQ0_Tmp1:s\[[0-9]+:[0-9]+\]]], 0, [[Tmp1]]
+; SIVI-DAG: v_add_i32_e32 [[Quotient_A_NegOne:v[0-9]+]], vcc, -1, [[Quotient]]
+; SIVI-DAG: v_cndmask_b32_e64 [[Quotient_B:v[0-9]+]], [[Quotient_A_One]], [[Quotient]], [[EQ0_Tmp1]]
+; SIVI-DAG: v_subrev_i32_e32 [[Quotient_S_One:v[0-9]+]], vcc, {{s[0-9]+}}, [[Remainder]]
+; SIVI-DAG: v_cndmask_b32_e64 [[Tmp2:v[0-9]+]], [[Quotient_S_One]], [[Remainder]], [[EQ0_Tmp1]]
+; SIVI-DAG: v_add_i32_e32 [[Remainder_A_Den:v[0-9]+]], vcc, {{s[0-9]+}}, [[Remainder]]
+; SIVI-DAG: v_cndmask_b32_e{{64|32}} [[Tmp3:v[0-9]+]], [[Quotient_B]], [[Quotient_A_NegOne]]
+; SIVI-DAG: v_cndmask_b32_e{{64|32}} [[Tmp4:v[0-9]+]], [[Tmp2]], [[Remainder_A_Den]]
+; SIVI: s_endpgm
 define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) {
   %result0 = udiv i32 %x, %y
   store i32 %result0, i32 addrspace(1)* %out
@@ -107,53 +115,98 @@
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT

-; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]]
-; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]]
-; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]]
-; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], vcc, 0, [[FIRST_RCP_LO]]
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]]
-; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], vcc, [[FIRST_E]], [[FIRST_RCP]]
-; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], vcc, [[FIRST_E]], [[FIRST_RCP]]
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]]
-; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[0-9]+]], vcc, [[FIRST_Num_S_Remainder]], v{{[0-9]+}}
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]]
-; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]]
-; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]],
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]],
-; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]],
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]]
-; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]]
-; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]]
-; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], vcc, 0, [[SECOND_RCP_LO]]
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]]
-; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], vcc, [[SECOND_E]], [[SECOND_RCP]]
-; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], vcc, [[SECOND_E]], [[SECOND_RCP]]
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]]
-; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], vcc, [[SECOND_Num_S_Remainder]], v{{[0-9]+}}
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]]
-; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]]
-; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]],
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]],
-; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]],
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
-; SI: s_endpgm
+; SIVI: v_rcp_iflag_f32_e32 [[RCP0:v[0-9]+]]
+; SIVI-DAG: v_mul_f32_e32 [[RCP1:v[0-9]+]], 0x4f800000, [[RCP0]]
+; SIVI-DAG: v_cvt_u32_f32_e32 [[RCP:v[0-9]+]], [[RCP1]]
+; SIVI-DAG: v_rcp_iflag_f32_e32 [[E1_RCP0:v[0-9]+]]
+; SIVI-DAG: v_mul_f32_e32 [[E1_RCP1:v[0-9]+]], 0x4f800000, [[E1_RCP0]]
+; SIVI-DAG: v_cvt_u32_f32_e32 [[E1_RCP:v[0-9]+]], [[E1_RCP1]]
+
+; SIVI-DAG: v_mul_hi_u32 [[RCP_HI:v[0-9]+]], [[RCP]], [[TS11:s[0-9]+]]
+; SIVI-DAG: v_mul_lo_i32 [[RCP_LO:v[0-9]+]], [[RCP]], [[TS11]]
+; SIVI-DAG: v_mul_hi_u32 [[E1_RCP_HI:v[0-9]+]], [[E1_RCP]], [[E1_TS11:s[0-9]+]]
+; SIVI-DAG: v_mul_lo_i32 [[E1_RCP_LO:v[0-9]+]], [[E1_RCP]], [[E1_TS11]]
+
+; SIVI-DAG: v_cmp_eq_i32_e{{64|32}} [[ZERO_RCP_HI:s\[[0-9]+:[0-9]+\]|vcc]], 0, [[RCP_HI]]
+; SI-DAG: v_sub_i32_e32 [[NEG_RCP_LO:v[0-9]+]], vcc, 0, [[RCP_LO]]
+; VI: v_sub_i32_e32 [[NEG_RCP_LO:v[0-9]+]], vcc, 0, [[RCP_LO]]
+; SIVI-DAG: v_cndmask_b32_e64 [[RCP_C_LO:v[0-9]+]], [[RCP_LO]], [[NEG_RCP_LO]], [[ZERO_RCP_HI]]
+; SIVI-DAG: v_mul_hi_u32 [[E:v[0-9]+]], [[RCP_C_LO]], [[RCP]]
+; SIVI-DAG: v_add_i32_e32 [[RCP_A_E:v[0-9]+]], vcc, [[E]], [[RCP]]
+; SIVI-DAG: v_subrev_i32_e32 [[RCP_S_E:v[0-9]+]], vcc, [[E]], [[RCP]]
+; SIVI-DAG: v_cndmask_b32_e64 [[RCP_C_E:v[0-9]+]], [[RCP_S_E]], [[RCP_A_E]], [[ZERO_RCP_HI]]
+
+; SIVI-DAG: v_cmp_eq_i32_e{{64|32}} [[E1_ZERO_RCP_HI:s\[[0-9]+:[0-9]+\]|vcc]], 0, [[E1_RCP_HI]]
+; SIVI-DAG: v_sub_i32_e32 [[E1_NEG_RCP_LO:v[0-9]+]], vcc, 0, [[E1_RCP_LO]]
+; SIVI-DAG: v_cndmask_b32_e64 [[E1_RCP_C_LO:v[0-9]+]], [[E1_RCP_LO]], [[E1_NEG_RCP_LO]], [[E1_ZERO_RCP_HI]]
+; SI: v_mul_hi_u32 [[E1_E:v[0-9]+]], [[E1_RCP_C_LO]], [[E1_RCP]]
+; VI-DAG: v_mul_hi_u32 [[E1_E:v[0-9]+]], [[E1_RCP_C_LO]], [[E1_RCP]]
+; SIVI-DAG: v_add_i32_e32 [[E1_RCP_A_E:v[0-9]+]], vcc, [[E1_E]], [[E1_RCP]]
+; SIVI-DAG: v_subrev_i32_e32 [[E1_RCP_S_E:v[0-9]+]], vcc, [[E1_E]], [[E1_RCP]]
+; SIVI-DAG: v_cndmask_b32_e64 [[E1_RCP_C_E:v[0-9]+]], [[E1_RCP_S_E]], [[E1_RCP_A_E]], [[E1_ZERO_RCP_HI]]
+
+; SIVI-DAG: v_mul_hi_u32 [[Quotient:v[0-9]+]], [[RCP_C_E]], [[TS3:s[0-9]+]]
+; SIVI-DAG: v_mul_lo_i32 [[Num_S_Remainder:v[0-9]+]], [[Quotient]], [[TS11]]
+; SIVI-DAG: v_mul_hi_u32 [[E1_Quotient:v[0-9]+]], [[E1_RCP_C_E]], [[E1_TS3:s[0-9]+]]
+; SIVI-DAG: v_mul_lo_i32 [[E1_Num_S_Remainder:v[0-9]+]], [[E1_Quotient]], [[E1_TS11]]
+
+; SIVI-DAG: v_cmp_ge_u32_e{{64|32}} [[GE_S_Rem:s\[[0-9]+:[0-9]+\]|vcc]], [[TS3]], [[Num_S_Remainder]]
+; SIVI-DAG: v_cndmask_b32_e64 [[C_GE_S_Rem:v[0-9]+]], 0, -1, [[GE_S_Rem]]
+; SIVI-DAG: v_sub_i32_e32 [[Remainder:v[0-9]+]], vcc, [[TS3]], [[Num_S_Remainder]]
+; SIVI-DAG: v_cmp_le_u32_e{{32|64}} [[LE_Rem:s\[[0-9]+:[0-9]+\]|vcc]], [[TS11]], [[Remainder]]
+; SI: v_cndmask_b32_e64 [[C_LE_Rem:v[0-9]+]], 0, -1, [[LE_Rem]]
+; VI-DAG: v_cndmask_b32_e64 [[C_LE_Rem:v[0-9]+]], 0, -1, [[LE_Rem]]
+; VI-DAG: v_and_b32_e32 [[Tmp1:v[0-9]+]], [[C_GE_S_Rem]], [[C_LE_Rem]]
+; VI-DAG: v_cmp_eq_i32_e64 [[EQ0_Tmp1:s\[[0-9]+:[0-9]+\]]], 0, [[Tmp1]]
+; VI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], vcc, 1, [[Quotient]]
+; VI-DAG: v_cndmask_b32_e64 [[Quotient_B:v[0-9]+]], [[Quotient_A_One]], [[Quotient]], [[EQ0_Tmp1]]
+; VI-DAG: v_add_i32_e32 [[Quotient_A_NegOne:v[0-9]+]], vcc, -1, [[Quotient]]
+; VI-DAG: v_cmp_eq_i32_e{{64|32}} [[EQ0_C_GE_S_Rem:s\[[0-9]+:[0-9]+\]|vcc]], 0, [[C_GE_S_Rem]]
+; VI-DAG: v_cndmask_b32_e{{64|32}} [[Tmp3:v[0-9]+]], [[Quotient_B]], [[Quotient_A_NegOne]]
+
+; SI-DAG: v_sub_i32_e32 [[E1_Remainder:v[0-9]+]], vcc, [[E1_TS3]], [[E1_Num_S_Remainder]]
+; SI-DAG: v_cmp_ge_u32_e{{64|32}} [[E1_GE_S_Rem:s\[[0-9]+:[0-9]+\]|vcc]], [[E1_TS3]], [[E1_Num_S_Remainder]]
+; SI-DAG: v_cndmask_b32_e64 [[E1_C_GE_S_Rem:v[0-9]+]], 0, -1, [[E1_GE_S_Rem]]
+; SI-DAG: v_cmp_le_u32_e{{32|64}} [[E1_LE_Rem:s\[[0-9]+:[0-9]+\]|vcc]], [[E1_TS11]], [[E1_Remainder]]
+; SI: v_cndmask_b32_e64 [[E1_C_LE_Rem:v[0-9]+]], 0, -1, [[E1_LE_Rem]]
+; VI-DAG: v_cmp_ge_u32_e{{64|32}} [[E1_GE_S_Rem:s\[[0-9]+:[0-9]+\]|vcc]], [[E1_TS3]], [[E1_Num_S_Remainder]]
+; VI: v_cndmask_b32_e64 [[E1_C_GE_S_Rem:v[0-9]+]], 0, -1, [[E1_GE_S_Rem]]
+; VI-DAG: v_sub_i32_e32 [[E1_Remainder:v[0-9]+]], vcc, [[E1_TS3]], [[E1_Num_S_Remainder]]
+; VI-DAG: v_cmp_le_u32_e{{32|64}} [[E1_LE_Rem:s\[[0-9]+:[0-9]+\]|vcc]], [[E1_TS11]], [[E1_Remainder]]
+; VI-DAG: v_cndmask_b32_e64 [[E1_C_LE_Rem:v[0-9]+]], 0, -1, [[E1_LE_Rem]]
+; VI-DAG: v_and_b32_e32 [[E1_Tmp1:v[0-9]+]], [[E1_C_GE_S_Rem]], [[E1_C_LE_Rem]]
+; VI-DAG: v_cmp_eq_i32_e64 [[E1_EQ0_Tmp1:s\[[0-9]+:[0-9]+\]]], 0, [[E1_Tmp1]]
+; VI-DAG: v_add_i32_e32 [[E1_Quotient_A_One:v[0-9]+]], vcc, 1, [[E1_Quotient]]
+; VI-DAG: v_cndmask_b32_e64 [[E1_Quotient_B:v[0-9]+]], [[E1_Quotient_A_One]], [[E1_Quotient]], [[E1_EQ0_Tmp1]]
+; VI-DAG: v_add_i32_e32 [[E1_Quotient_A_NegOne:v[0-9]+]], vcc, -1, [[E1_Quotient]]
+; VI-DAG: v_cmp_eq_i32_e{{64|32}} [[E1_EQ0_C_GE_S_Rem:s\[[0-9]+:[0-9]+\]|vcc]], 0, [[E1_C_GE_S_Rem]]
+; VI-DAG: v_cndmask_b32_e{{64|32}} [[E1_Tmp3:v[0-9]+]], [[E1_Quotient_B]], [[E1_Quotient_A_NegOne]]
+
+; SI-DAG: v_and_b32_e32 [[Tmp1:v[0-9]+]], [[C_GE_S_Rem]], [[C_LE_Rem]]
+; SI-DAG: v_cmp_eq_i32_e64 [[EQ0_Tmp1:s\[[0-9]+:[0-9]+\]]], 0, [[Tmp1]]
+; SI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], vcc, 1, [[Quotient]]
+; SI-DAG: v_cndmask_b32_e64 [[Quotient_B:v[0-9]+]], [[Quotient_A_One]], [[Quotient]], [[EQ0_Tmp1]]
+; SI-DAG: v_add_i32_e32 [[Quotient_A_NegOne:v[0-9]+]], vcc, -1, [[Quotient]]
+; SI-DAG: v_cmp_eq_i32_e{{64|32}} [[EQ0_C_GE_S_Rem:s\[[0-9]+:[0-9]+\]|vcc]], 0, [[C_GE_S_Rem]]
+; SI-DAG: v_cndmask_b32_e{{64|32}} [[Tmp3:v[0-9]+]], [[Quotient_B]], [[Quotient_A_NegOne]]
+
+; SI-DAG: v_and_b32_e32 [[E1_Tmp1:v[0-9]+]], [[E1_C_GE_S_Rem]], [[E1_C_LE_Rem]]
+; SI-DAG: v_cmp_eq_i32_e64 [[E1_EQ0_Tmp1:s\[[0-9]+:[0-9]+\]]], 0, [[E1_Tmp1]]
+; SI-DAG: v_add_i32_e32 [[E1_Quotient_A_One:v[0-9]+]], vcc, 1, [[E1_Quotient]]
+; SI-DAG: v_cndmask_b32_e64 [[E1_Quotient_B:v[0-9]+]], [[E1_Quotient_A_One]], [[E1_Quotient]], [[E1_EQ0_Tmp1]]
+; SI-DAG: v_add_i32_e32 [[E1_Quotient_A_NegOne:v[0-9]+]], vcc, -1, [[E1_Quotient]]
+; SI-DAG: v_cmp_eq_i32_e{{64|32}} [[E1_EQ0_C_GE_S_Rem:s\[[0-9]+:[0-9]+\]|vcc]], 0, [[E1_C_GE_S_Rem]]
+; SI-DAG: v_cndmask_b32_e{{64|32}} [[E1_Tmp3:v[0-9]+]], [[E1_Quotient_B]], [[E1_Quotient_A_NegOne]]
+
+; SIVI-DAG: v_subrev_i32_e32 [[Quotient_S_One:v[0-9]+]], vcc, [[TS11]], [[Remainder]]
+; SIVI-DAG: v_cndmask_b32_e64 [[Tmp2:v[0-9]+]], [[Quotient_S_One]], [[Remainder]]
+; SIVI-DAG: v_add_i32_e32 [[Remainder_A_Den:v[0-9]+]], vcc, [[TS11]], [[Remainder]]
+; SIVI-DAG: v_cndmask_b32_e{{64|32}} [[Tmp4:v[0-9]+]], [[Tmp2]], [[Remainder_A_Den]]
+; SIVI-DAG: v_subrev_i32_e32 [[E1_Quotient_S_One:v[0-9]+]], vcc, [[E1_TS11]], [[E1_Remainder]]
+; SIVI-DAG: v_cndmask_b32_e64 [[E1_Tmp2:v[0-9]+]], [[E1_Quotient_S_One]], [[E1_Remainder]], [[E1_EQ0_Tmp1]]
+; SIVI-DAG: v_add_i32_e32 [[E1_Remainder_A_Den:v[0-9]+]], vcc, [[E1_TS11]], [[E1_Remainder]]
+; SIVI-DAG: v_cndmask_b32_e{{64|32}} [[E1_Tmp4:v[0-9]+]], [[E1_Tmp2]], [[E1_Remainder_A_Den]]
+; SIVI: s_endpgm
 define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
   %result0 = udiv <2 x i32> %x, %y
   store <2 x i32> %result0, <2 x i32> addrspace(1)* %out
@@ -257,85 +310,85 @@
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT

-; SI-DAG: v_rcp_iflag_f32_e32
-; SI-DAG: v_mul_hi_u32
-; SI-DAG: v_mul_lo_i32
-; SI-DAG: v_sub_i32_e32
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_mul_hi_u32
-; SI-DAG: v_add_i32_e32
-; SI-DAG: v_subrev_i32_e32
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_mul_hi_u32
-; SI-DAG: v_mul_lo_i32
-; SI-DAG: v_subrev_i32_e32
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_and_b32_e32
-; SI-DAG: v_add_i32_e32
-; SI-DAG: v_subrev_i32_e32
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_add_i32_e32
-; SI-DAG: v_subrev_i32_e32
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_rcp_iflag_f32_e32
-; SI-DAG: v_mul_hi_u32
-; SI-DAG: v_mul_lo_i32
-; SI-DAG: v_sub_i32_e32
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_mul_hi_u32
-; SI-DAG: v_add_i32_e32
-; SI-DAG: v_subrev_i32_e32
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_mul_hi_u32
-; SI-DAG: v_mul_lo_i32
-; SI-DAG: v_subrev_i32_e32
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_and_b32_e32
-; SI-DAG: v_add_i32_e32
-; SI-DAG: v_subrev_i32_e32
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_add_i32_e32
-; SI-DAG: v_subrev_i32_e32
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_rcp_iflag_f32_e32
-; SI-DAG: v_mul_hi_u32
-; SI-DAG: v_mul_lo_i32
-; SI-DAG: v_sub_i32_e32
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_mul_hi_u32
-; SI-DAG: v_add_i32_e32
-; SI-DAG: v_subrev_i32_e32
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_mul_hi_u32
-; SI-DAG: v_mul_lo_i32
-; SI-DAG: v_subrev_i32_e32
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_and_b32_e32
-; SI-DAG: v_add_i32_e32
-; SI-DAG: v_subrev_i32_e32
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_add_i32_e32
-; SI-DAG: v_subrev_i32_e32
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_rcp_iflag_f32_e32
-; SI-DAG: v_mul_hi_u32
-; SI-DAG: v_mul_lo_i32
-; SI-DAG: v_sub_i32_e32
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_mul_hi_u32
-; SI-DAG: v_add_i32_e32
-; SI-DAG: v_subrev_i32_e32
-; SI-DAG: v_cndmask_b32_e64
-; SI: s_endpgm
+; SIVI-DAG: v_rcp_iflag_f32_e32
+; SIVI-DAG: v_mul_hi_u32
+; SIVI-DAG: v_mul_lo_i32
+; SIVI-DAG: v_sub_i32_e32
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_mul_hi_u32
+; SIVI-DAG: v_add_i32_e32
+; SIVI-DAG: v_subrev_i32_e32
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_mul_hi_u32
+; SIVI-DAG: v_mul_lo_i32
+; SIVI-DAG: v_subrev_i32_e32
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_and_b32_e32
+; SIVI-DAG: v_add_i32_e32
+; SIVI-DAG: v_subrev_i32_e32
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_add_i32_e32
+; SIVI-DAG: v_subrev_i32_e32
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_rcp_iflag_f32_e32
+; SIVI-DAG: v_mul_hi_u32
+; SIVI-DAG: v_mul_lo_i32
+; SIVI-DAG: v_sub_i32_e32
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_mul_hi_u32
+; SIVI-DAG: v_add_i32_e32
+; SIVI-DAG: v_subrev_i32_e32
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_mul_hi_u32
+; SIVI-DAG: v_mul_lo_i32
+; SIVI-DAG: v_subrev_i32_e32
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_and_b32_e32
+; SIVI-DAG: v_add_i32_e32
+; SIVI-DAG: v_subrev_i32_e32
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_add_i32_e32
+; SIVI-DAG: v_subrev_i32_e32
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_rcp_iflag_f32_e32
+; SIVI-DAG: v_mul_hi_u32
+; SIVI-DAG: v_mul_lo_i32
+; SIVI-DAG: v_sub_i32_e32
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_mul_hi_u32
+; SIVI-DAG: v_add_i32_e32
+; SIVI-DAG: v_subrev_i32_e32
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_mul_hi_u32
+; SIVI-DAG: v_mul_lo_i32
+; SIVI-DAG: v_subrev_i32_e32
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_and_b32_e32
+; SIVI-DAG: v_add_i32_e32
+; SIVI-DAG: v_subrev_i32_e32
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_add_i32_e32
+; SIVI-DAG: v_subrev_i32_e32
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_rcp_iflag_f32_e32
+; SIVI-DAG: v_mul_hi_u32
+; SIVI-DAG: v_mul_lo_i32
+; SIVI-DAG: v_sub_i32_e32
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI-DAG: v_mul_hi_u32
+; SIVI-DAG: v_add_i32_e32
+; SIVI-DAG: v_subrev_i32_e32
+; SIVI-DAG: v_cndmask_b32_e64
+; SIVI: s_endpgm
 define void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
   %result0 = udiv <4 x i32> %x, %y
   store <4 x i32> %result0, <4 x i32> addrspace(1)* %out
Index: test/CodeGen/AMDGPU/uint_to_fp.i64.ll
===================================================================
--- test/CodeGen/AMDGPU/uint_to_fp.i64.ll
+++ test/CodeGen/AMDGPU/uint_to_fp.i64.ll
@@ -13,16 +13,22 @@
 ; FUNC-LABEL: {{^}}v_uint_to_fp_i64_to_f32:
 ; GCN: {{buffer|flat}}_load_dwordx2

-; GCN: v_ffbh_u32
-; GCN: v_ffbh_u32
-; GCN: v_cndmask
-; GCN: v_cndmask
+; GCN: v_ffbh_u32_{{[^ ]*}} [[V4:v[0-9]+]], v[[N2:[0-9]+]]
+; GCN-DAG: v_ffbh_u32_{{[^ ]*}} v[[N5:[0-9]+]], v[[N3:[0-9]+]]
+; GCN-DAG: v_add_i32_{{[^ ]*}} [[V4A:v[0-9]+]], vcc, 32, [[V4]]
+; GCN-DAG: v_cndmask_b32_{{[^ ]*}} [[V4B:v[0-9]+]], v[[N5]], [[V4A]]

-; GCN-DAG: v_cmp_eq_i64
-; GCN-DAG: v_cmp_lt_u64
+; SI-DAG: v_lshl_{{[^ ]*}} v{{\[[0-9]+}}:[[N3A:[0-9]+]]{{\]}}, v{{\[}}[[N2]]:[[N3]]{{\]}}, [[V4B]]
+; VI-DAG: v_lshlrev_{{[^ ]*}} v{{\[[0-9]+}}:[[N3A:[0-9]+]]{{\]}}, [[V4B]], v{{\[}}[[N2]]:[[N3]]{{\]}}
+
+; GCN-DAG: v_sub_i32_{{[^ ]*}} [[V4C:v[0-9]+]], vcc, 0xbe, [[V4B]]
+; GCN-DAG: v_and_b32_{{[^ ]*}} v[[N6:[0-9]+]], 0xff, v[[N3A]]
+; GCN-DAG: v_cndmask_b32_{{[^ ]*}} {{v[0-9]+}}, 0, [[V4C]]
+; GCN-DAG: v_cmp_eq_i64_{{[^ ]*}} {{s\[[0-9]+:[0-9]+\]|vcc}}, {{s\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[N6]]{{\]}}
+; GCN-DAG: v_cmp_lt_u64_{{[^ ]*}} {{s\[[0-9]+:[0-9]+\]|vcc}}, {{s\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[N6]]{{\]}}

 ; GCN: v_add_i32_e32 [[VR:v[0-9]+]]
-; GCN: {{buffer|flat}}_store_dword {{.*}}[[VR]]
+; GCN-DAG: {{buffer|flat}}_store_dword {{.*}}[[VR]]
 define void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
Index: test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
===================================================================
--- test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -28,8 +28,8 @@
 }

 ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b:
-; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
 ; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
 ; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
@@ -68,8 +68,8 @@
 }

 ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a:
-; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
 ; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
 ; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
@@ -82,8 +82,8 @@
 }

 ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a:
-; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
 ; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
 ; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
===================================================================
--- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -27,10 +27,10 @@
 ; VIMESA-NEXT: s_mov_b32 s15, 0x980000

-; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill
+; GCN: buffer_store_dword {{v[0-9]+}}, [[S1215:s\[[0-9]+:[0-9]+\]]], [[S16:s[0-9]+]] offset:{{[0-9]+}} ; 4-byte Folded Spill

-; GCN: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
-; GCN: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, [[S1215]], [[S16]] offen offset:{{[0-9]+}}
+; GCN: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, [[S1215]], [[S16]] offen offset:{{[0-9]+}}

 ; GCN: NumVgprs: 256
 ; GCN: ScratchSize: 1024
Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
===================================================================
--- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -18,8 +18,8 @@
 ; VI-NEXT: s_mov_b32 s15, 0x980000

 ; s12 is offset user SGPR
-; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Spill
-; GCN: buffer_load_dword v{{[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Reload
+; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], [[S11:s[0-9]+]] offset:{{[0-9]+}} ; 16-byte Folded Spill
+; GCN: buffer_load_dword v{{[0-9]+}}, s[12:15], [[S11]] offset:{{[0-9]+}} ; 16-byte Folded Reload

 ; GCN: NumVgprs: 256
 ; GCN: ScratchSize: 1024
Index: test/CodeGen/AMDGPU/vselect.ll
===================================================================
--- test/CodeGen/AMDGPU/vselect.ll
+++ test/CodeGen/AMDGPU/vselect.ll
@@ -7,7 +7,7 @@
 ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}

-;SI: v_cndmask_b32_e64
+;SI: v_cndmask_b32_e{{64|32}}
 ;SI: v_cndmask_b32_e32

 define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
@@ -25,7 +25,7 @@
 ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}

-;SI: v_cndmask_b32_e64
+;SI: v_cndmask_b32_e{{64|32}}
 ;SI: v_cndmask_b32_e32

 define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) {
Index: test/CodeGen/AMDGPU/waitcnt-flat.ll
===================================================================
--- test/CodeGen/AMDGPU/waitcnt-flat.ll
+++ test/CodeGen/AMDGPU/waitcnt-flat.ll
@@ -6,9 +6,9 @@
 ; for the original bug.

 ; GCN: {{^}}test:
-; GCN: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[DATA:v[0-9]+]]
+; GCN: flat_store_dword [[ADDR:v\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}
 ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN: flat_load_dword [[DATA]], v[{{[0-9]+:[0-9]+}}]
+; GCN: flat_load_dword {{v[0-9]+}}, [[ADDR]]
 define void @test(i32 addrspace(1)* %out, i32 %in) {
   store volatile i32 0, i32 addrspace(1)* %out
   %val = load volatile i32, i32 addrspace(1)* %out
Index: test/MC/AMDGPU/trap.s
===================================================================
--- /dev/null
+++ test/MC/AMDGPU/trap.s
@@ -0,0 +1,99 @@
+// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s --check-prefix=SICI
+// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s --check-prefix=SICI
+// RUN: llvm-mc -arch=amdgcn -mcpu=fiji -show-encoding %s | FileCheck %s --check-prefix=VI
+
+//===----------------------------------------------------------------------===//
+// Trap Handler related - 32 bit registers
+//===----------------------------------------------------------------------===//
+
+s_add_u32 ttmp0, ttmp0, 4
+// SICI: s_add_u32 ttmp0, ttmp0, 4 ; encoding: [0x70,0x84,0x70,0x80]
+// VI: s_add_u32 ttmp0, ttmp0, 4 ; encoding: [0x70,0x84,0x70,0x80]
+
+s_add_u32 ttmp4, 8, ttmp4
+// SICI: s_add_u32 ttmp4, 8, ttmp4 ; encoding: [0x88,0x74,0x74,0x80]
+// VI: s_add_u32 ttmp4, 8, ttmp4 ; encoding: [0x88,0x74,0x74,0x80]
+
+s_add_u32 ttmp4, ttmp4, 0x00000100
+// SICI: s_add_u32 ttmp4, ttmp4, 0x100 ; encoding: [0x74,0xff,0x74,0x80,0x00,0x01,0x00,0x00]
+// VI: s_add_u32 ttmp4, ttmp4, 0x100 ; encoding: [0x74,0xff,0x74,0x80,0x00,0x01,0x00,0x00]
+
+s_add_u32 ttmp4, ttmp4, 4
+// SICI: s_add_u32 ttmp4, ttmp4, 4 ; encoding: [0x74,0x84,0x74,0x80]
+// VI: s_add_u32 ttmp4, ttmp4, 4 ; encoding: [0x74,0x84,0x74,0x80]
+
+s_add_u32 ttmp4, ttmp8, ttmp4
+// SICI: s_add_u32 ttmp4, ttmp8, ttmp4 ; encoding: [0x78,0x74,0x74,0x80]
+// VI: s_add_u32 ttmp4, ttmp8, ttmp4 ; encoding: [0x78,0x74,0x74,0x80]
+
+s_and_b32 ttmp10, ttmp8, 0x00000080
+// SICI: s_and_b32 ttmp10, ttmp8, 0x80 ; encoding: [0x78,0xff,0x7a,0x87,0x80,0x00,0x00,0x00]
+// VI: s_and_b32 ttmp10, ttmp8, 0x80 ; encoding: [0x78,0xff,0x7a,0x86,0x80,0x00,0x00,0x00]
+
+s_and_b32 ttmp9, tma_hi, 0x0000ffff
+// SICI: s_and_b32 ttmp9, tma_hi, 0xffff ; encoding: [0x6f,0xff,0x79,0x87,0xff,0xff,0x00,0x00]
+// VI: s_and_b32 ttmp9, tma_hi, 0xffff ; encoding: [0x6f,0xff,0x79,0x86,0xff,0xff,0x00,0x00]
+
+s_and_b32 ttmp9, ttmp9, 0x000001ff
+// SICI: s_and_b32 ttmp9, ttmp9, 0x1ff ; encoding: [0x79,0xff,0x79,0x87,0xff,0x01,0x00,0x00]
+// VI: s_and_b32 ttmp9, ttmp9, 0x1ff ; encoding: [0x79,0xff,0x79,0x86,0xff,0x01,0x00,0x00]
+
+s_and_b32 ttmp9, tma_lo, 0xffff0000
+// SICI: s_and_b32 ttmp9, tma_lo, 0xffff0000 ; encoding: [0x6e,0xff,0x79,0x87,0x00,0x00,0xff,0xff]
+// VI: s_and_b32 ttmp9, tma_lo, 0xffff0000 ; encoding: [0x6e,0xff,0x79,0x86,0x00,0x00,0xff,0xff]
+
+s_and_b32 ttmp9, ttmp9, ttmp8
+// SICI: s_and_b32 ttmp9, ttmp9, ttmp8 ; encoding: [0x79,0x78,0x79,0x87]
+// VI: s_and_b32 ttmp9, ttmp9, ttmp8 ; encoding: [0x79,0x78,0x79,0x86]
+
+s_and_b32 ttmp8, ttmp1, 0x01000000
+// SICI: s_and_b32 ttmp8, ttmp1, 0x1000000 ; encoding: [0x71,0xff,0x78,0x87,0x00,0x00,0x00,0x01]
+// VI: s_and_b32 ttmp8, ttmp1, 0x1000000 ; encoding: [0x71,0xff,0x78,0x86,0x00,0x00,0x00,0x01]
+
+s_cmp_eq_i32 ttmp8, 0
+// SICI: s_cmp_eq_i32 ttmp8, 0 ; encoding: [0x78,0x80,0x00,0xbf]
+// VI: s_cmp_eq_i32 ttmp8, 0 ; encoding: [0x78,0x80,0x00,0xbf]
+
+s_cmp_eq_i32 ttmp8, 0x000000fe
+// SICI: s_cmp_eq_i32 ttmp8, 0xfe ; encoding: [0x78,0xff,0x00,0xbf,0xfe,0x00,0x00,0x00]
+// VI: s_cmp_eq_i32 ttmp8, 0xfe ; encoding: [0x78,0xff,0x00,0xbf,0xfe,0x00,0x00,0x00]
+
+s_lshr_b32 ttmp8, ttmp8, 12
+// SICI: s_lshr_b32 ttmp8, ttmp8, 12 ; encoding: [0x78,0x8c,0x78,0x90]
+// VI: s_lshr_b32 ttmp8, ttmp8, 12 ; encoding: [0x78,0x8c,0x78,0x8f]
+
+s_mov_b32 m0, ttmp8
+// SICI: s_mov_b32 m0, ttmp8 ; encoding: [0x78,0x03,0xfc,0xbe]
+// VI: s_mov_b32 m0, ttmp8 ; encoding: [0x78,0x00,0xfc,0xbe]
+
+s_mov_b32 ttmp10, 0
+// SICI: s_mov_b32 ttmp10, 0 ; encoding: [0x80,0x03,0xfa,0xbe]
+// VI: s_mov_b32 ttmp10, 0 ; encoding: [0x80,0x00,0xfa,0xbe]
+
+s_mov_b32 ttmp11, 0x01024fac
+// SICI: s_mov_b32 ttmp11, 0x1024fac ; encoding: [0xff,0x03,0xfb,0xbe,0xac,0x4f,0x02,0x01]
+// VI: s_mov_b32 ttmp11, 0x1024fac ; encoding: [0xff,0x00,0xfb,0xbe,0xac,0x4f,0x02,0x01]
+
+s_mov_b32 ttmp8, m0
+// SICI: s_mov_b32 ttmp8, m0 ; encoding: [0x7c,0x03,0xf8,0xbe]
+// VI: s_mov_b32 ttmp8, m0 ; encoding: [0x7c,0x00,0xf8,0xbe]
+
+s_mov_b32 ttmp8, tma_lo
+// SICI: s_mov_b32 ttmp8, tma_lo ; encoding: [0x6e,0x03,0xf8,0xbe]
+// VI: s_mov_b32 ttmp8, tma_lo ; encoding: [0x6e,0x00,0xf8,0xbe]
+
+s_mul_i32 ttmp8, 0x00000324, ttmp8
+// SICI: s_mul_i32 ttmp8, 0x324, ttmp8 ; encoding: [0xff,0x78,0x78,0x93,0x24,0x03,0x00,0x00]
+// VI: s_mul_i32 ttmp8, 0x324, ttmp8 ; encoding: [0xff,0x78,0x78,0x92,0x24,0x03,0x00,0x00]
+
+s_or_b32 ttmp9, ttmp9, 0x00280000
+// SICI: s_or_b32 ttmp9, ttmp9, 0x280000 ; encoding: [0x79,0xff,0x79,0x88,0x00,0x00,0x28,0x00]
+// VI: s_or_b32 ttmp9, ttmp9, 0x280000 ; encoding: [0x79,0xff,0x79,0x87,0x00,0x00,0x28,0x00]
+
+//===----------------------------------------------------------------------===//
+// Trap Handler related - Pairs and quadruples of registers
+//===----------------------------------------------------------------------===//
+
+s_mov_b64 ttmp[4:5], exec
+// SICI: s_mov_b64 ttmp[4:5], exec ; encoding: [0x7e,0x04,0xf4,0xbe]
+// VI: s_mov_b64 ttmp[4:5], exec ; encoding: [0x7e,0x01,0xf4,0xbe]
\ No newline at end of file