Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -382,17 +382,32 @@ FlatUsed = true; continue; + case AMDGPU::TBA: + case AMDGPU::TBA_LO: + case AMDGPU::TBA_HI: + case AMDGPU::TMA: + case AMDGPU::TMA_LO: + case AMDGPU::TMA_HI: + llvm_unreachable("Trap Handler registers should not be used"); + continue; + default: break; } if (AMDGPU::SReg_32RegClass.contains(reg)) { + if (AMDGPU::TTMP_32RegClass.contains(reg)) { + llvm_unreachable("Trap Handler registers should not be used"); + } isSGPR = true; width = 1; } else if (AMDGPU::VGPR_32RegClass.contains(reg)) { isSGPR = false; width = 1; } else if (AMDGPU::SReg_64RegClass.contains(reg)) { + if (AMDGPU::TTMP_64RegClass.contains(reg)) { + llvm_unreachable("Trap Handler registers should not be used"); + } isSGPR = true; width = 2; } else if (AMDGPU::VReg_64RegClass.contains(reg)) { Index: lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp =================================================================== --- lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -546,8 +546,10 @@ } -static int getRegClass(bool IsVgpr, unsigned RegWidth) { - if (IsVgpr) { +typedef enum { IS_VGPR, IS_SGPR, IS_TTMP } registerKind; + +static int getRegClass(registerKind Is, unsigned RegWidth) { + if (Is == IS_VGPR) { switch (RegWidth) { default: return -1; case 1: return AMDGPU::VGPR_32RegClassID; @@ -557,16 +559,24 @@ case 8: return AMDGPU::VReg_256RegClassID; case 16: return AMDGPU::VReg_512RegClassID; } + } else if (Is == IS_TTMP) { + switch (RegWidth) { + default: return -1; + case 1: return AMDGPU::TTMP_32RegClassID; + case 2: return AMDGPU::TTMP_64RegClassID; +// case 4: return AMDGPU::TTMP_128RegClassID; + } + } else if (Is == IS_SGPR) { + switch (RegWidth) { + default: return -1; + case 1: return AMDGPU::SGPR_32RegClassID; + case 2: return AMDGPU::SGPR_64RegClassID; + case 4: return AMDGPU::SReg_128RegClassID; + case 8: return AMDGPU::SReg_256RegClassID; + case 16: return AMDGPU::SReg_512RegClassID; + } } - - switch (RegWidth) { - default: return -1; - case 1: return AMDGPU::SGPR_32RegClassID; - case 2: return AMDGPU::SGPR_64RegClassID; - case 4: return AMDGPU::SReg_128RegClassID; - case 8: return AMDGPU::SReg_256RegClassID; - case 16: return AMDGPU::SReg_512RegClassID; - } + return -1; } static unsigned getRegForName(StringRef RegName) { @@ -583,6 +593,10 @@ .Case("vcc_hi", AMDGPU::VCC_HI) .Case("exec_lo", AMDGPU::EXEC_LO) .Case("exec_hi", AMDGPU::EXEC_HI) + .Case("tma_lo", AMDGPU::TMA_LO) + .Case("tma_hi", AMDGPU::TMA_HI) + .Case("tba_lo", AMDGPU::TBA_LO) + .Case("tba_hi", AMDGPU::TBA_HI) .Default(0); } @@ -600,21 +614,21 @@ return !subtargetHasRegister(*TRI, RegNo); } - // Match vgprs and sgprs - if (RegName[0] != 's' && RegName[0] != 'v') + // Match vgprs, sgprs and ttmps + if (RegName[0] != 's' && RegName[0] != 'v' && !RegName.startswith("ttmp")) return true; - bool IsVgpr = RegName[0] == 'v'; + const registerKind Is = RegName[0] == 'v' ? IS_VGPR : RegName[0] == 's' ? IS_SGPR : IS_TTMP; unsigned RegWidth; unsigned RegIndexInClass; - if (RegName.size() > 1) { - // We have a 32-bit register + if (RegName.size() > (Is == IS_TTMP ? strlen("ttmp") : 1) ) { + // We have a single 32-bit register. Syntax: vXX RegWidth = 1; - if (RegName.substr(1).getAsInteger(10, RegIndexInClass)) + if (RegName.substr(Is == IS_TTMP ? strlen("ttmp") : 1).getAsInteger(10, RegIndexInClass)) return true; Parser.Lex(); } else { - // We have a register greater than 32-bits. + // We have a register greater than 32-bits (a range of single registers). Syntax: v[XX:YY] int64_t RegLo, RegHi; Parser.Lex(); @@ -637,11 +651,11 @@ Parser.Lex(); RegWidth = (RegHi - RegLo) + 1; - if (IsVgpr) { + if (Is == IS_VGPR) { // VGPR registers aren't aligned. RegIndexInClass = RegLo; } else { - // SGPR registers are aligned. Max alignment is 4 dwords. + // SGPR and TTMP registers must be are aligned. Max required alignment is 4 dwords. unsigned Size = std::min(RegWidth, 4u); if (RegLo % Size != 0) return true; @@ -650,7 +664,7 @@ } } - int RCID = getRegClass(IsVgpr, RegWidth); + int RCID = getRegClass(Is, RegWidth); if (RCID == -1) return true; Index: lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp =================================================================== --- lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -179,6 +179,18 @@ case AMDGPU::VCC_HI: O << "vcc_hi"; return; + case AMDGPU::TBA_LO: + O << "tba_lo"; + return; + case AMDGPU::TBA_HI: + O << "tba_hi"; + return; + case AMDGPU::TMA_LO: + O << "tma_lo"; + return; + case AMDGPU::TMA_HI: + O << "tma_hi"; + return; case AMDGPU::EXEC_LO: O << "exec_lo"; return; @@ -207,9 +219,12 @@ } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(reg)) { Type = 'v'; NumRegs = 2; - } else if (MRI.getRegClass(AMDGPU::SReg_64RegClassID).contains(reg)) { + } else if (MRI.getRegClass(AMDGPU::SGPR_64RegClassID).contains(reg)) { Type = 's'; NumRegs = 2; + } else if (MRI.getRegClass(AMDGPU::TTMP_64RegClassID).contains(reg)) { + Type = 't'; + NumRegs = 2; } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(reg)) { Type = 'v'; NumRegs = 4; @@ -239,12 +254,22 @@ // The low 8 bits of the encoding value is the register index, for both VGPRs // and SGPRs. unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1); + if (Type == 't') // Trap temps start at offset 112. TODO: Get this from tablegen. + RegIdx -= 112; // FIXME hack. if (NumRegs == 1) { - O << Type << RegIdx; + if (Type == 't') // FIXME hack + O << "ttmp"; + else + O << Type; + O << RegIdx; return; } - O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']'; + if (Type == 't') // FIXME hack + O << "ttmp"; + else + O << Type; + O << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']'; } void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, Index: lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.cpp +++ lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -90,6 +90,16 @@ reserveRegisterTuples(Reserved, AMDGPU::EXEC); reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); + // Reserve Trap Handler registers - support is not implemented in Codegen. + reserveRegisterTuples(Reserved, AMDGPU::TBA); + reserveRegisterTuples(Reserved, AMDGPU::TMA); + reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1); + reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3); + reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5); + reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7); + reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9); + reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11); + // Reserve the last 2 registers so we will always have at least 2 more that // will physically contain VCC. reserveRegisterTuples(Reserved, AMDGPU::SGPR102_SGPR103); @@ -573,7 +583,21 @@ switch(Channel) { case 0: return AMDGPU::VCC_LO; case 1: return AMDGPU::VCC_HI; - default: llvm_unreachable("Invalid SubIdx for VCC"); + default: llvm_unreachable("Invalid SubIdx for VCC"); break; + } + + case AMDGPU::TBA: + switch(Channel) { + case 0: return AMDGPU::TBA_LO; + case 1: return AMDGPU::TBA_HI; + default: llvm_unreachable("Invalid SubIdx for TBA"); break; + } + + case AMDGPU::TMA: + switch(Channel) { + case 0: return AMDGPU::TMA_LO; + case 1: return AMDGPU::TMA_HI; + default: llvm_unreachable("Invalid SubIdx for TMA"); break; } case AMDGPU::FLAT_SCR: Index: lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.td +++ lib/Target/AMDGPU/SIRegisterInfo.td @@ -44,6 +44,40 @@ def SCC : SIReg<"scc", 253>; def M0 : SIReg <"m0", 124>; +// Trap handler registers +def TBA_LO : SIReg<"tba_lo", 108>; +def TBA_HI : SIReg<"tba_hi", 109>; + +def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>, + DwarfRegAlias { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 108; +} + +def TMA_LO : SIReg<"tma_lo", 110>; +def TMA_HI : SIReg<"tma_hi", 111>; + +def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>, + DwarfRegAlias { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 110; +} + +def TTMP0 : SIReg <"ttmp0", 112>; +def TTMP1 : SIReg <"ttmp1", 113>; +def TTMP2 : SIReg <"ttmp2", 114>; +def TTMP3 : SIReg <"ttmp3", 115>; +def TTMP4 : SIReg <"ttmp4", 116>; +def TTMP5 : SIReg <"ttmp5", 117>; +def TTMP6 : SIReg <"ttmp6", 118>; +def TTMP7 : SIReg <"ttmp7", 119>; +def TTMP8 : SIReg <"ttmp8", 120>; +def TTMP9 : SIReg <"ttmp9", 121>; +def TTMP10 : SIReg <"ttmp10", 122>; +def TTMP11 : SIReg <"ttmp11", 123>; + multiclass FLAT_SCR_LOHI_m ci_e, bits<16> vi_e> { def _ci : SIReg; def _vi : SIReg; @@ -135,6 +169,24 @@ (add (decimate (shl SGPR_32, 14), 4)), (add (decimate (shl SGPR_32, 15), 4))]>; +// Trap handler TMP 32-bit registers +def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32], 32, + (add (sequence "TTMP%u", 0, 11))> { + let isAllocatable = 0; +} + +// Trap handler TMP 64-bit registers +def TTMP_64Regs : RegisterTuples<[sub0, sub1], + [(add (decimate TTMP_32, 2)), + (add (decimate (shl TTMP_32, 1), 2))]>; + +// Trap handler TMP 128-bit registers +def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3], + [(add (decimate TTMP_32, 4)), + (add (decimate (shl TTMP_32, 1), 4)), + (add (decimate (shl TTMP_32, 2), 4)), + (add (decimate (shl TTMP_32, 3), 4))]>; + // VGPR 32-bit registers def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add (sequence "VGPR%u", 0, 255))>; @@ -199,15 +251,24 @@ // Register class for all scalar registers (SGPRs + Special Registers) def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32, - (add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI) + (add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI, + TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI) >; def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)>; +def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add TTMP_64Regs)> { + let isAllocatable = 0; +} + def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32, - (add SGPR_64, VCC, EXEC, FLAT_SCR) + (add SGPR_64, VCC, EXEC, FLAT_SCR, TTMP_64) >; +//def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128Regs)>; +// +//def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)> { + def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128)> { // Requires 2 s_mov_b64 to copy let CopyCost = 2; Index: test/CodeGen/AMDGPU/and.ll =================================================================== --- test/CodeGen/AMDGPU/and.ll +++ test/CodeGen/AMDGPU/and.ll @@ -256,8 +256,8 @@ } ; FUNC-LABEL: {{^}}v_and_multi_use_constant_i64: -; SI: buffer_load_dwordx2 v{{\[}}[[LO0:[0-9]+]]:[[HI0:[0-9]+]]{{\]}} -; SI: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO0:[0-9]+]]:[[HI0:[0-9]+]]{{\]}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}} ; SI-DAG: s_mov_b32 [[KLO:s[0-9]+]], 0xab19b207{{$}} ; SI-DAG: s_movk_i32 [[KHI:s[0-9]+]], 0x11e{{$}} ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO0]] @@ -483,8 +483,8 @@ ; low 32-bits, which is not a valid 64-bit inline immmediate. ; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64: -; SI: s_load_dwordx2 -; SI: s_load_dword s +; SI-DAG: s_load_dwordx2 +; SI-DAG: s_load_dword s ; SI-NOT: and ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0 ; SI-NOT: and Index: test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll =================================================================== --- test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll +++ test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll @@ -3,11 +3,11 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset: -; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 -; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 +; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] ; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 @@ -23,10 +23,10 @@ ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset: ; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7 ; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0 -; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SICI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd +; VI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]] ; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]] Index: test/CodeGen/AMDGPU/bswap.ll =================================================================== --- test/CodeGen/AMDGPU/bswap.ll +++ test/CodeGen/AMDGPU/bswap.ll @@ -10,7 +10,7 @@ declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone ; FUNC-LABEL: @test_bswap_i32 -; SI: buffer_load_dword [[VAL:v[0-9]+]] +; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]] ; SI-DAG: v_alignbit_b32 [[TMP0:v[0-9]+]], [[VAL]], [[VAL]], 8 ; SI-DAG: v_alignbit_b32 [[TMP1:v[0-9]+]], [[VAL]], [[VAL]], 24 ; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xff00ff Index: test/CodeGen/AMDGPU/ctlz.ll =================================================================== --- test/CodeGen/AMDGPU/ctlz.ll +++ test/CodeGen/AMDGPU/ctlz.ll @@ -36,7 +36,8 @@ ; FUNC-LABEL: {{^}}v_ctlz_i32: ; SI: buffer_load_dword [[VAL:v[0-9]+]], ; SI-DAG: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]] -; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[CTLZ]] +; FIXME v_ffbh_u32 does not look correct for v_ctlz_i32. +; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[VAL]] ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], [[CTLZ]], 32, vcc ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm @@ -99,7 +100,7 @@ ; FUNC-LABEL: {{^}}v_ctlz_i8: ; SI: buffer_load_ubyte [[VAL:v[0-9]+]], ; SI-DAG: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] -; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[CTLZ]] +; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[VAL]] ; SI-DAG: v_cndmask_b32_e64 [[CORRECTED_FFBH:v[0-9]+]], [[FFBH]], 32, vcc ; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 0xffffffe8, [[CORRECTED_FFBH]] ; SI: buffer_store_byte [[RESULT]], @@ -137,14 +138,19 @@ ; FUNC-LABEL: {{^}}v_ctlz_i64: ; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; SI-DAG: v_cmp_eq_i32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]] ; SI-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]] ; SI-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]] ; SI-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]] -; SI-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], [[CMPHI]] +; SI-DAG: v_cmp_eq_i32_e{{32|64}} [[CMPHI:s\[[0-9]+:[0-9]+\]|vcc]], 0, v[[HI]] +; FIXME: Not checked: When CMPHI != vcc, src3 of the next instruction is not verified. +; FIXME: Reason: regex can not contain variables. +; FIXME: Alternatively, VI prefix can be used for tonga, but that would +; FIXME: require duplication of almost all SI checks except this one or +; FIXME: moving this test to separate .ll file. Both look like overkill. +; SI-DAG: v_cndmask_b32_e{{64|32}} v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]] ; SI-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[LO]], v[[HI]] ; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[OR]] -; SI-DAG: v_cndmask_b32_e64 v[[CLTZ_LO:[0-9]+]], v[[CTLZ:[0-9]+]], 64, vcc +; SI-DAG: v_cndmask_b32_e64 v[[CLTZ_LO:[0-9]+]], v[[CTLZ]], 64, vcc ; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}} ; SI: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}} define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { Index: test/CodeGen/AMDGPU/cvt_f32_ubyte.ll =================================================================== --- test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -32,10 +32,11 @@ ; SI-LABEL: {{^}}load_v3i8_to_v3f32: ; SI-NOT: bfe ; SI-NOT: v_cvt_f32_ubyte3_e32 -; SI-DAG: v_cvt_f32_ubyte2_e32 -; SI-DAG: v_cvt_f32_ubyte1_e32 -; SI-DAG: v_cvt_f32_ubyte0_e32 -; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, +; SI-DAG: v_cvt_f32_ubyte2_e32 v[[WORD2:[0-9]+]] +; SI-DAG: v_cvt_f32_ubyte1_e32 v[[WORD1:[0-9]+]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v[[WORD0:[0-9]+]] +; SI: buffer_store_dword v[[WORD2]], +; SI: buffer_store_dwordx2 v{{\[}}[[WORD0]]:[[WORD1]]{{\]}}, define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 %cvt = uitofp <3 x i8> %load to <3 x float> Index: test/CodeGen/AMDGPU/madak.ll =================================================================== --- test/CodeGen/AMDGPU/madak.ll +++ test/CodeGen/AMDGPU/madak.ll @@ -7,8 +7,10 @@ declare float @llvm.fabs.f32(float) nounwind readnone ; GCN-LABEL: {{^}}madak_f32: -; GCN: buffer_load_dword [[VA:v[0-9]+]] -; GCN: buffer_load_dword [[VB:v[0-9]+]] +; GCN-DAG: s_load_dwordx2 [[SA_LO:s\[[0-9]+:]]{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb +; GCN-DAG: s_load_dwordx2 [[SB_LO:s\[[0-9]+:]]{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xd +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, [[SA_LO]]{{[0-9]+\]}} +; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, [[SB_LO]]{{[0-9]+\]}} ; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -81,8 +83,10 @@ ; an inline immediate. ; GCN-LABEL: {{^}}madak_inline_imm_f32: -; GCN: buffer_load_dword [[VA:v[0-9]+]] -; GCN: buffer_load_dword [[VB:v[0-9]+]] +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, s[4:7] +; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, s[8:11] ; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone