Index: llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -312,6 +312,12 @@ return; } + if (MI->getOpcode() == AMDGPU::DELETED) { + if (isVerbose()) + OutStreamer->emitRawComment(" deleted instruction"); + return; + } + MCInst TmpInst; MCInstLowering.lower(MI, TmpInst); EmitToStreamer(*OutStreamer, TmpInst); Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -148,8 +148,8 @@ addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); if (Subtarget->has16BitInsts()) { - addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); - addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::i16, &AMDGPU::SGPR_LO16RegClass); + addRegisterClass(MVT::f16, &AMDGPU::SGPR_LO16RegClass); // Unless there are also VOP3P operations, not operations are really legal. 
addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass); Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -83,6 +83,12 @@ BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)")); +static cl::opt<bool> Fix16BitCopies( + "amdgpu-fix-16-bit-physreg-copies", + cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), + cl::init(true), + cl::ReallyHidden); + SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), RI(ST), ST(ST) { @@ -527,6 +533,22 @@ MCRegister SrcReg, bool KillSrc) const { const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg); + // FIXME: This is a hack to resolve copies between 16 bit and 32 bit + // registers until all patterns are fixed. + if (Fix16BitCopies && + ((RI.getRegSizeInBits(*RC) == 16) ^ + (RI.getRegSizeInBits(*RI.getPhysRegClass(SrcReg)) == 16))) { + MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg; + RegToFix = RI.get32BitRegister(RegToFix); + + if (DestReg == SrcReg) { + BuildMI(MBB, MI, DL, get(AMDGPU::DELETED)); + return; + } + + RC = RI.getPhysRegClass(DestReg); + } + if (RC == &AMDGPU::VGPR_32RegClass) { assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || AMDGPU::SReg_32RegClass.contains(SrcReg) || @@ -2494,7 +2506,11 @@ return false; unsigned Opc = UseMI.getOpcode(); - if (Opc == AMDGPU::COPY) { + // FIXME: it is still possible to fold immediate even with a subreg. + if (Opc == AMDGPU::COPY && + (!UseMI.getOperand(1).getSubReg() && + RI.getRegSizeInBits( + *RI.getRegClassForReg(*MRI, UseMI.getOperand(0).getReg())) == 32)) { bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg()); unsigned NewOpc = isVGPRCopy ? 
AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; if (RI.isAGPR(*MRI, UseMI.getOperand(0).getReg())) { Index: llvm/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstructions.td +++ llvm/lib/Target/AMDGPU/SIInstructions.td @@ -258,6 +258,13 @@ let Size = 0; } +def DELETED : SPseudoInstSI<(outs), (ins), []> { + let SchedRW = []; + let hasNoSchedulingInfo = 1; + let FixedSize = 1; + let Size = 0; +} + // SI pseudo instructions. These are used by the CFG structurizer pass // and should be lowered to ISA instructions prior to codegen. Index: llvm/test/CodeGen/AMDGPU/bitreverse.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -283,15 +283,17 @@ ; FLAT-LABEL: s_brev_i64: ; FLAT: ; %bb.0: ; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; FLAT-NEXT: v_mov_b32_e32 v0, 0x10203 -; FLAT-NEXT: s_mov_b32 s4, 0xf0f0f0f +; FLAT-NEXT: s_mov_b32 s4, 0x10203 +; FLAT-NEXT: s_mov_b32 s5, 0xf0f0f0f ; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_perm_b32 v2, 0, s2, v0 -; FLAT-NEXT: v_perm_b32 v4, 0, s3, v0 +; FLAT-NEXT: v_mov_b32_e32 v0, s2 +; FLAT-NEXT: v_mov_b32_e32 v1, s3 +; FLAT-NEXT: v_perm_b32 v2, 0, v0, s4 +; FLAT-NEXT: v_perm_b32 v4, 0, v1, s4 ; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f0 -; FLAT-NEXT: v_and_b32_e32 v1, s4, v2 -; FLAT-NEXT: v_and_b32_e32 v0, s4, v4 +; FLAT-NEXT: v_and_b32_e32 v1, s5, v2 +; FLAT-NEXT: v_and_b32_e32 v0, s5, v4 ; FLAT-NEXT: v_and_b32_e32 v3, s2, v2 ; FLAT-NEXT: v_and_b32_e32 v2, s2, v4 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1] @@ -519,33 +521,37 @@ ; FLAT: ; %bb.0: ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 -; FLAT-NEXT: v_mov_b32_e32 v4, 0x10203 -; FLAT-NEXT: s_mov_b32 s8, 0xf0f0f0f -; FLAT-NEXT: s_mov_b32 s9, 0xcccccccc -; FLAT-NEXT: s_mov_b32 s10, 0x55555555 
+; FLAT-NEXT: s_mov_b32 s8, 0x10203 +; FLAT-NEXT: s_mov_b32 s9, 0x33333333 +; FLAT-NEXT: s_mov_b32 s10, 0xcccccccc +; FLAT-NEXT: s_mov_b32 s11, 0x55555555 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_perm_b32 v3, 0, s2, v4 -; FLAT-NEXT: v_perm_b32 v2, 0, s3, v4 -; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f0 -; FLAT-NEXT: v_and_b32_e32 v0, s8, v2 -; FLAT-NEXT: v_and_b32_e32 v1, s8, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s2, v2 -; FLAT-NEXT: v_and_b32_e32 v3, s2, v3 +; FLAT-NEXT: v_mov_b32_e32 v0, s2 +; FLAT-NEXT: v_mov_b32_e32 v1, s3 +; FLAT-NEXT: v_perm_b32 v3, 0, v0, s8 +; FLAT-NEXT: v_perm_b32 v2, 0, v1, s8 +; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f +; FLAT-NEXT: s_mov_b32 s3, 0xf0f0f0f0 +; FLAT-NEXT: v_and_b32_e32 v0, s2, v2 +; FLAT-NEXT: v_and_b32_e32 v1, s2, v3 +; FLAT-NEXT: v_and_b32_e32 v2, s3, v2 +; FLAT-NEXT: v_and_b32_e32 v3, s3, v3 +; FLAT-NEXT: v_mov_b32_e32 v4, s0 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3] -; FLAT-NEXT: v_perm_b32 v7, 0, s0, v4 -; FLAT-NEXT: v_perm_b32 v6, 0, s1, v4 +; FLAT-NEXT: v_perm_b32 v7, 0, v4, s8 +; FLAT-NEXT: v_mov_b32_e32 v4, s1 ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: s_mov_b32 s3, 0x33333333 ; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 -; FLAT-NEXT: v_and_b32_e32 v0, s3, v2 -; FLAT-NEXT: v_and_b32_e32 v1, s3, v3 -; FLAT-NEXT: v_and_b32_e32 v4, s8, v6 -; FLAT-NEXT: v_and_b32_e32 v5, s8, v7 -; FLAT-NEXT: v_and_b32_e32 v2, s9, v2 -; FLAT-NEXT: v_and_b32_e32 v3, s9, v3 -; FLAT-NEXT: v_and_b32_e32 v6, s2, v6 -; FLAT-NEXT: v_and_b32_e32 v7, s2, v7 +; FLAT-NEXT: v_perm_b32 v6, 0, v4, s8 +; FLAT-NEXT: v_and_b32_e32 v0, s9, v2 +; FLAT-NEXT: v_and_b32_e32 v1, s9, v3 +; FLAT-NEXT: v_and_b32_e32 v4, s2, v6 +; FLAT-NEXT: v_and_b32_e32 v5, s2, v7 +; FLAT-NEXT: v_and_b32_e32 v2, s10, v2 +; FLAT-NEXT: v_and_b32_e32 v3, s10, v3 +; FLAT-NEXT: v_and_b32_e32 v6, s3, v6 +; FLAT-NEXT: v_and_b32_e32 v7, s3, v7 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3] ; 
FLAT-NEXT: v_lshlrev_b64 v[4:5], 4, v[4:5] @@ -554,15 +560,15 @@ ; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 ; FLAT-NEXT: v_or_b32_e32 v6, v6, v4 ; FLAT-NEXT: v_or_b32_e32 v7, v7, v5 -; FLAT-NEXT: s_mov_b32 s11, 0xaaaaaaaa -; FLAT-NEXT: v_and_b32_e32 v0, s10, v2 -; FLAT-NEXT: v_and_b32_e32 v1, s10, v3 -; FLAT-NEXT: v_and_b32_e32 v4, s3, v6 -; FLAT-NEXT: v_and_b32_e32 v5, s3, v7 -; FLAT-NEXT: v_and_b32_e32 v2, s11, v2 -; FLAT-NEXT: v_and_b32_e32 v3, s11, v3 -; FLAT-NEXT: v_and_b32_e32 v6, s9, v6 -; FLAT-NEXT: v_and_b32_e32 v7, s9, v7 +; FLAT-NEXT: s_mov_b32 s12, 0xaaaaaaaa +; FLAT-NEXT: v_and_b32_e32 v0, s11, v2 +; FLAT-NEXT: v_and_b32_e32 v1, s11, v3 +; FLAT-NEXT: v_and_b32_e32 v4, s9, v6 +; FLAT-NEXT: v_and_b32_e32 v5, s9, v7 +; FLAT-NEXT: v_and_b32_e32 v2, s12, v2 +; FLAT-NEXT: v_and_b32_e32 v3, s12, v3 +; FLAT-NEXT: v_and_b32_e32 v6, s10, v6 +; FLAT-NEXT: v_and_b32_e32 v7, s10, v7 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] ; FLAT-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] @@ -570,10 +576,10 @@ ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 ; FLAT-NEXT: v_or_b32_e32 v0, v6, v4 ; FLAT-NEXT: v_or_b32_e32 v7, v7, v5 -; FLAT-NEXT: v_and_b32_e32 v5, s10, v7 -; FLAT-NEXT: v_and_b32_e32 v4, s10, v0 -; FLAT-NEXT: v_and_b32_e32 v6, s11, v0 -; FLAT-NEXT: v_and_b32_e32 v7, s11, v7 +; FLAT-NEXT: v_and_b32_e32 v5, s11, v7 +; FLAT-NEXT: v_and_b32_e32 v4, s11, v0 +; FLAT-NEXT: v_and_b32_e32 v6, s12, v0 +; FLAT-NEXT: v_and_b32_e32 v7, s12, v7 ; FLAT-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] ; FLAT-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] ; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 Index: llvm/test/CodeGen/AMDGPU/bswap.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/bswap.ll +++ llvm/test/CodeGen/AMDGPU/bswap.ll @@ -227,7 +227,7 @@ ; VI-LABEL: test_bswap_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: v_mov_b32_e32 v0, 0x10203 +; VI-NEXT: s_mov_b32 s8, 0x10203 ; VI-NEXT: s_mov_b32 s3, 
0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -235,8 +235,10 @@ ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_perm_b32 v1, 0, s4, v0 -; VI-NEXT: v_perm_b32 v0, 0, s5, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_perm_b32 v1, 0, v0, s8 +; VI-NEXT: v_perm_b32 v0, 0, v2, s8 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %val = load i64, i64 addrspace(1)* %in, align 8 @@ -273,7 +275,7 @@ ; VI-LABEL: test_bswap_v2i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: v_mov_b32_e32 v0, 0x10203 +; VI-NEXT: s_mov_b32 s8, 0x10203 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -281,10 +283,14 @@ ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_perm_b32 v3, 0, s6, v0 -; VI-NEXT: v_perm_b32 v2, 0, s7, v0 -; VI-NEXT: v_perm_b32 v1, 0, s4, v0 -; VI-NEXT: v_perm_b32 v0, 0, s5, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_perm_b32 v3, 0, v0, s8 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_perm_b32 v2, 0, v1, s8 +; VI-NEXT: v_perm_b32 v1, 0, v4, s8 +; VI-NEXT: v_perm_b32 v0, 0, v0, s8 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 @@ -333,25 +339,33 @@ ; ; VI-LABEL: test_bswap_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_mov_b32_e32 v4, 0x10203 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s12, 0x10203 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x0 
+; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_perm_b32 v3, 0, s2, v4 -; VI-NEXT: v_perm_b32 v2, 0, s3, v4 -; VI-NEXT: v_perm_b32 v1, 0, s0, v4 -; VI-NEXT: v_perm_b32 v0, 0, s1, v4 -; VI-NEXT: v_perm_b32 v7, 0, s6, v4 -; VI-NEXT: v_perm_b32 v6, 0, s7, v4 -; VI-NEXT: v_perm_b32 v5, 0, s4, v4 -; VI-NEXT: v_perm_b32 v4, 0, s5, v4 -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_perm_b32 v2, 0, v1, s12 +; VI-NEXT: v_perm_b32 v1, 0, v4, s12 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_perm_b32 v7, 0, v4, s12 +; VI-NEXT: v_mov_b32_e32 v4, s11 +; VI-NEXT: v_perm_b32 v6, 0, v4, s12 +; VI-NEXT: v_mov_b32_e32 v4, s8 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_perm_b32 v5, 0, v4, s12 +; VI-NEXT: v_mov_b32_e32 v4, s9 +; VI-NEXT: v_perm_b32 v3, 0, v0, s12 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_perm_b32 v4, 0, v4, s12 +; VI-NEXT: v_perm_b32 v0, 0, v0, s12 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32 %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone Index: llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll +++ llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll @@ -124,11 +124,11 @@ ; GCN: s_cbranch_scc{{[0-1]}} ; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004 -; VI: v_mov_b32_e32 v{{[0-9]+}}, 0x7f +; VI: v_and_b32_e32 v{{[0-9]+}}, 0x7f ; GCN: BB2_3: ; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004 -; VI: v_mov_b32_e32 v{{[0-9]+}}, 0xff +; VI: v_and_b32_e32 v{{[0-9]+}}, 0xff ; GCN: buffer_store_short ; GCN: 
s_endpgm Index: llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll +++ llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll @@ -8,6 +8,8 @@ ; GFX9-NEXT: v_mul_i32_i24_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: v_mul_i32_i24_e32 v0, v0, v1 +; GFX9-NEXT: ; deleted instruction +; GFX9-NEXT: ; deleted instruction ; GFX9-NEXT: s_setpc_b64 s[30:31] %A = sext i8 %X to i48 %B = sext i8 %Y to i48 @@ -29,6 +31,8 @@ ; GFX9-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v2 ; GFX9-NEXT: v_mul_i32_i24_e32 v0, v0, v2 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] +; GFX9-NEXT: ; deleted instruction +; GFX9-NEXT: ; deleted instruction ; GFX9-NEXT: s_setpc_b64 s[30:31] %A = sext i8 %X to i48 %B = sext i8 %Y to i48 Index: llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -107,6 +107,8 @@ ; GCN-NEXT: s_addc_u32 s5, s5, func_v4f16@rel32@hi+4 ; GCN-NEXT: v_writelane_b32 v32, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: ; deleted instruction +; GCN-NEXT: ; deleted instruction ; GCN-NEXT: v_readlane_b32 s4, v32, 0 ; GCN-NEXT: v_readlane_b32 s5, v32, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 Index: llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -147,10 +147,13 @@ } ; GCN-LABEL: {{^}}multiple_use_fadd_fmac_f16: -; GCN-DAG: v_add_f16_e64 [[MUL2:v[0-9]+]], [[X:s[0-9]+]], s{{[0-9]+}} +; GCN: 
s_load_dword [[X:s[0-9]+]], +; GCN-DAG: v_add_f16_e64 [[MUL2:v[0-9]+]], [[X]], s{{[0-9]+}} -; VI-FLUSH-DAG: v_mac_f16_e64 [[MAD:v[0-9]+]], [[X]], 2.0 -; VI-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], [[X]], 2.0, v{{[0-9]+}} +; VI-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[X]] + +; VI-FLUSH-DAG: v_mad_f16 [[MAD:v[0-9]+]], [[VX]], 2.0, {{[vs][0-9]+}} +; VI-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], [[VX]], 2.0, {{[vs][0-9]+}} ; GFX10-FLUSH-DAG: v_add_f16_e32 [[MAD:v[0-9]+]], s{{[0-9]+}}, [[MUL2]] ; GFX10-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], [[X]], 2.0, s{{[0-9]+}} @@ -169,10 +172,13 @@ } ; GCN-LABEL: {{^}}multiple_use_fadd_fmad_f16: -; GCN-DAG: v_add_f16_e64 [[MUL2:v[0-9]+]], |[[X:s[0-9]+]]|, |s{{[0-9]+}}| +; GCN: s_load_dword [[X:s[0-9]+]], +; GCN-DAG: v_add_f16_e64 [[MUL2:v[0-9]+]], |[[X]]|, |s{{[0-9]+}}| + +; VI-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[X]] -; VI-FLUSH-DAG: v_mad_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}} -; VI-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}} +; VI-FLUSH-DAG: v_mad_f16 [[MAD:v[0-9]+]], |[[VX]]|, 2.0, {{[vs][0-9]+}} +; VI-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], |[[VX]]|, 2.0, {{[vs][0-9]+}} ; GFX10-FLUSH-DAG: v_add_f16_e32 [[MAD:v[0-9]+]], s{{[0-9]+}}, [[MUL2]] ; GFX10-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, s{{[0-9]+}} @@ -234,8 +240,11 @@ } ; GCN-LABEL: {{^}}fmul_x2_xn3_f16: -; SIVI: v_mov_b32_e32 [[K:v[0-9]+]], 0xc600 -; SIVI: v_mul_f16_e32 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[K]] +; GCN: s_load_dword [[X:s[0-9]+]], +; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0xc600 +; SI: v_mul_f16_e32 [[TMP0:v[0-9]+]], [[X]], [[K]] +; VI: v_mov_b32_e32 [[VX:v[0-9]+]], [[X]] +; VI: v_mul_f16_e32 [[TMP0:v[0-9]+]], 0xc600, [[VX]] ; GFX10: v_mul_f16_e64 [[TMP0:v[0-9]+]], 0xc600, [[X:s[0-9]+]] ; GCN: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]] ; GCN: buffer_store_short [[RESULT]] Index: llvm/test/CodeGen/AMDGPU/fshr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fshr.ll +++ 
llvm/test/CodeGen/AMDGPU/fshr.ll @@ -811,9 +811,8 @@ ; GFX9-NEXT: v_lshl_or_b32 v6, v7, 16, v6 ; GFX9-NEXT: v_pk_lshrrev_b16 v7, v6, v2 ; GFX9-NEXT: v_pk_sub_i16 v6, 16, v6 op_sel_hi:[0,1] -; GFX9-NEXT: s_mov_b32 s6, 0xf000f ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v6, v0 -; GFX9-NEXT: v_and_b32_e32 v4, s6, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0xf000f, v4 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v7 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 @@ -827,7 +826,7 @@ ; GFX9-NEXT: v_pk_lshrrev_b16 v4, v2, v3 ; GFX9-NEXT: v_pk_sub_i16 v2, 16, v2 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v2, v1 -; GFX9-NEXT: v_and_b32_e32 v2, s6, v5 +; GFX9-NEXT: v_and_b32_e32 v2, 0xf000f, v5 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX9-NEXT: v_and_b32_e32 v2, v8, v6 @@ -947,9 +946,8 @@ ; GFX9-NEXT: v_lshl_or_b32 v6, v8, 16, v6 ; GFX9-NEXT: v_pk_lshrrev_b16 v8, v6, v3 ; GFX9-NEXT: v_pk_sub_i16 v6, 16, v6 op_sel_hi:[0,1] -; GFX9-NEXT: s_mov_b32 s6, 0xf000f ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v6, v1 -; GFX9-NEXT: v_and_b32_e32 v5, s6, v5 +; GFX9-NEXT: v_and_b32_e32 v5, 0xf000f, v5 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 @@ -965,7 +963,7 @@ ; GFX9-NEXT: v_pk_lshrrev_b16 v5, v3, v2 ; GFX9-NEXT: v_pk_sub_i16 v3, 16, v3 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v3, v0 -; GFX9-NEXT: v_and_b32_e32 v3, s6, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xf000f, v4 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc Index: llvm/test/CodeGen/AMDGPU/function-returns.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/function-returns.ll +++ llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -434,9 +434,15 @@ ; FIXME: Should pack ; GCN-LABEL: {{^}}v4i8_func_void: ; GCN: buffer_load_dword v0 -; GCN-DAG: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-DAG: v_lshrrev_b32_e32 v2, 16, v0 -; 
GCN-DAG: v_lshrrev_b32_e32 v3, 24, v0 +; CI-DAG: v_lshrrev_b32_e32 v1, 8, v0 +; CI-DAG: v_lshrrev_b32_e32 v2, 16, v0 +; CI-DAG: v_lshrrev_b32_e32 v3, 24, v0 + +; FIXME: should shift right into resulting registers + +; GFX89-DAG: v_lshrrev_b32_e32 v2, 16, v0 +; GFX89-DAG: v_lshrrev_b32_e32 v3, 8, v0 +; GFX89-DAG: v_lshrrev_b32_e32 v0, 24, v0 ; GCN: s_setpc_b64 define <4 x i8> @v4i8_func_void() #0 { %ptr = load volatile <4 x i8> addrspace(1)*, <4 x i8> addrspace(1)* addrspace(4)* undef Index: llvm/test/CodeGen/AMDGPU/idiv-licm.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -265,31 +265,34 @@ ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c ; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s3, s2, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_movk_i32 s3, 0x400 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: BB4_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-NEXT: v_and_b32_e32 v2, s2, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v2 -; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_mul_f32_e32 v2, v8, v1 -; GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 -; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_mad_f32 v2, -v2, v0, v8 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, v0 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v2, s[0:1], 0, v7, s[0:1] +; GFX9-NEXT: v_cvt_f32_u32_sdwa v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; 
GFX9-NEXT: v_add_u16_e32 v8, 1, v4 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], 1, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], s4, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[0:1], v6, v5, s[0:1] +; GFX9-NEXT: v_mul_f32_e32 v6, v7, v1 +; GFX9-NEXT: v_trunc_f32_e32 v6, v6 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v8 +; GFX9-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v6 +; GFX9-NEXT: v_mad_f32 v6, -v6, v0, v7 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v6|, v0 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc -; GFX9-NEXT: global_store_short v[5:6], v2, off +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v8, s[0:1] +; GFX9-NEXT: global_store_short v[4:5], v6, off ; GFX9-NEXT: s_cbranch_vccz BB4_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -316,12 +319,13 @@ ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c ; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_movk_i32 s6, 0x400 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s3, s2, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_movk_i32 s6, 0x400 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: BB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -341,7 +345,9 @@ ; GFX9-NEXT: v_mul_lo_u32 v7, v7, s3 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s6, v4 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc +; GFX9-NEXT: ; deleted instruction ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v7 +; GFX9-NEXT: ; deleted instruction ; GFX9-NEXT: global_store_short v[5:6], v2, off ; GFX9-NEXT: s_cbranch_vccz BB5_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -368,36 +374,39 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: 
s_movk_i32 s3, 0x400 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s2, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_movk_i32 s3, 0x400 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: BB6_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-NEXT: v_bfe_i32 v5, v4, 0, 16 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v4 ; GFX9-NEXT: v_cvt_f32_i32_e32 v9, v5 +; GFX9-NEXT: v_add_u16_e32 v7, 1, v4 ; GFX9-NEXT: v_xor_b32_e32 v8, s2, v5 -; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v7, v9, v1 -; GFX9-NEXT: v_trunc_f32_e32 v7, v7 -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v8 -; GFX9-NEXT: v_cvt_i32_f32_e32 v8, v7 -; GFX9-NEXT: v_mad_f32 v7, -v7, v0, v9 -; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, |v0| -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v2, v8, v2 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], 1, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], s4, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[0:1], v6, v5, s[0:1] +; GFX9-NEXT: v_mul_f32_e32 v6, v9, v1 +; GFX9-NEXT: v_trunc_f32_e32 v6, v6 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v7 +; GFX9-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-NEXT: v_ashrrev_i32_e32 v7, 30, v8 +; GFX9-NEXT: v_cvt_i32_f32_e32 v8, v6 +; GFX9-NEXT: v_mad_f32 v6, -v6, v0, v9 +; GFX9-NEXT: v_or_b32_e32 v7, 1, v7 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v6|, |v0| +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, v7, s[0:1] ; GFX9-NEXT: s_and_b64 vcc, exec, vcc -; GFX9-NEXT: global_store_short v[5:6], v2, off +; GFX9-NEXT: v_add_u32_e32 v6, v8, v6 +; GFX9-NEXT: global_store_short v[4:5], v6, off ; GFX9-NEXT: s_cbranch_vccz BB6_1 ; 
GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -423,38 +432,41 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_movk_i32 s3, 0x400 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s2, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_movk_i32 s3, 0x400 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: BB7_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-NEXT: v_bfe_i32 v7, v4, 0, 16 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v4 ; GFX9-NEXT: v_cvt_f32_i32_e32 v10, v7 -; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v8, s5 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v8, v10, v1 +; GFX9-NEXT: v_add_u16_e32 v8, 1, v4 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], 1, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], s4, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[0:1], v6, v5, s[0:1] +; GFX9-NEXT: v_mul_f32_e32 v6, v10, v1 ; GFX9-NEXT: v_xor_b32_e32 v9, s2, v7 -; GFX9-NEXT: v_trunc_f32_e32 v8, v8 -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v9 -; GFX9-NEXT: v_cvt_i32_f32_e32 v9, v8 -; GFX9-NEXT: v_mad_f32 v8, -v8, v0, v10 -; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, |v0| -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v2, v9, v2 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 -; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4 +; GFX9-NEXT: v_trunc_f32_e32 v6, v6 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v8 +; GFX9-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-NEXT: v_ashrrev_i32_e32 v8, 30, v9 +; GFX9-NEXT: v_cvt_i32_f32_e32 v9, v6 +; GFX9-NEXT: v_mad_f32 v6, -v6, v0, v10 +; 
GFX9-NEXT: v_or_b32_e32 v8, 1, v8 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v6|, |v0| +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, v8, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v6, v9, v6 +; GFX9-NEXT: v_mul_lo_u32 v6, v6, s2 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, v7, v2 -; GFX9-NEXT: global_store_short v[5:6], v2, off +; GFX9-NEXT: v_sub_u32_e32 v6, v7, v6 +; GFX9-NEXT: global_store_short v[4:5], v6, off ; GFX9-NEXT: s_cbranch_vccz BB7_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/idot4u.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot4u.ll +++ llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -1866,35 +1866,39 @@ ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NODL-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s7, s3, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 24 -; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 24 -; GFX9-NODL-NEXT: v_and_b32_sdwa v3, v0, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: v_lshl_or_b32 v3, s6, 16, v3 -; GFX9-NODL-NEXT: v_lshl_or_b32 v4, s4, 16, v4 -; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v3, v4, v3 -; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v0, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s3 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s2 -; GFX9-NODL-NEXT: v_and_b32_sdwa v0, v0, s2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v2, 16, v4 +; GFX9-NODL-NEXT: v_and_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v3, 8, s3 +; GFX9-NODL-NEXT: v_and_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX9-NODL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v0, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_load_ushort v4, v[0:1], off +; GFX9-NODL-NEXT: global_load_ushort v3, v[0:1], off +; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s3, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NODL-NEXT: v_and_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_lshl_or_b32 v5, s5, 16, v5 +; GFX9-NODL-NEXT: v_lshl_or_b32 v4, s4, 16, v4 +; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_add_u32_e32 v4, v2, v4 -; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NODL-NEXT: v_add_u32_e32 v3, v2, v3 +; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NODL-NEXT: 
global_store_short v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -1902,35 +1906,39 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-DL-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 16 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 24 -; GFX9-DL-NEXT: v_and_b32_sdwa v4, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 24 -; GFX9-DL-NEXT: v_and_b32_sdwa v3, v0, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_lshl_or_b32 v3, s6, 16, v3 -; GFX9-DL-NEXT: v_lshl_or_b32 v4, s4, 16, v4 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v4, v3 -; GFX9-DL-NEXT: v_and_b32_sdwa v4, v0, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s2 -; GFX9-DL-NEXT: v_and_b32_sdwa v0, v0, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v4 +; GFX9-DL-NEXT: v_and_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s3 +; GFX9-DL-NEXT: v_and_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX9-DL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v0, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ushort v4, v[0:1], off +; GFX9-DL-NEXT: global_load_ushort v3, v[0:1], off +; 
GFX9-DL-NEXT: s_lshr_b32 s4, s2, 24 +; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 24 +; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-DL-NEXT: v_and_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_and_b32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_lshl_or_b32 v5, s5, 16, v5 +; GFX9-DL-NEXT: v_lshl_or_b32 v4, s4, 16, v4 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v4, v2, v4 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v3, v2, v3 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -2051,28 +2059,26 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 ; GFX8-NEXT: s_lshr_b32 s4, s1, 24 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: s_mul_i32 s0, s0, s1 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v4, v5 dst_sel:BYTE_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_and_b32_e32 v3, s0, v3 -; GFX8-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX8-NEXT: v_mul_u32_u24_e32 v5, s3, v5 -; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_u32_u24_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v7, s0 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_u32_u24_e32 v4, s3, v4 +; GFX8-NEXT: v_mul_u32_u24_sdwa v5, v6, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX8-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; GFX8-NEXT: s_waitcnt vmcnt(0) Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll @@ -26,8 +26,8 @@ ; GCN-LABEL: {{^}}class_f16_fabs: ; GCN: s_load_dword s[[SA_F16:[0-9]+]] ; GCN: s_load_dword s[[SB_I32:[0-9]+]] -; GCN: v_mov_b32_e32 [[V_B_I32:v[0-9]+]], s[[SB_I32]] -; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |s[[SA_F16]]|, [[V_B_I32]] +; GCN: v_mov_b32_e32 [[V_A_F16:v[0-9]+]], s[[SA_F16]] +; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[V_A_F16]]|, s[[SB_I32]] ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] ; GCN: buffer_store_dword 
v[[VR_I32]] ; GCN: s_endpgm @@ -48,8 +48,8 @@ ; GCN-LABEL: {{^}}class_f16_fneg: ; GCN: s_load_dword s[[SA_F16:[0-9]+]] ; GCN: s_load_dword s[[SB_I32:[0-9]+]] -; GCN: v_mov_b32_e32 [[V_B_I32:v[0-9]+]], s[[SB_I32]] -; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -s[[SA_F16]], [[V_B_I32]] +; GCN: v_mov_b32_e32 [[V_A_F16:v[0-9]+]], s[[SA_F16]] +; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[V_A_F16]], s[[SB_I32]] ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm @@ -70,8 +70,8 @@ ; GCN-LABEL: {{^}}class_f16_fabs_fneg: ; GCN: s_load_dword s[[SA_F16:[0-9]+]] ; GCN: s_load_dword s[[SB_I32:[0-9]+]] -; GCN: v_mov_b32_e32 [[V_B_I32:v[0-9]+]], s[[SB_I32]] -; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|s[[SA_F16]]|, [[V_B_I32]] +; GCN: v_mov_b32_e32 [[V_A_F16:v[0-9]+]], s[[SA_F16]] +; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[V_A_F16]]|, s[[SB_I32]] ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm @@ -124,9 +124,10 @@ ; GCN-LABEL: {{^}}class_f16_full_mask: ; GCN: s_load_dword s[[SA_F16:[0-9]+]] -; VI: v_mov_b32_e32 v[[MASK:[0-9]+]], 0x3ff{{$}} -; VI: v_cmp_class_f16_e32 vcc, s[[SA_F16]], v[[MASK]] -; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc +; VI: s_movk_i32 s[[MASK:[0-9]+]], 0x3ff{{$}} +; VI: v_mov_b32_e32 v[[V_A_F16:[0-9]+]], s[[SA_F16]] +; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], v[[V_A_F16]], s[[MASK]] +; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm define amdgpu_kernel void @class_f16_full_mask( @@ -141,9 +142,10 @@ ; GCN-LABEL: {{^}}class_f16_nine_bit_mask: ; GCN: s_load_dword s[[SA_F16:[0-9]+]] -; VI: v_mov_b32_e32 v[[MASK:[0-9]+]], 0x1ff{{$}} -; VI: v_cmp_class_f16_e32 vcc, s[[SA_F16]], v[[MASK]] -; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc +; VI: s_movk_i32 s[[MASK:[0-9]+]], 0x1ff{{$}} +; VI: 
v_mov_b32_e32 v[[V_A_F16:[0-9]+]], s[[SA_F16]] +; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], v[[V_A_F16]], s[[MASK]] +; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm define amdgpu_kernel void @class_f16_nine_bit_mask( Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll @@ -288,7 +288,7 @@ } ; GCN-LABEL: {{^}}v_fcmp_f16_ogt: -; VI: v_cmp_gt_f16_e64 +; VI: v_cmp_lt_f16_e64 ; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}} @@ -300,7 +300,7 @@ } ; GCN-LABEL: {{^}}v_fcmp_f16_oge: -; VI: v_cmp_ge_f16_e64 +; VI: v_cmp_le_f16_e64 ; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}} @@ -312,7 +312,7 @@ } ; GCN-LABEL: {{^}}v_fcmp_f16_olt: -; VI: v_cmp_lt_f16_e64 +; VI: v_cmp_gt_f16_e64 ; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}} @@ -324,7 +324,7 @@ } ; GCN-LABEL: {{^}}v_fcmp_f16_ole: -; VI: v_cmp_le_f16_e64 +; VI: v_cmp_ge_f16_e64 ; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}} @@ -360,7 +360,7 @@ } ; GCN-LABEL: {{^}}v_fcmp_f16_ugt: -; VI: v_cmp_nle_f16_e64 +; VI: v_cmp_nge_f16_e64 ; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}} @@ -372,7 +372,7 @@ } ; GCN-LABEL: {{^}}v_fcmp_f16_uge: -; VI: v_cmp_nlt_f16_e64 +; VI: v_cmp_ngt_f16_e64 ; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}} @@ -384,7 +384,7 @@ } ; GCN-LABEL: {{^}}v_fcmp_f16_ult: -; VI: v_cmp_nge_f16_e64 +; VI: v_cmp_nle_f16_e64 ; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}} @@ -396,7 +396,7 @@ } ; GCN-LABEL: 
{{^}}v_fcmp_f16_ule: -; VI: v_cmp_ngt_f16_e64 +; VI: v_cmp_nlt_f16_e64 ; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}} Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll @@ -203,7 +203,7 @@ } ; GCN-LABEL: {{^}}v_icmp_i16_ugt: -; VI: v_cmp_gt_u16_e64 +; VI: v_cmp_lt_u16_e64 ; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x64 ; SI-DAG: s_and_b32 [[CVT:s[0-9]+]], s{{[0-9]+}}, 0xffff{{$}} @@ -215,7 +215,7 @@ } ; GCN-LABEL: {{^}}v_icmp_i16_uge: -; VI: v_cmp_ge_u16_e64 +; VI: v_cmp_le_u16_e64 ; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x64 ; SI-DAG: s_and_b32 [[CVT:s[0-9]+]], s{{[0-9]+}}, 0xffff{{$}} @@ -227,7 +227,7 @@ } ; GCN-LABEL: {{^}}v_icmp_i16_ult: -; VI: v_cmp_lt_u16_e64 +; VI: v_cmp_gt_u16_e64 ; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x64 ; SI-DAG: s_and_b32 [[CVT:s[0-9]+]], s{{[0-9]+}}, 0xffff{{$}} @@ -239,7 +239,7 @@ } ; GCN-LABEL: {{^}}v_icmp_i16_ule: -; VI: v_cmp_le_u16_e64 +; VI: v_cmp_ge_u16_e64 ; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x64 ; SI-DAG: s_and_b32 [[CVT:s[0-9]+]], s{{[0-9]+}}, 0xffff{{$}} @@ -251,7 +251,7 @@ } ; GCN-LABEL: {{^}}v_icmp_i16_sgt: -; VI: v_cmp_gt_i16_e64 +; VI: v_cmp_lt_i16_e64 ; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x64 ; SI-DAG: s_sext_i32_i16 [[CVT:s[0-9]+]], s{{[0-9]+}} @@ -263,7 +263,7 @@ } ; GCN-LABEL: {{^}}v_icmp_i16_sge: -; VI: v_cmp_ge_i16_e64 +; VI: v_cmp_le_i16_e64 ; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x64 ; SI-DAG: s_sext_i32_i16 [[CVT:s[0-9]+]], s{{[0-9]+}} @@ -275,7 +275,7 @@ } ; GCN-LABEL: {{^}}v_icmp_i16_slt: -; VI: v_cmp_lt_i16_e64 +; VI: v_cmp_gt_i16_e64 ; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x64 ; SI-DAG: s_sext_i32_i16 [[CVT:s[0-9]+]], s{{[0-9]+}} @@ -286,7 +286,7 @@ ret void } ; GCN-LABEL: {{^}}v_icmp_i16_sle: -; VI: v_cmp_le_i16_e64 +; VI: v_cmp_ge_i16_e64 ; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x64 ; 
SI-DAG: s_sext_i32_i16 [[CVT:s[0-9]+]], s{{[0-9]+}} Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll @@ -11,6 +11,8 @@ ; TONGA-NEXT: s_wqm_b64 exec, exec ; TONGA-NEXT: s_and_b64 exec, exec, s[12:13] ; TONGA-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 d16 +; TONGA-NEXT: ; deleted instruction +; TONGA-NEXT: ; deleted instruction ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: ; return to shader part epilog ; @@ -21,6 +23,8 @@ ; GFX81-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX81-NEXT: s_nop 0 ; GFX81-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 d16 +; GFX81-NEXT: ; deleted instruction +; GFX81-NEXT: ; deleted instruction ; GFX81-NEXT: s_waitcnt vmcnt(0) ; GFX81-NEXT: ; return to shader part epilog ; @@ -30,6 +34,8 @@ ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 d16 +; GFX9-NEXT: ; deleted instruction +; GFX9-NEXT: ; deleted instruction ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -40,6 +46,8 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D d16 +; GFX10-NEXT: ; deleted instruction +; GFX10-NEXT: ; deleted instruction ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -52,15 +60,15 @@ ; TONGA: ; %bb.0: ; %main_body ; TONGA-NEXT: s_mov_b64 s[14:15], exec ; TONGA-NEXT: s_wqm_b64 exec, exec -; TONGA-NEXT: v_mov_b32_e32 v2, 0 -; TONGA-NEXT: v_mov_b32_e32 v4, s12 -; TONGA-NEXT: v_mov_b32_e32 v5, s13 -; TONGA-NEXT: v_mov_b32_e32 v3, v2 +; TONGA-NEXT: v_mov_b32_e32 v4, 0 +; TONGA-NEXT: v_mov_b32_e32 v2, s12 +; TONGA-NEXT: v_mov_b32_e32 v3, s13 +; 
TONGA-NEXT: v_mov_b32_e32 v5, v4 ; TONGA-NEXT: s_and_b64 exec, exec, s[14:15] -; TONGA-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16 +; TONGA-NEXT: image_sample v[4:5], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_mov_b32_e32 v0, v2 -; TONGA-NEXT: flat_store_dword v[4:5], v3 +; TONGA-NEXT: v_mov_b32_e32 v0, v4 +; TONGA-NEXT: flat_store_dword v[2:3], v5 ; TONGA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; TONGA-NEXT: ; return to shader part epilog ; @@ -68,18 +76,18 @@ ; GFX81: ; %bb.0: ; %main_body ; GFX81-NEXT: s_mov_b64 s[14:15], exec ; GFX81-NEXT: s_wqm_b64 exec, exec -; GFX81-NEXT: v_mov_b32_e32 v2, 0 -; GFX81-NEXT: v_mov_b32_e32 v4, s12 -; GFX81-NEXT: v_mov_b32_e32 v5, s13 -; GFX81-NEXT: v_mov_b32_e32 v3, v2 +; GFX81-NEXT: v_mov_b32_e32 v4, 0 +; GFX81-NEXT: v_mov_b32_e32 v2, s12 +; GFX81-NEXT: v_mov_b32_e32 v3, s13 +; GFX81-NEXT: v_mov_b32_e32 v5, v4 ; GFX81-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX81-NEXT: s_nop 0 -; GFX81-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16 +; GFX81-NEXT: image_sample v[4:5], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16 ; GFX81-NEXT: s_waitcnt vmcnt(0) -; GFX81-NEXT: v_mov_b32_e32 v0, v2 +; GFX81-NEXT: v_mov_b32_e32 v0, v4 ; GFX81-NEXT: s_nop 0 ; GFX81-NEXT: s_nop 0 -; GFX81-NEXT: flat_store_dword v[4:5], v3 +; GFX81-NEXT: flat_store_dword v[2:3], v5 ; GFX81-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX81-NEXT: ; return to shader part epilog ; @@ -87,15 +95,15 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-NEXT: v_mov_b32_e32 v5, s13 -; GFX9-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s12 +; GFX9-NEXT: v_mov_b32_e32 v3, s13 +; GFX9-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] -; GFX9-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 
tfe d16 +; GFX9-NEXT: image_sample v[4:5], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: global_store_dword v[4:5], v3, off +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: global_store_dword v[2:3], v5, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; Index: llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll +++ llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; Test combine to reduce the width of a 64-bit shift to 32-bit if ; truncated to 16-bit. Index: llvm/test/CodeGen/AMDGPU/setcc-opt.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/setcc-opt.ll +++ llvm/test/CodeGen/AMDGPU/setcc-opt.ll @@ -149,11 +149,12 @@ ; SI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb ; VI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: s_movk_i32 [[K255:s[0-9]+]], 0xff -; GCN-DAG: v_mov_b32_e32 [[VK255:v[0-9]+]], [[K255]] +; SI-DAG: v_mov_b32_e32 [[VK255:v[0-9]+]], [[K255]] ; SI-DAG: s_and_b32 [[B:s[0-9]+]], [[VALUE]], [[K255]] ; SI: v_cmp_ne_u32_e32 vcc, [[B]], [[VK255]] -; VI-DAG: v_and_b32_e32 [[B:v[0-9]+]], [[VALUE]], [[VK255]] +; VI-DAG: v_mov_b32_e32 [[VVALUE:v[0-9]+]], [[VALUE]] +; VI-DAG: v_and_b32_e32 [[B:v[0-9]+]], [[K255]], [[VVALUE]] ; VI: v_cmp_ne_u16_e32 vcc, [[K255]], [[B]] ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc