Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -101,7 +101,7 @@
   std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
   bool isNoNanSrc(SDValue N) const;
   bool isInlineImmediate(const SDNode *N) const;
-
+  bool isVGPRImm(const SDNode *N) const;
   bool isUniformBr(const SDNode *N) const;
 
   MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
@@ -2068,6 +2068,56 @@
   return isExtractHiElt(In, Src);
 }
 
+bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
+  if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+    return false;
+  }
+  const SIRegisterInfo *SIRI =
+      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
+  const SIInstrInfo *SII =
+      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+
+  unsigned Limit = 0;
+  bool AllUsesAcceptSReg = true;
+  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
+       Limit < 10 && U != E; ++U, ++Limit) {
+    const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
+
+    // If the register class is unknown, it could be an unknown
+    // register class that needs to be an SGPR, e.g. an inline asm
+    // constraint.
+    if (!RC || SIRI->isSGPRClass(RC))
+      return false;
+
+    if (RC != &AMDGPU::VS_32RegClass) {
+      AllUsesAcceptSReg = false;
+      SDNode *User = *U;
+      if (User->isMachineOpcode()) {
+        unsigned Opc = User->getMachineOpcode();
+        MCInstrDesc Desc = SII->get(Opc);
+        if (Desc.isCommutable()) {
+          unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
+          unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
+          if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
+            unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
+            const TargetRegisterClass *CommutedRC =
+                getOperandRegClass(*U, CommutedOpNo);
+            if (CommutedRC == &AMDGPU::VS_32RegClass)
+              AllUsesAcceptSReg = true;
+          }
+        }
+      }
+      // If AllUsesAcceptSReg is still false, we have not succeeded in
+      // commuting the current user, which means at least one use strictly
+      // requires a VGPR. Do not attempt to commute the remaining user
+      // instructions.
+      if (!AllUsesAcceptSReg)
+        break;
+    }
+  }
+  return !AllUsesAcceptSReg && (Limit < 10);
+}
+
 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
   const AMDGPUTargetLowering& Lowering =
       *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
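Note on isVGPRImm() above: the predicate decides whether a constant is better materialized with v_mov_b32 (into a VGPR) than with s_mov_b32 (into an SGPR). The SGPR form is preferred unless at least one user needs the value in a VGPR-only operand slot that cannot be fixed up by swapping the user's source operands. The self-contained sketch below restates that decision outside of LLVM; it is a minimal model that assumes every user has exactly two source operands, and OpClass, Use, and wantsVGPRImm are names invented for illustration, not LLVM API.

#include <cstddef>
#include <vector>

// Hypothetical, simplified operand classes; the real code queries
// getOperandRegClass() and compares against AMDGPU::VS_32RegClass.
enum class OpClass {
  SGPROnly, // operand must be an SGPR (e.g. an inline-asm constraint)
  VGPROnly, // operand only accepts a VGPR
  VS32      // operand accepts either an SGPR or a VGPR
};

struct Use {
  OpClass Src[2];  // register classes of the user's two source operands
  unsigned OpNo;   // which source operand the immediate feeds (0 or 1)
  bool Commutable; // MCInstrDesc::isCommutable() for the user
};

// Returns true when the immediate should live in a VGPR: some user still
// requires a VGPR even after trying to commute, and no user demands an SGPR.
bool wantsVGPRImm(const std::vector<Use> &Uses) {
  bool AllUsesAcceptSReg = true;
  unsigned Limit = 0;
  for (std::size_t I = 0; Limit < 10 && I < Uses.size(); ++I, ++Limit) {
    const Use &U = Uses[I];
    if (U.Src[U.OpNo] == OpClass::SGPROnly)
      return false; // this use must see an SGPR; never pick a VGPR
    if (U.Src[U.OpNo] == OpClass::VGPROnly) {
      AllUsesAcceptSReg = false;
      // A commutable user whose other source accepts VS_32 can still take
      // the SGPR once its operands are swapped.
      if (U.Commutable && U.Src[1 - U.OpNo] == OpClass::VS32)
        AllUsesAcceptSReg = true;
      if (!AllUsesAcceptSReg)
        break; // one use strictly requires a VGPR; stop scanning
    }
  }
  return !AllUsesAcceptSReg && Limit < 10;
}

For example, a VOP2 instruction accepts an SGPR or literal only in src0; when the constant feeds src1 of a commutable v_mul_f32, findCommutedOpIndices() reports that src0 and src1 can be swapped, the commuted slot accepts VS_32, and the constant can stay in an SGPR - which is what the updated fexp.ll checks below expect.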
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -227,6 +227,9 @@
   bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
                              unsigned &SrcOpIdx2) const override;
 
+  bool findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0,
+                             unsigned &SrcOpIdx1) const;
+
   bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override;
 
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1421,10 +1421,15 @@
 // TargetInstrInfo::commuteInstruction uses it.
 bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
                                         unsigned &SrcOpIdx1) const {
-  if (!MI.isCommutable())
+  return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
+}
+
+bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0,
+                                        unsigned &SrcOpIdx1) const {
+  if (!Desc.isCommutable())
     return false;
 
-  unsigned Opc = MI.getOpcode();
+  unsigned Opc = Desc.getOpcode();
   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
   if (Src0Idx == -1)
     return false;
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -495,24 +495,7 @@
 }]>;
 
 class VGPRImm <dag frag> : PatLeaf<frag, [{
-  if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
-    return false;
-  }
-  const SIRegisterInfo *SIRI =
-      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
-  unsigned Limit = 0;
-  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
-       Limit < 10 && U != E; ++U, ++Limit) {
-    const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
-
-    // If the register class is unknown, it could be an unknown
-    // register class that needs to be an SGPR, e.g. an inline asm
-    // constraint
-    if (!RC || SIRI->isSGPRClass(RC))
-      return false;
-  }
-
-  return Limit < 10;
+  return isVGPRImm(N);
 }]>;
 
 def NegateImm : SDNodeXForm<imm, [{
Index: test/CodeGen/AMDGPU/fexp.ll
===================================================================
--- test/CodeGen/AMDGPU/fexp.ll
+++ test/CodeGen/AMDGPU/fexp.ll
@@ ... @@
 define <2 x float> @v_exp_v2f32(<2 x float> %arg0) {
-; SI-LABEL: v_exp_v2f32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v2, 0x3fb8aa3b
-; SI-NEXT:    v_mul_f32_e32 v0, v0, v2
-; SI-NEXT:    v_mul_f32_e32 v1, v1, v2
-; SI-NEXT:    v_exp_f32_e32 v0, v0
-; SI-NEXT:    v_exp_f32_e32 v1, v1
-; SI-NEXT:    s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_exp_v2f32:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, 0x3fb8aa3b
-; VI-NEXT:    v_mul_f32_e32 v0, v0, v2
-; VI-NEXT:    v_mul_f32_e32 v1, v1, v2
-; VI-NEXT:    v_exp_f32_e32 v0, v0
-; VI-NEXT:    v_exp_f32_e32 v1, v1
-; VI-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_exp_v2f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3fb8aa3b
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v2
-; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_exp_f32_e32 v1, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: v_exp_v2f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 [[SREG:s[0-9]+]], 0x3fb8aa3b
+; GCN-NEXT:    v_mul_f32_e32 v{{[0-9]+}}, [[SREG]], v{{[0-9]+}}
+; GCN-NEXT:    v_mul_f32_e32 v{{[0-9]+}}, [[SREG]], v{{[0-9]+}}
+; GCN-NEXT:    v_exp_f32_e32 v0, v0
+; GCN-NEXT:    v_exp_f32_e32 v1, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x float> @llvm.exp.v2f32(<2 x float> %arg0)
   ret <2 x float> %result
 }
 
 define <3 x float> @v_exp_v3f32(<3 x float> %arg0) {
-; SI-LABEL: v_exp_v3f32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v3, 0x3fb8aa3b
-; SI-NEXT:    v_mul_f32_e32 v0, v0, v3
-; SI-NEXT:    v_mul_f32_e32 v1, v1, v3
-; SI-NEXT:    v_mul_f32_e32 v2, v2, v3
-; SI-NEXT:    v_exp_f32_e32 v0, v0
-; SI-NEXT:    v_exp_f32_e32 v1, v1
-; SI-NEXT:    v_exp_f32_e32 v2, v2
-; SI-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: v_exp_v3f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 [[SREG:s[0-9]+]], 0x3fb8aa3b
+; GCN-NEXT:    v_mul_f32_e32 v{{[0-9]+}}, [[SREG]], v{{[0-9]+}}
+; GCN-NEXT:    v_mul_f32_e32 v{{[0-9]+}}, [[SREG]], v{{[0-9]+}}
+; 
GCN-NEXT: v_mul_f32_e32 v{{[0-9]+}}, [[SREG]], v{{[0-9]+}} +; GCN-NEXT: v_exp_f32_e32 v0, v0 +; GCN-NEXT: v_exp_f32_e32 v1, v1 +; GCN-NEXT: v_exp_f32_e32 v2, v2 +; GCN-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_exp_v3f32: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b -; VI-NEXT: v_mul_f32_e32 v0, v0, v3 -; VI-NEXT: v_mul_f32_e32 v1, v1, v3 -; VI-NEXT: v_mul_f32_e32 v2, v2, v3 -; VI-NEXT: v_exp_f32_e32 v0, v0 -; VI-NEXT: v_exp_f32_e32 v1, v1 -; VI-NEXT: v_exp_f32_e32 v2, v2 -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_exp_v3f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v3 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_f32_e32 v2, v2, v3 -; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_exp_f32_e32 v1, v1 -; GFX9-NEXT: v_exp_f32_e32 v2, v2 -; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call <3 x float> @llvm.exp.v3f32(<3 x float> %arg0) ret <3 x float> %result } @@ -106,44 +63,16 @@ ; SI-LABEL: v_exp_v4f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, 0x3fb8aa3b -; SI-NEXT: v_mul_f32_e32 v0, v0, v4 -; SI-NEXT: v_mul_f32_e32 v1, v1, v4 -; SI-NEXT: v_mul_f32_e32 v2, v2, v4 -; SI-NEXT: v_mul_f32_e32 v3, v3, v4 +; SI-NEXT: s_mov_b32 [[SREG:s[0-9]+]], 0x3fb8aa3b +; SI-NEXT: v_mul_f32_e32 v0, [[SREG]], v0 +; SI-NEXT: v_mul_f32_e32 v1, [[SREG]], v1 +; SI-NEXT: v_mul_f32_e32 v2, [[SREG]], v2 +; SI-NEXT: v_mul_f32_e32 v3, [[SREG]], v3 ; SI-NEXT: v_exp_f32_e32 v0, v0 ; SI-NEXT: v_exp_f32_e32 v1, v1 ; SI-NEXT: v_exp_f32_e32 v2, v2 ; SI-NEXT: v_exp_f32_e32 v3, v3 ; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_exp_v4f32: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, 0x3fb8aa3b -; VI-NEXT: v_mul_f32_e32 v0, v0, v4 -; VI-NEXT: v_mul_f32_e32 v1, v1, v4 -; VI-NEXT: v_mul_f32_e32 v2, v2, v4 -; VI-NEXT: v_mul_f32_e32 v3, v3, v4 -; VI-NEXT: v_exp_f32_e32 v0, v0 -; VI-NEXT: v_exp_f32_e32 v1, v1 -; VI-NEXT: v_exp_f32_e32 v2, v2 -; VI-NEXT: v_exp_f32_e32 v3, v3 -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_exp_v4f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, 0x3fb8aa3b -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX9-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_exp_f32_e32 v1, v1 -; GFX9-NEXT: v_exp_f32_e32 v2, v2 -; GFX9-NEXT: v_exp_f32_e32 v3, v3 -; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.exp.v4f32(<4 x float> %arg0) ret <4 x float> %result } @@ -181,11 +110,11 @@ ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b +; SI-NEXT: s_mov_b32 [[SREG:s[0-9]+]], 0x3fb8aa3b ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, v0, v2 -; SI-NEXT: v_mul_f32_e32 v1, v1, v2 +; SI-NEXT: v_mul_f32_e32 v{{[0-9]+}}, [[SREG]], v{{[0-9]+}} +; SI-NEXT: v_mul_f32_e32 v{{[0-9]+}}, [[SREG]], v{{[0-9]+}} ; SI-NEXT: v_exp_f32_e32 v0, v0 ; SI-NEXT: v_exp_f32_e32 v1, v1 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -193,19 +122,20 @@ ; VI-LABEL: v_exp_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, 0x3dc5 -; VI-NEXT: v_mul_f16_sdwa v2, v0, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_mul_f16_e32 v0, v0, v1 -; VI-NEXT: v_exp_f16_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_exp_f16_e32 v0, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: s_movk_i32 [[SREG:s[0-9]+]], 0x3dc5 +; VI-NEXT: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] +; VI-NEXT: v_mul_f16_sdwa [[MUL1:v[0-9]+]], v{{[0-9]+}}, [[VREG]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 [[MUL2:v[0-9]+]], [[SREG]], v{{[0-9]+}} +; VI-NEXT: v_exp_f16_sdwa [[MUL1]], [[MUL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_exp_f16_e32 [[MUL2]], [[MUL2]] +; VI-NEXT: v_or_b32_e32 v{{[0-9]+}}, [[MUL2]], [[MUL1]] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_exp_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, 0x3dc5 -; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 op_sel_hi:[1,0] +; GFX9-NEXT: s_movk_i32 [[SREG:s[0-9]+]], 0x3dc5 +; GFX9-NEXT: v_pk_mul_f16 v0, v0, [[SREG]] op_sel_hi:[1,0] ; GFX9-NEXT: v_exp_f16_e32 v1, v0 ; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -228,15 +158,15 @@ ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_mov_b32_e32 v4, 0x3fb8aa3b +; SI-NEXT: s_mov_b32 [[SREG:s[0-9]+]], 0x3fb8aa3b ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, v0, v4 -; SI-NEXT: v_mul_f32_e32 v1, v1, v4 -; SI-NEXT: v_mul_f32_e32 v2, v2, v4 -; SI-NEXT: v_mul_f32_e32 v3, v3, v4 +; SI-NEXT: v_mul_f32_e32 v0, [[SREG]], v0 +; SI-NEXT: v_mul_f32_e32 v1, [[SREG]], v1 +; SI-NEXT: v_mul_f32_e32 v2, [[SREG]], v2 +; SI-NEXT: v_mul_f32_e32 v3, [[SREG]], v3 ; SI-NEXT: v_exp_f32_e32 v0, v0 ; SI-NEXT: v_exp_f32_e32 v1, v1 ; SI-NEXT: v_exp_f32_e32 v2, v2 @@ -246,36 +176,37 @@ ; VI-LABEL: v_exp_v4f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, 0x3dc5 -; VI-NEXT: v_mul_f16_e32 v3, v1, v2 -; VI-NEXT: v_mul_f16_e32 v4, v0, v2 -; VI-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_exp_f16_e32 v3, v3 -; VI-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_exp_f16_e32 v4, v4 -; VI-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_or_b32_e32 v1, v3, v1 -; VI-NEXT: v_or_b32_e32 v0, v4, v0 +; VI-NEXT: s_movk_i32 [[SREG:s[0-9]+]], 0x3dc5 +; VI-NEXT: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] +; VI-NEXT: v_mul_f16_e32 [[MUL1:v[0-9]+]], [[SREG]], v1 +; VI-NEXT: v_mul_f16_e32 [[MUL2:v[0-9]+]], [[SREG]], v0 +; VI-NEXT: v_mul_f16_sdwa [[MUL3:v[0-9]+]], v1, [[VREG]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_sdwa [[MUL4:v[0-9]+]], v0, [[VREG]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_exp_f16_e32 [[EXP1:v[0-9]+]], [[MUL1]] +; VI-NEXT: v_exp_f16_sdwa [[EXP2:v[0-9]+]], v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_exp_f16_e32 [[EXP3:v[0-9]+]], [[MUL2]] +; VI-NEXT: v_exp_f16_sdwa [[EXP4:v[0-9]+]], v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
+; VI-NEXT: v_or_b32_e32 v1, [[EXP1]], [[EXP2]] +; VI-NEXT: v_or_b32_e32 v0, [[EXP3]], [[EXP4]] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_exp_v4f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, 0x3dc5 -; GFX9-NEXT: v_mul_f16_e32 v3, v1, v2 -; GFX9-NEXT: v_mul_f16_e32 v4, v0, v2 -; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_exp_f16_e32 v3, v3 -; GFX9-NEXT: v_exp_f16_e32 v4, v4 -; GFX9-NEXT: v_exp_f16_e32 v0, v0 -; GFX9-NEXT: v_exp_f16_e32 v1, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_and_b32_e32 v4, v2, v4 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v4 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: s_movk_i32 [[SREG:s[0-9]+]], 0x3dc5 +; GFX9-NEXT: v_mul_f16_e32 [[MUL1:v[0-9]+]], [[SREG]], v1 +; GFX9-NEXT: v_mul_f16_e32 [[MUL2:v[0-9]+]], [[SREG]], v0 +; GFX9-NEXT: v_mul_f16_sdwa [[MUL3:v[0-9]+]], v1, [[SREG]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_mul_f16_sdwa [[MUL4:v[0-9]+]], v0, [[SREG]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_exp_f16_e32 [[EXP1:v[0-9]+]], [[MUL1]] +; GFX9-NEXT: v_exp_f16_e32 [[EXP2:v[0-9]+]], [[MUL2]] +; GFX9-NEXT: v_exp_f16_e32 [[EXP3:v[0-9]+]], [[MUL4]] +; GFX9-NEXT: v_exp_f16_e32 [[EXP4:v[0-9]+]], [[MUL3]] +; GFX9-NEXT: v_mov_b32_e32 [[VCONST:v[0-9]+]], 0xffff +; GFX9-NEXT: v_and_b32_e32 [[AND1:v[0-9]+]], [[VCONST]], [[EXP2]] +; GFX9-NEXT: v_and_b32_e32 [[AND2:v[0-9]+]], [[VCONST]], [[EXP1]] +; GFX9-NEXT: v_lshl_or_b32 v0, [[EXP3]], 16, [[AND1]] +; GFX9-NEXT: v_lshl_or_b32 v1, [[EXP4]], 16, [[AND2]] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call <4 x half> @llvm.exp.v4f16(<4 x half> %arg0) ret <4 x half> %result Index: test/CodeGen/AMDGPU/fneg-combines.ll =================================================================== --- test/CodeGen/AMDGPU/fneg-combines.ll +++ test/CodeGen/AMDGPU/fneg-combines.ll @@ -498,7 +498,7 @@ ; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f32: ; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]] -; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983 +; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xbe22f983 ; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[K]] ; VI: v_min_f32_e32 [[MAX:v[0-9]+]], 0.15915494, [[A]] @@ -520,7 +520,7 @@ ; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f32: ; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]] -; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983 +; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x3e22f983 ; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[K]] ; VI: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0.15915494 @@ -660,7 +660,7 @@ } ; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_foldable_use_f32: -; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983 +; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xbe22f983 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] Index: test/CodeGen/AMDGPU/immv216.ll =================================================================== --- test/CodeGen/AMDGPU/immv216.ll +++ test/CodeGen/AMDGPU/immv216.ll @@ -326,15 +326,17 @@ ; GCN-LABEL: {{^}}commute_add_literal_v2f16: ; GFX9-DAG: buffer_load_dword [[VAL:v[0-9]+]] -; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}} +; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x6400{{$}} ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], [[K]] op_sel_hi:[1,0]{{$}} ; 
GFX9: buffer_store_dword [[REG]]
 
-; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
+; VI-DAG: s_movk_i32 [[K:s[0-9]+]], 0x6400{{$}}
 ; VI-DAG: buffer_load_dword
 ; VI-NOT: and
-; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
-; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}
+; gfx8 does not support an sreg or imm operand in sdwa - a copy to a VGPR is emitted instead
+; VI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]]
+; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI: buffer_store_dword
 define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
===================================================================
--- test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -446,7 +446,7 @@
 ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
+; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
 ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
 
@@ -474,7 +474,7 @@
 ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
+; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x1234
 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
 ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
Index: test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
@@ -127,9 +127,9 @@
 ; SI-LABEL: {{^}}v_test_class_full_mask_f32:
 ; SI-DAG: buffer_load_dword [[VA:v[0-9]+]]
-; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
-; SI: v_cmp_class_f32_e32 vcc, [[VA]], [[MASK]]
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
+; SI-DAG: s_movk_i32 [[MASK:s[0-9]+]], 0x1ff{{$}}
+; SI: v_cmp_class_f32_e64 s[{{[0-9]}}:{{[0-9]}}], [[VA]], [[MASK]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, s[{{[0-9]}}:{{[0-9]}}]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
 define amdgpu_kernel void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
@@ -165,7 +165,7 @@
 ; FIXME: Why isn't this using a literal constant operand?
; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f32: ; SI-DAG: buffer_load_dword [[VB:v[0-9]+]] -; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 +; SI-DAG: s_mov_b32 [[VK:s[0-9]+]], 0x44800000 ; SI: v_cmp_class_f32_e32 vcc, [[VK]], [[VB]] ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI: buffer_store_dword [[RESULT]] @@ -284,10 +284,10 @@ ; SI-LABEL: {{^}}v_test_class_full_mask_f64: ; SI-DAG: buffer_load_dwordx2 [[VA:v\[[0-9]+:[0-9]+\]]] -; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} -; SI: v_cmp_class_f64_e32 vcc, [[VA]], [[MASK]] +; SI-DAG: s_movk_i32 [[MASK:s[0-9]+]], 0x1ff{{$}} +; SI: v_cmp_class_f64_e64 s[{{[0-9]}}:{{[0-9]}}], [[VA]], [[MASK]] ; SI-NOT: vcc -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, s[{{[0-9]}}:{{[0-9]}}] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 { @@ -377,8 +377,8 @@ ; SI-LABEL: {{^}}test_fold_or_all_tests_class_f32_0: ; SI-NOT: v_cmp_class -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}} -; SI: v_cmp_class_f32_e32 vcc, v{{[0-9]+}}, [[MASK]]{{$}} +; SI: s_movk_i32 [[MASK:s[0-9]+]], 0x3ff{{$}} +; SI: v_cmp_class_f32_e64 s[0:1], v{{[0-9]+}}, [[MASK]]{{$}} ; SI-NOT: v_cmp_class ; SI: s_endpgm define amdgpu_kernel void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { Index: test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll @@ -26,8 +26,8 @@ ; GCN-LABEL: {{^}}div_fixup_f16_imm_a ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] -; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}} -; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] +; VI: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}} +; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], s[[A_F16]], v[[B_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @div_fixup_f16_imm_a( @@ -45,8 +45,8 @@ ; GCN-LABEL: {{^}}div_fixup_f16_imm_b ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] -; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}} -; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] +; VI: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}} +; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], s[[B_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @div_fixup_f16_imm_b( @@ -64,8 +64,8 @@ ; GCN-LABEL: {{^}}div_fixup_f16_imm_c ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}} -; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] +; VI: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}} +; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], s[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @div_fixup_f16_imm_c( @@ -81,9 +81,9 @@ } ; GCN-LABEL: {{^}}div_fixup_f16_imm_a_imm_b -; VI-DAG: v_mov_b32_e32 v[[AB_F16:[0-9]+]], 0x4200{{$}} +; VI-DAG: s_movk_i32 [[AB_F16:s[0-9]+]], 0x4200{{$}} ; GCN-DAG: buffer_load_ushort v[[C_F16:[0-9]+]] -; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AB_F16]], v[[AB_F16]], v[[C_F16]] +; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], [[AB_F16]], 
[[AB_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @div_fixup_f16_imm_a_imm_b( @@ -97,9 +97,9 @@ } ; GCN-LABEL: {{^}}div_fixup_f16_imm_b_imm_c -; VI-DAG: v_mov_b32_e32 v[[BC_F16:[0-9]+]], 0x4200{{$}} +; VI-DAG: s_movk_i32 [[BC_F16:s[0-9]+]], 0x4200{{$}} ; GCN-DAG: buffer_load_ushort v[[A_F16:[0-9]+]] -; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[BC_F16]], v[[BC_F16]] +; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], [[BC_F16]], [[BC_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @div_fixup_f16_imm_b_imm_c( @@ -113,9 +113,9 @@ } ; GCN-LABEL: {{^}}div_fixup_f16_imm_a_imm_c -; VI-DAG: v_mov_b32_e32 v[[AC_F16:[0-9]+]], 0x4200{{$}} +; VI-DAG: s_movk_i32 [[AC_F16:s[0-9]+]], 0x4200{{$}} ; GCN-DAG: buffer_load_ushort v[[B_F16:[0-9]+]] -; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AC_F16]], v[[B_F16]], v[[AC_F16]] +; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], [[AC_F16]], v[[B_F16]], [[AC_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @div_fixup_f16_imm_a_imm_c( Index: test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll @@ -366,8 +366,8 @@ } ; SI-LABEL: {{^}}test_div_scale_f32_val_undef_val: -; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x41000000 -; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[K]], [[K]] +; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[K]], v{{[0-9]+}}, [[K]] define amdgpu_kernel void @test_div_scale_f32_val_undef_val(float addrspace(1)* %out) #0 { %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 8.0, float undef, i1 false) %result0 = extractvalue { float, i1 } %result, 0 @@ -376,8 +376,8 @@ } ; SI-LABEL: {{^}}test_div_scale_f32_undef_val_val: -; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x41000000 -; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[K]], [[K]] +; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[K]], v{{[0-9]+}} define amdgpu_kernel void @test_div_scale_f32_undef_val_val(float addrspace(1)* %out) #0 { %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 false) %result0 = extractvalue { float, i1 } %result, 0 Index: test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll @@ -274,9 +274,9 @@ ; GCN-LABEL: {{^}}v_fcmp_f16_oeq: ; VI: v_cmp_eq_f16_e64 -; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000 +; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}} -; SI: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]] +; SI: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[CVT]] define amdgpu_kernel void @v_fcmp_f16_oeq(i64 addrspace(1)* %out, half %src) { %result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 1) store i64 %result, i64 addrspace(1)* %out @@ -286,9 +286,9 @@ ; GCN-LABEL: {{^}}v_fcmp_f16_one: ; VI: v_cmp_neq_f16_e64 -; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000 +; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}} -; SI: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]] +; SI: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[CVT]] 
define amdgpu_kernel void @v_fcmp_f16_one(i64 addrspace(1)* %out, half %src) { %result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 6) store i64 %result, i64 addrspace(1)* %out @@ -298,9 +298,9 @@ ; GCN-LABEL: {{^}}v_fcmp_f16_ogt: ; VI: v_cmp_gt_f16_e64 -; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000 +; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}} -; SI: v_cmp_gt_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]] +; SI: v_cmp_lt_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[CVT]] define amdgpu_kernel void @v_fcmp_f16_ogt(i64 addrspace(1)* %out, half %src) { %result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 2) store i64 %result, i64 addrspace(1)* %out @@ -310,9 +310,9 @@ ; GCN-LABEL: {{^}}v_fcmp_f16_oge: ; VI: v_cmp_ge_f16_e64 -; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000 +; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}} -; SI: v_cmp_ge_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]] +; SI: v_cmp_le_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[CVT]] define amdgpu_kernel void @v_fcmp_f16_oge(i64 addrspace(1)* %out, half %src) { %result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 3) store i64 %result, i64 addrspace(1)* %out @@ -322,9 +322,9 @@ ; GCN-LABEL: {{^}}v_fcmp_f16_olt: ; VI: v_cmp_lt_f16_e64 -; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000 +; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}} -; SI: v_cmp_lt_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]] +; SI: v_cmp_gt_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[CVT]] define amdgpu_kernel void @v_fcmp_f16_olt(i64 addrspace(1)* %out, half %src) { %result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 4) store i64 %result, i64 addrspace(1)* %out @@ -334,9 +334,9 @@ ; GCN-LABEL: {{^}}v_fcmp_f16_ole: ; VI: v_cmp_le_f16_e64 -; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000 +; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}} -; SI: v_cmp_le_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]] +; SI: v_cmp_ge_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[CVT]] define amdgpu_kernel void @v_fcmp_f16_ole(i64 addrspace(1)* %out, half %src) { %result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 5) store i64 %result, i64 addrspace(1)* %out @@ -346,9 +346,9 @@ ; GCN-LABEL: {{^}}v_fcmp_f16_ueq: ; VI: v_cmp_nlg_f16_e64 -; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000 +; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}} -; SI: v_cmp_nlg_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]] +; SI: v_cmp_nlg_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[CVT]] define amdgpu_kernel void @v_fcmp_f16_ueq(i64 addrspace(1)* %out, half %src) { %result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 9) store i64 %result, i64 addrspace(1)* %out @@ -358,9 +358,9 @@ ; GCN-LABEL: {{^}}v_fcmp_f16_une: ; VI: v_cmp_neq_f16_e64 -; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000 +; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}} -; SI: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]] +; SI: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[CVT]] define amdgpu_kernel void @v_fcmp_f16_une(i64 addrspace(1)* %out, half %src) { %result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 14) store i64 %result, i64 addrspace(1)* %out @@ -370,9 +370,9 @@ ; GCN-LABEL: 
{{^}}v_fcmp_f16_ugt: ; VI: v_cmp_nle_f16_e64 -; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000 +; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}} -; SI: v_cmp_nle_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]] +; SI: v_cmp_nge_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[CVT]] define amdgpu_kernel void @v_fcmp_f16_ugt(i64 addrspace(1)* %out, half %src) { %result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 10) store i64 %result, i64 addrspace(1)* %out @@ -382,9 +382,9 @@ ; GCN-LABEL: {{^}}v_fcmp_f16_uge: ; VI: v_cmp_nlt_f16_e64 -; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000 +; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}} -; SI: v_cmp_nlt_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]] +; SI: v_cmp_ngt_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[CVT]] define amdgpu_kernel void @v_fcmp_f16_uge(i64 addrspace(1)* %out, half %src) { %result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 11) store i64 %result, i64 addrspace(1)* %out @@ -394,9 +394,9 @@ ; GCN-LABEL: {{^}}v_fcmp_f16_ult: ; VI: v_cmp_nge_f16_e64 -; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000 +; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}} -; SI: v_cmp_nge_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]] +; SI: v_cmp_nle_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[CVT]] define amdgpu_kernel void @v_fcmp_f16_ult(i64 addrspace(1)* %out, half %src) { %result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 12) store i64 %result, i64 addrspace(1)* %out @@ -406,9 +406,9 @@ ; GCN-LABEL: {{^}}v_fcmp_f16_ule: ; VI: v_cmp_ngt_f16_e64 -; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000 +; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}} -; SI: v_cmp_ngt_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]] +; SI: v_cmp_nlt_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[CVT]] define amdgpu_kernel void @v_fcmp_f16_ule(i64 addrspace(1)* %out, half %src) { %result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 13) store i64 %result, i64 addrspace(1)* %out Index: test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll @@ -34,7 +34,7 @@ } ; GCN-LABEL: {{^}}mad_f16_imm_b: -; GCN: v_mov_b32_e32 [[KB:v[0-9]+]], 0x4800 +; GCN: s_movk_i32 [[KB:s[0-9]+]], 0x4800 ; GFX8: v_mad_f16 {{v[0-9]+}}, {{v[0-9]+}}, [[KB]], ; GFX9: v_mad_legacy_f16 {{v[0-9]+}}, {{v[0-9]+}}, [[KB]], define amdgpu_kernel void @mad_f16_imm_b( Index: test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll @@ -21,8 +21,7 @@ } ; GCN-LABEL: {{^}}mad_f32_imm_a: -; GCN: v_mov_b32_e32 [[KA:v[0-9]+]], 0x41000000 -; GCN: v_ma{{[dc]}}_f32 {{v[0-9]+}}, [[KA]], +; GCN: v_madmk_f32 {{v[0-9]+}}, {{v[0-9]+}}, 0x41000000, define amdgpu_kernel void @mad_f32_imm_a( float addrspace(1)* %r, float addrspace(1)* %b, Index: test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll @@ -53,8 +53,8 @@ } ; GCN-LABEL: {{^}}test_fneg_fmed3_rr_0: -; GCN: v_bfrev_b32_e32 [[NEG0:v[0-9]+]], 1 -; GCN: v_med3_f32 
v{{[0-9]+}}, -s{{[0-9]+}}, -v{{[0-9]+}}, [[NEG0]] +; GCN: s_brev_b32 [[NEG0:s[0-9]+]], 1 +; GCN: v_med3_f32 v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}, [[NEG0]] define amdgpu_kernel void @test_fneg_fmed3_rr_0(float addrspace(1)* %out, float %src0, float %src1) #1 { %med3 = call float @llvm.amdgcn.fmed3.f32(float %src0, float %src1, float 0.0) %neg.med3 = fsub float -0.0, %med3 @@ -88,8 +88,8 @@ ; GCN-LABEL: {{^}}test_fneg_fmed3_r_inv2pi_0_foldable_user: ; GCN-DAG: v_bfrev_b32_e32 [[NEG0:v[0-9]+]], 1 -; GCN-DAG: v_mov_b32_e32 [[NEG_INV:v[0-9]+]], 0xbe22f983 -; GCN: v_med3_f32 [[MED3:v[0-9]+]], -s{{[0-9]+}}, [[NEG_INV]], [[NEG0]] +; GCN-DAG: s_mov_b32 [[NEG_INV:s[0-9]+]], 0xbe22f983 +; GCN: v_med3_f32 [[MED3:v[0-9]+]], -v{{[0-9]+}}, [[NEG_INV]], [[NEG0]] ; GCN: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[MED3]] define amdgpu_kernel void @test_fneg_fmed3_r_inv2pi_0_foldable_user(float addrspace(1)* %out, float %src0, float %mul.arg) #1 { %med3 = call float @llvm.amdgcn.fmed3.f32(float %src0, float 0x3FC45F3060000000, float 0.0) Index: test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll @@ -212,7 +212,7 @@ ; SI-LABEL: {{^}}fcmp_x2: ; FIXME: LLVM should be able to combine these fcmp opcodes. -; SI: v_cmp_gt_f32 +; SI: v_cmp_lt_f32_e32 vcc, s{{[0-9]+}}, v0 ; SI: v_cndmask_b32 ; SI: v_cmpx_le_f32 define amdgpu_ps void @fcmp_x2(float %a) #0 { Index: test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll @@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.lerp(i32, i32, i32) #0 ; GCN-LABEL: {{^}}v_lerp: -; GCN: v_lerp_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_lerp_u8 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @v_lerp(i32 addrspace(1)* %out, i32 %src) nounwind { %result= call i32 @llvm.amdgcn.lerp(i32 %src, i32 100, i32 100) #0 store i32 %result, i32 addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll @@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.msad.u8(i32, i32, i32) #0 ; GCN-LABEL: {{^}}v_msad_u8: -; GCN: v_msad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_msad_u8 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @v_msad_u8(i32 addrspace(1)* %out, i32 %src) { %result= call i32 @llvm.amdgcn.msad.u8(i32 %src, i32 100, i32 100) #0 store i32 %result, i32 addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll @@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.sad.hi.u8(i32, i32, i32) #0 ; GCN-LABEL: {{^}}v_sad_hi_u8: -; GCN: v_sad_hi_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_sad_hi_u8 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @v_sad_hi_u8(i32 addrspace(1)* %out, i32 %src) { %result= call i32 @llvm.amdgcn.sad.hi.u8(i32 %src, i32 100, i32 100) #0 store i32 %result, i32 addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll =================================================================== --- 
test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll @@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.sad.u16(i32, i32, i32) #0 ; GCN-LABEL: {{^}}v_sad_u16: -; GCN: v_sad_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_sad_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @v_sad_u16(i32 addrspace(1)* %out, i32 %src) { %result= call i32 @llvm.amdgcn.sad.u16(i32 %src, i32 100, i32 100) #0 store i32 %result, i32 addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll @@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.sad.u8(i32, i32, i32) #0 ; GCN-LABEL: {{^}}v_sad_u8: -; GCN: v_sad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_sad_u8 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @v_sad_u8(i32 addrspace(1)* %out, i32 %src) { %result= call i32 @llvm.amdgcn.sad.u8(i32 %src, i32 100, i32 100) #0 store i32 %result, i32 addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/llvm.cos.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -25,14 +25,14 @@ ; GCN-LABEL: {{^}}cos_v2f16 ; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI-DAG: v_mov_b32_e32 v[[HALF_PI:[0-9]+]], 0x3e22f983{{$}} +; SI-DAG: s_mov_b32 [[HALF_PI:s[0-9]+]], 0x3e22f983{{$}} ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PI]] +; SI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], [[HALF_PI]], v[[A_F32_0]] ; SI: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] -; SI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PI]] +; SI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], [[HALF_PI]], v[[A_F32_1]] ; SI: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] ; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] Index: test/CodeGen/AMDGPU/llvm.fma.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.fma.f16.ll +++ test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -33,13 +33,13 @@ ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] -; SI: v_mov_b32_e32 v[[A_F32:[0-9]+]], 0x40400000{{$}} +; SI: s_mov_b32 s[[A_F32:[0-9]+]], 0x40400000{{$}} ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] -; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]] +; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[B_F32:[0-9]]], s[[A_F32:[0-9]]], v[[C_F32:[0-9]]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}} -; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] +; VI: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}} +; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], s[[A_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fma_f16_imm_a( @@ -56,13 +56,13 @@ ; GCN-LABEL: {{^}}fma_f16_imm_b ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] -; SI: v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}} +; SI: s_mov_b32 s[[B_F32:[0-9]+]], 0x40400000{{$}} ; SI: v_cvt_f32_f16_e32 
v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] -; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]] +; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], s[[B_F32:[0-9]]], v[[C_F32:[0-9]]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}} -; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] +; VI: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}} +; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], s[[B_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fma_f16_imm_b( @@ -79,13 +79,13 @@ ; GCN-LABEL: {{^}}fma_f16_imm_c ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}} +; SI: s_mov_b32 s[[C_F32:[0-9]+]], 0x40400000{{$}} ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]] +; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], s[[C_F32:[0-9]]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}} -; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] +; VI: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}} +; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], s[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fma_f16_imm_c( @@ -154,8 +154,8 @@ ; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_mov_b32_e32 v[[A_F32:[0-9]+]], 0x40400000{{$}} -; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}} +; SI: s_mov_b32 s[[A_F32:[0-9]+]], 0x40400000{{$}} +; VI: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}} ; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; GCN-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] @@ -164,13 +164,13 @@ ; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]] -; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32]], v[[C_F32_0]] +; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], s[[A_F32]], v[[C_F32_1]] +; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], s[[A_F32]], v[[C_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], v[[A_F16]], v[[B_F16_1]] -; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[C_V2_F16]], v[[A_F16]], v[[B_V2_F16]] +; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], s[[A_F16]], v[[B_F16_1]] +; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[C_V2_F16]], s[[A_F16]], v[[B_V2_F16]] ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN-NOT: and @@ -195,8 +195,8 @@ ; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]] -; SI: v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}} -; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}} +; SI: s_mov_b32 s[[B_F32:[0-9]+]], 0x40400000{{$}} +; VI: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}} ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] @@ -205,15 +205,15 @@ ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI-DAG: v_cvt_f32_f16_e32 
v[[C_F32_1:[0-9]+]], v[[C_F16_1]] -; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32]], v[[C_F32_0]] +; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], s[[B_F32]], v[[C_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32]], v[[C_F32_1]] +; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], s[[B_F32]], v[[C_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_F16]], v[[C_V2_F16]] -; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16]], v[[C_F16_1]] +; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], s[[B_F16]], v[[C_V2_F16]] +; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], s[[B_F16]], v[[C_F16_1]] ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN-NOT: and @@ -238,8 +238,8 @@ ; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}} -; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}} +; SI: s_mov_b32 s[[C_F32:[0-9]+]], 0x40400000{{$}} +; VI: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}} ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] @@ -250,8 +250,8 @@ ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]] -; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32]] +; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], s[[C_F32]] +; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], s[[C_F32]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] @@ -260,8 +260,8 @@ ; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]] -; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]] +; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], s[[C_F16]] +; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], s[[C_F16]] ; GCN-NOT: and ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] Index: test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -49,7 +49,7 @@ ; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], 0x4200, v[[B_F16]] ; VI-FLUSH: buffer_store_short v[[C_F16]] -; VI-DENORM: v_mov_b32_e32 [[KA:v[0-9]+]], 0x4200 +; VI-DENORM: s_movk_i32 [[KA:s[0-9]+]], 0x4200 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[B_F16]], [[KA]], v[[C_F16]] ; VI-DENORM: buffer_store_short [[RESULT]] @@ -77,7 +77,7 @@ ; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], 0x4200, v[[A_F16]] ; VI-FLUSH: buffer_store_short v[[C_F16]] -; VI-DENORM: v_mov_b32_e32 [[KA:v[0-9]+]], 0x4200 +; VI-DENORM: s_movk_i32 [[KA:s[0-9]+]], 0x4200 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], [[KA]], v[[C_F16]] ; VI-DENORM buffer_store_short [[RESULT]] 
Index: test/CodeGen/AMDGPU/llvm.log.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.log.f16.ll +++ test/CodeGen/AMDGPU/llvm.log.f16.ll @@ -32,23 +32,24 @@ ; SI: buffer_load_dword v[[A_F16_0:[0-9]+]] ; VI: flat_load_dword v[[A_F16_0:[0-9]+]] ; GFX9: global_load_dword v[[A_F16_0:[0-9]+]] -; SI: v_mov_b32_e32 v[[A_F32_2:[0-9]+]], 0x3f317218 -; VIGFX9: v_mov_b32_e32 v[[A_F32_2:[0-9]+]], 0x398c +; SI: s_mov_b32 [[A_F32_2:s[0-9]+]], 0x3f317218 +; VIGFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x398c +; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], [[A_F32_2]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]] ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]] ; SI: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] ; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] -; SI: v_mul_f32_e32 v[[R_F32_6:[0-9]+]], v[[R_F32_1]], v[[A_F32_2]] +; SI: v_mul_f32_e32 v[[R_F32_6:[0-9]+]], [[A_F32_2]], v[[R_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_6]] -; SI: v_mul_f32_e32 v[[R_F32_5:[0-9]+]], v[[R_F32_0]], v[[A_F32_2]] +; SI: v_mul_f32_e32 v[[R_F32_5:[0-9]+]], [[A_F32_2]], v[[R_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_5]] ; GFX9: v_log_f16_e32 v[[R_F16_2:[0-9]+]], v[[A_F16_0]] ; VIGFX9: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]] -; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], v[[A_F32_2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], v[[R_F16_2]], v[[A_F32_2]] -; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], v[[R_F16_0]], v[[A_F32_2]] +; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]] +; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]] ; SI-NOT: v_and_b32_e32 ; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] Index: test/CodeGen/AMDGPU/llvm.log.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.log.ll +++ test/CodeGen/AMDGPU/llvm.log.ll @@ -31,12 +31,11 @@ ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -; GCN: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_mov_b32_e32 v[[R_F32_LOG_CONST:[0-9]+]], 0x3f317218 -; GCN: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; GFX8: v_mov_b32_e32 v[[R_F32_LOG_CONST:[0-9]+]], 0x3f317218 -; GCN: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[R_F32_LOG_CONST]] -; GCN: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[R_F32_LOG_CONST]] +; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: s_mov_b32 [[R_F32_LOG_CONST:s[0-9]+]], 0x3f317218 +; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { entry: %res = call <2 x float> @llvm.log.v2f32(<2 x float> %in) @@ -67,16 +66,15 @@ ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -; GCN: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_mov_b32_e32 
v[[R_F32_LOG_CONST:[0-9]+]], 0x3f317218 -; GCN: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; GFX8: v_mov_b32_e32 v[[R_F32_LOG_CONST:[0-9]+]], 0x3f317218 -; GCN: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[R_F32_LOG_CONST]] -; GCN: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[R_F32_LOG_CONST]] -; GCN: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[R_F32_LOG_CONST]] -; GCN: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[R_F32_LOG_CONST]] +; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: s_mov_b32 [[R_F32_LOG_CONST:s[0-9]+]], 0x3f317218 +; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { entry: %res = call <4 x float> @llvm.log.v4f32(<4 x float> %in) Index: test/CodeGen/AMDGPU/llvm.log10.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.log10.f16.ll +++ test/CodeGen/AMDGPU/llvm.log10.f16.ll @@ -32,23 +32,24 @@ ; SI: buffer_load_dword v[[A_F16_0:[0-9]+]] ; VI: flat_load_dword v[[A_F16_0:[0-9]+]] ; GFX9: global_load_dword v[[A_F16_0:[0-9]+]] -; SI: v_mov_b32_e32 v[[A_F32_2:[0-9]+]], 0x3e9a209a -; VIGFX9: v_mov_b32_e32 v[[A_F32_2:[0-9]+]], 0x34d1 +; SI: s_mov_b32 [[A_F32_2:s[0-9]+]], 0x3e9a209a +; VIGFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x34d1 +; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], [[A_F32_2]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]] ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]] ; SI: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] ; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] -; SI: v_mul_f32_e32 v[[R_F32_6:[0-9]+]], v[[R_F32_1]], v[[A_F32_2]] +; SI: v_mul_f32_e32 v[[R_F32_6:[0-9]+]], [[A_F32_2]], v[[R_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_6]] -; SI: v_mul_f32_e32 v[[R_F32_5:[0-9]+]], v[[R_F32_0]], v[[A_F32_2]] +; SI: v_mul_f32_e32 v[[R_F32_5:[0-9]+]], [[A_F32_2]], v[[R_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_5]] ; GFX9: v_log_f16_e32 v[[R_F16_2:[0-9]+]], v[[A_F16_0]] ; VIGFX9: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]] -; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], v[[A_F32_2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], v[[R_F16_2]], v[[A_F32_2]] -; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], v[[R_F16_0]], v[[A_F32_2]] +; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]] +; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]] ; SI-NOT: v_and_b32_e32 ; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] Index: test/CodeGen/AMDGPU/llvm.log10.ll =================================================================== --- 
test/CodeGen/AMDGPU/llvm.log10.ll +++ test/CodeGen/AMDGPU/llvm.log10.ll @@ -31,12 +31,11 @@ ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -; GCN: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_mov_b32_e32 v[[R_F32_LOG_CONST:[0-9]+]], 0x3e9a209a -; GCN: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; GFX8: v_mov_b32_e32 v[[R_F32_LOG_CONST:[0-9]+]], 0x3e9a209a -; GCN: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[R_F32_LOG_CONST]] -; GCN: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[R_F32_LOG_CONST]] +; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: s_mov_b32 [[R_F32_LOG_CONST:s[0-9]+]], 0x3e9a209a +; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { entry: %res = call <2 x float> @llvm.log10.v2f32(<2 x float> %in) @@ -67,16 +66,15 @@ ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -; GCN: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_mov_b32_e32 v[[R_F32_LOG_CONST:[0-9]+]], 0x3e9a209a -; GCN: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; GFX8: v_mov_b32_e32 v[[R_F32_LOG_CONST:[0-9]+]], 0x3e9a209a -; GCN: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[R_F32_LOG_CONST]] -; GCN: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[R_F32_LOG_CONST]] -; GCN: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[R_F32_LOG_CONST]] -; GCN: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[R_F32_LOG_CONST]] +; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: s_mov_b32 [[R_F32_LOG_CONST:s[0-9]+]], 0x3e9a209a +; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { entry: %res = call <4 x float> @llvm.log10.v4f32(<4 x float> %in) Index: test/CodeGen/AMDGPU/llvm.sin.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -25,14 +25,14 @@ ; GCN-LABEL: {{^}}sin_v2f16: ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_mov_b32_e32 v[[HALF_PI:[0-9]+]], 0x3e22f983{{$}} +; SI: s_mov_b32 [[HALF_PI:s[0-9]+]], 0x3e22f983{{$}} ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PI]] +; SI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], [[HALF_PI]], v[[A_F32_0]] ; SI: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] -; SI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PI]] +; SI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], [[HALF_PI]], v[[A_F32_1]] ; SI: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] ; SI: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]] ; SI: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]] Index: test/CodeGen/AMDGPU/mad-mix.ll 
Index: test/CodeGen/AMDGPU/mad-mix.ll
===================================================================
--- test/CodeGen/AMDGPU/mad-mix.ll
+++ test/CodeGen/AMDGPU/mad-mix.ll
@@ -214,9 +214,9 @@
 ; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32imm1:
 ; GCN: s_waitcnt
-; GFX9: v_mov_b32_e32 v2, 1.0
-; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
-; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
+; GFX9: s_mov_b32 [[SREG:s[0-9]+]], 1.0
+; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
+; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
 ; CIVI: v_mad_f32 v0, v0, v1, 1.0
 ; GCN-NEXT: s_setpc_b64
@@ -229,9 +229,9 @@
 ; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32imminv2pi:
 ; GCN: s_waitcnt
-; GFX9: v_mov_b32_e32 v2, 0.15915494
-; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
-; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
+; GFX9: s_mov_b32 [[SREG:s[0-9]+]], 0.15915494
+; GFX900: v_mad_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
 ; VI: v_mad_f32 v0, v0, v1, 0.15915494
 define float @v_mad_mix_f32_f16lo_f16lo_f32imminv2pi(half %src0, half %src1) #0 {
   %src0.ext = fpext half %src0 to float
@@ -246,9 +246,9 @@
 ; fpext f16 1/2pi = 0x3e230000
 ; f32 1/2pi = 0x3e22f983
 ; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi:
-; GFX9: v_mov_b32_e32 v2, 0x3e230000
-; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
-; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
+; GFX9: s_mov_b32 [[SREG:s[0-9]+]], 0x3e230000
+; GFX900: v_mad_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
 ; CIVI: v_madak_f32 v0, v0, v1, 0x3e230000
 define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi(half %src0, half %src1) #0 {
@@ -260,9 +260,9 @@
 }
 ; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_cvtf16imm63:
-; GFX9: v_mov_b32_e32 v2, 0x367c0000
-; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
-; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
+; GFX9: s_mov_b32 [[SREG:s[0-9]+]], 0x367c0000
+; GFX900: v_mad_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
 ; CIVI: v_madak_f32 v0, v0, v1, 0x367c0000
 define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 {
@@ -274,13 +274,13 @@
 }
 ; GCN-LABEL: {{^}}v_mad_mix_v2f32_f32imm1:
-; GFX9: v_mov_b32_e32 v3, 1.0
-; GFX900: v_mad_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
-; GFX900: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX9: s_mov_b32 [[SREG:s[0-9]+]], 1.0
+; GFX900: v_mad_mix_f32 v2, v0, v1, [[SREG]] op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
 ; GFX900: v_mov_b32_e32 v1, v2
-; GFX906: v_fma_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
-; GFX906: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v2, v0, v1, [[SREG]] op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
 ; GFX906: v_mov_b32_e32 v1, v2
 define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) #0 {
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
@@ -290,14 +290,14 @@
 }
 ; GCN-LABEL: {{^}}v_mad_mix_v2f32_cvtf16imminv2pi:
-; GFX9: v_mov_b32_e32 v3, 0x3e230000
+; GFX9: s_mov_b32 [[SREG:s[0-9]+]], 0x3e230000
-; GFX900: v_mad_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
-; GFX900: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v2, v0, v1, [[SREG]] op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
 ; GFX900: v_mov_b32_e32 v1, v2
-; GFX906: v_fma_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
-; GFX906: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v2, v0, v1, [[SREG]] op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
 ; GFX906: v_mov_b32_e32 v1, v2
 define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 {
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
@@ -308,14 +308,14 @@
 }
 ; GCN-LABEL: {{^}}v_mad_mix_v2f32_f32imminv2pi:
-; GFX9: v_mov_b32_e32 v3, 0.15915494
+; GFX9: s_mov_b32 [[SREG:s[0-9]+]], 0.15915494
-; GFX900: v_mad_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
-; GFX900: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v2, v0, v1, [[SREG]] op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
 ; GFX900: v_mov_b32_e32 v1, v2
-; GFX906: v_fma_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
-; GFX906: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v2, v0, v1, [[SREG]] op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
 ; GFX906: v_mov_b32_e32 v1, v2
 define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 {
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
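A side note on the cvtf16imminv2pi cases above, spelling out the arithmetic behind the test's own comment (my derivation, not text from the patch): widening the f16 approximation of 1/(2*pi) does not reproduce the f32 inline constant, which is why the value needs a register at all.

\[
\mathtt{0x3118}_{f16} = 1.0100011000_2 \times 2^{12-15}
\;\xrightarrow{\ \mathrm{fpext}\ }\;
\underbrace{0}_{\mathrm{sign}}\;\underbrace{01111100}_{-3+127}\;\underbrace{0100011000\,0^{13}}_{\mathrm{mantissa}}
= \mathtt{0x3e230000}
\]

Since 0x3e230000 differs from the true f32 value 0x3e22f983, it cannot use the GFX9 inline-immediate encoding for 0.15915494, and with this patch it is materialized by s_mov_b32 rather than v_mov_b32.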
Index: test/CodeGen/AMDGPU/madmk.ll
===================================================================
--- test/CodeGen/AMDGPU/madmk.ll
+++ test/CodeGen/AMDGPU/madmk.ll
@@ -31,9 +31,9 @@
 ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
-; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GCN-DAG: v_mac_f32_e32 [[VB]], [[VA]], [[VK]]
-; GCN-DAG: v_mac_f32_e32 [[VC]], [[VA]], [[VK]]
+; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x41200000
+; GCN-DAG: v_mac_f32_e32 [[VB]], [[SK]], [[VA]]
+; GCN-DAG: v_mac_f32_e32 [[VC]], [[SK]], [[VA]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -94,8 +94,10 @@
 }
 ; GCN-LABEL: {{^}}v_s_madmk_f32:
-; GCN-NOT: v_madmk_f32
-; GCN: v_mad_f32
+; GCN: s_load_dword [[SREG:s[0-9]+]]
+; GCN: buffer_load_dword [[VREG1:v[0-9]+]]
+; GCN: v_mov_b32_e32 [[VREG2:v[0-9]+]], [[SREG]]
+; GCN: v_mac_f32_e32 [[VREG2]], 0x41200000, [[VREG1]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -128,8 +130,8 @@
 ; GCN-LABEL: {{^}}no_madmk_src0_modifier_f32:
 ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GCN: v_mad_f32 {{v[0-9]+}}, |[[VA]]|, [[VK]], [[VB]]
+; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x41200000
+; GCN: v_mad_f32 {{v[0-9]+}}, |[[VA]]|, [[SK]], [[VB]]
 define amdgpu_kernel void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -150,7 +152,7 @@
 ; GCN-LABEL: {{^}}no_madmk_src2_modifier_f32:
 ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, |{{[sv][0-9]+}}|
+; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{[sv][0-9]+}}, |{{v[0-9]+}}|
 define amdgpu_kernel void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -170,8 +172,8 @@
 ; GCN-LABEL: {{^}}madmk_add_inline_imm_f32:
 ; GCN: buffer_load_dword [[A:v[0-9]+]]
-; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GCN: v_mad_f32 {{v[0-9]+}}, [[A]], [[VK]], 2.0
+; GCN: s_mov_b32 [[SK:s[0-9]+]], 0x41200000
+; GCN: v_mad_f32 {{v[0-9]+}}, [[A]], [[SK]], 2.0
 define amdgpu_kernel void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
Index: test/CodeGen/AMDGPU/sdiv.ll
===================================================================
--- test/CodeGen/AMDGPU/sdiv.ll
+++ test/CodeGen/AMDGPU/sdiv.ll
@@ -36,7 +36,7 @@
 ; FUNC-LABEL: {{^}}slow_sdiv_i32_3435:
 ; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]],
-; SI-DAG: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b
+; SI-DAG: s_mov_b32 [[MAGIC:s[0-9]+]], 0x98a1930b
 ; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[VAL]], [[MAGIC]]
 ; SI: v_add_{{[iu]}}32
 ; SI: v_lshrrev_b32
Index: test/CodeGen/AMDGPU/select.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/select.f16.ll
+++ test/CodeGen/AMDGPU/select.f16.ll
@@ -197,14 +197,14 @@
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
-; SI: v_cmp_gt_f32_e32
+; SI: v_cmp_lt_f32_e32
 ; SI: v_cndmask_b32_e32
- ; SI: v_cmp_lt_f32_e32 vcc, 0.5
+; SI: v_cmp_lt_f32_e32 vcc, 0.5
 ; SI: v_cndmask_b32_e32
 ; VI: v_cmp_lt_f16_e32
 ; VI: v_cndmask_b32_e32
-; VI: v_cmp_gt_f16_e32
+; VI: v_cmp_lt_f16_e32
 ; VI: v_cndmask_b32_e32
 ; SI: v_cvt_f16_f32_e32
@@ -233,14 +233,14 @@
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
-; SI: v_cmp_lt_f32_e32
+; SI: v_cmp_gt_f32_e32
 ; SI: v_cndmask_b32_e32
 ; SI: v_cmp_gt_f32_e32 vcc, 0.5
 ; SI: v_cndmask_b32_e32
 ; VI: v_cmp_gt_f16_e32
 ; VI: v_cndmask_b32_e32
-; VI: v_cmp_lt_f16_e32
+; VI: v_cmp_gt_f16_e32
 ; VI: v_cndmask_b32_e32
 ; SI: v_cvt_f16_f32_e32
Index: test/CodeGen/AMDGPU/setcc-opt.ll
===================================================================
--- test/CodeGen/AMDGPU/setcc-opt.ll
+++ test/CodeGen/AMDGPU/setcc-opt.ll
@@ -182,7 +182,7 @@
 ; FUNC-LABEL: {{^}}v_cmp_sext_k_neg1_i8_sext_arg:
 ; GCN: v_cmp_ne_u32_e32 vcc, -1, v0
-; GCN-NEXT: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 0, 1, vcc
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 0, 1, vcc
 ; GCN: buffer_store_byte [[SELECT]]
 define void @v_cmp_sext_k_neg1_i8_sext_arg(i8 signext %b) nounwind {
   %b.ext = sext i8 %b to i32
Index: test/CodeGen/AMDGPU/srem.ll
===================================================================
--- test/CodeGen/AMDGPU/srem.ll
+++ test/CodeGen/AMDGPU/srem.ll
@@ -19,7 +19,7 @@
 }
 ; FUNC-LABEL: {{^}}srem_i32_7:
-; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x92492493
+; SI: s_mov_b32 [[MAGIC:s[0-9]+]], 0x92492493
 ; SI: v_mul_hi_i32 {{v[0-9]+}}, {{v[0-9]+}}, [[MAGIC]]
 ; SI: v_mul_lo_i32
 ; SI: v_sub_{{[iu]}}32
Index: test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
===================================================================
--- test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
+++ test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
@@ -4,22 +4,22 @@
 target triple="amdgcn--"
 ; CHECK-LABEL: foobar:
-; CHECK: s_load_dwordx2 s[2:3], s[0:1], 0x9
+; CHECK: s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
 ; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64
+; CHECK-NEXT: s_mov_b32 s2, -1
 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
-; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; CHECK-NEXT: v_mov_b32_e32 v1, s5
+; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; CHECK: BB0_1:
-; CHECK-NEXT: ; kill: def $vgpr0_vgpr1 killed $sgpr2_sgpr3 killed $exec
+; CHECK-NEXT: ; kill: def $vgpr0_vgpr1 killed $sgpr4_sgpr5 killed $exec
 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
 ; CHECK: BB0_2:
-; CHECK: s_or_b64 exec, exec, s[2:3]
+; CHECK: s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT: s_mov_b32 s3, 0xf000
-; CHECK-NEXT: s_mov_b32 s2, -1
 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], 0
 ; CHECK-NEXT: s_endpgm
 define amdgpu_kernel void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind {
Index: test/CodeGen/AMDGPU/udiv.ll
===================================================================
--- test/CodeGen/AMDGPU/udiv.ll
+++ test/CodeGen/AMDGPU/udiv.ll
@@ -73,7 +73,7 @@
 ; FUNC-LABEL: {{^}}udiv_i32_div_k_even:
 ; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
-; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xfabbd9c1
+; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xfabbd9c1
 ; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[VAL]], [[K]]
 ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 25, [[MULHI]]
 ; SI: buffer_store_dword [[RESULT]]
@@ -87,7 +87,7 @@
 ; FUNC-LABEL: {{^}}udiv_i32_div_k_odd:
 ; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
-; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x7d5deca3
+; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x7d5deca3
 ; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[VAL]], [[K]]
 ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 24, [[MULHI]]
 ; SI: buffer_store_dword [[RESULT]]
Index: test/CodeGen/AMDGPU/urem.ll
===================================================================
--- test/CodeGen/AMDGPU/urem.ll
+++ test/CodeGen/AMDGPU/urem.ll
@@ -19,8 +19,8 @@
 }
 ; FUNC-LABEL: {{^}}test_urem_i32_7:
-; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x24924925
-; SI: v_mul_hi_u32 [[MAGIC]], {{v[0-9]+}}
+; SI: s_mov_b32 [[MAGIC:s[0-9]+]], 0x24924925
+; SI: v_mul_hi_u32 {{v[0-9]+}}, {{v[0-9]+}}, [[MAGIC]]
 ; SI: v_subrev_{{[iu]}}32
 ; SI: v_mul_lo_i32
 ; SI: v_sub_{{[iu]}}32
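For the sdiv/srem/udiv/urem checks just above, the magic multipliers come from the usual round-up-reciprocal construction. Sketching the unsigned divide-by-7 case behind test_urem_i32_7 (my derivation, following the standard Hacker's Delight recipe; not text from the patch): with

\[
M = \lceil 2^{35}/7 \rceil - 2^{32} = \mathtt{0x24924925},
\qquad
q = \lfloor M n / 2^{32} \rfloor \quad (\text{the v\_mul\_hi\_u32 step}),
\]
\[
\lfloor n/7 \rfloor = \left\lfloor \frac{\lfloor (n-q)/2 \rfloor + q}{4} \right\rfloor
\quad \text{for all } 0 \le n < 2^{32},
\]

and the remainder is then n - 7*(n/7) via v_mul_lo_i32 and v_sub. The signed srem_i32_7 constant 0x92492493 is the analogous ceil(2^34/7), used as a signed multiplier with an add-back of n. The patch only moves these constants from v_mov_b32 into an SGPR; the arithmetic sequences are unchanged.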
Index: test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
===================================================================
--- test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -135,8 +135,9 @@
 ; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_k_s:
 ; GCN-DAG: s_load_dword [[SGPR:s[0-9]+]]
-; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
-; GCN: v_fma_f32 [[RESULT0:v[0-9]+]], [[VK]], [[VK]], [[SGPR]]
+; GCN-DAG: v_mov_b32_e32 [[VGPR:v[0-9]+]], [[SGPR]]
+; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x44800000
+; GCN: v_fma_f32 [[RESULT0:v[0-9]+]], [[SK]], [[SK]], [[VGPR]]
 ; GCN: buffer_store_dword [[RESULT0]]
 define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %a) #1
@@ -145,11 +146,12 @@
 }
 ; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_k_s_x2:
-; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
-; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VK]], [[VK]], s[[SGPR0]]
-; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VK]], [[VK]], s[[SGPR1]]
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]{{\:}}[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; GCN-DAG: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s[[SGPR0]]
+; GCN-DAG: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]]
+; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x44800000
+; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SK]], [[SK]], [[VGPR0]]
+; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SK]], [[SK]], [[VGPR1]]
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
 ; GCN: s_endpgm
@@ -163,8 +165,9 @@
 ; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_s_k:
 ; GCN-DAG: s_load_dword [[SGPR:s[0-9]+]]
-; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
-; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[VK]], [[VK]]
+; GCN-DAG: v_mov_b32_e32 [[VGPR:v[0-9]+]], [[SGPR]]
+; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x44800000
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR]], [[SK]], [[SK]]
 ; GCN: buffer_store_dword [[RESULT]]
 define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float 1024.0, float %a, float 1024.0) #1
@@ -173,11 +176,12 @@
 }
 ; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_s_k_x2:
-; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
-; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SGPR0]], [[VK]], [[VK]]
-; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SGPR1]], [[VK]], [[VK]]
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]{{\:}}[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; GCN-DAG: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s[[SGPR0]]
+; GCN-DAG: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]]
+; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x44800000
+; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VGPR0]], [[SK]], [[SK]]
+; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VGPR1]], [[SK]], [[SK]]
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
 ; GCN: s_endpgm
@@ -191,8 +195,9 @@
 ; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_s_k_k:
 ; GCN-DAG: s_load_dword [[SGPR:s[0-9]+]]
-; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
-; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[VK]], [[VK]]
+; GCN-DAG: v_mov_b32_e32 [[VGPR:v[0-9]+]], [[SGPR]]
+; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x44800000
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR]], [[SK]], [[SK]]
 ; GCN: buffer_store_dword [[RESULT]]
 define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float 1024.0, float 1024.0) #1
@@ -201,11 +206,12 @@
 }
 ; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_s_k_k_x2:
-; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
-; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SGPR0]], [[VK]], [[VK]]
-; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SGPR1]], [[VK]], [[VK]]
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]{{\:}}[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; GCN-DAG: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s[[SGPR0]]
+; GCN-DAG: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]]
+; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x44800000
+; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VGPR0]], [[SK]], [[SK]]
+; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VGPR1]], [[SK]], [[SK]]
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
 ; GCN: s_endpgm
@@ -220,12 +226,13 @@
 ; GCN-LABEL: {{^}}test_s0_s1_k_f32:
 ; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; GCN-DAG: v_mov_b32_e32 [[VK0:v[0-9]+]], 0x44800000
+; GCN-DAG: s_mov_b32 [[SK0:s[0-9]+]], 0x44800000
 ; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], s[[SGPR1]]
+; GCN-DAG: v_mov_b32_e32 [[VS0:v[0-9]+]], s[[SGPR0]]
-; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SGPR0]], [[VS1]], [[VK0]]
-; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x45800000
-; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SGPR0]], [[VS1]], [[VK1]]
+; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VS0]], [[VS1]], [[SK0]]
+; GCN-DAG: s_mov_b32 [[SK1:s[0-9]+]], 0x45800000
+; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VS0]], [[VS1]], [[SK1]]
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
Index: test/CodeGen/AMDGPU/valu-i1.ll
===================================================================
--- test/CodeGen/AMDGPU/valu-i1.ll
+++ test/CodeGen/AMDGPU/valu-i1.ll
@@ -9,6 +9,7 @@
 ; waitcnt should be inserted after exec modification
 ; SI: v_cmp_lt_i32_e32 vcc, 0,
+; SI: v_mov_b32_e32 {{v[0-9]+}}, 0
 ; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
 ; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
 ; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
Index: test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- test/CodeGen/AMDGPU/wqm.ll
+++ test/CodeGen/AMDGPU/wqm.ll
@@ -645,11 +645,11 @@
 ; CHECK: image_store
 ; CHECK: s_wqm_b64 exec, exec
 ; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
-; CHECK-DAG: v_mov_b32_e32 [[SEVEN:v[0-9]+]], 0x40e00000
+; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000
 ; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
 ; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
-; CHECK: v_cmp_gt_f32_e32 vcc, [[CTR]], [[SEVEN]]
+; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
 ; CHECK: s_cbranch_vccz [[LOOPHDR]]
 ; CHECK: ; %break
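The flipped comparisons in the select.f16 and wqm checks are the same commute applied in VOPC form: "gt" against a constant becomes "lt" with the operands exchanged, so the SGPR-resident literal can occupy src0. A hypothetical illustration in the same style (not part of this patch; the RUN line and check strings are assumptions):

; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
; 7.0 (0x40e00000) is not an inline constant; "x > 7.0" is expected to be
; selected as "7.0 < x" with the constant kept in an SGPR in src0.
; CHECK: s_mov_b32 [[K:s[0-9]+]], 0x40e00000
; CHECK: v_cmp_lt_f32_e32 vcc, [[K]], v{{[0-9]+}}
define <2 x i32> @cmp_v2f32_gt_k(<2 x float> %x) {
  %c = fcmp ogt <2 x float> %x, <float 7.0, float 7.0>
  %r = zext <2 x i1> %c to <2 x i32>
  ret <2 x i32> %r
}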