diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -124,7 +124,8 @@ void addSCCDefUsersToVALUWorklist(MachineOperand &Op, MachineInstr &SCCDefInst, - SetVectorType &Worklist) const; + SetVectorType &Worklist, + Register NewCond = Register()) const; const TargetRegisterClass * getDestEquivalentVGPRClass(const MachineInstr &Inst) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -637,6 +637,12 @@ } if (RC == &AMDGPU::SReg_64RegClass) { + if (SrcReg == AMDGPU::SCC) { + BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg) + .addImm(1) + .addImm(0); + return; + } if (DestReg == AMDGPU::VCC) { if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) @@ -4119,20 +4125,20 @@ case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; - case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; - case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; - case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; - case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; - case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; - case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; - case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32; - case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32; - case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32; - case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32; - case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32; - case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32; - case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32; - case AMDGPU::S_CMP_LG_U64: 
return AMDGPU::V_CMP_NE_U64_e32; + case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64; + case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64; + case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64; + case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64; + case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64; + case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64; + case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64; + case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64; + case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64; + case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64; + case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64; + case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64; + case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64; + case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64; case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; @@ -4523,13 +4529,13 @@ continue; } - if (RI.hasAGPRs(MRI.getRegClass(MO.getReg())) && + if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) && !isOperandLegal(MI, Idx, &MO)) { legalizeOpWithMove(MI, Idx); continue; } - if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) + if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg()))) continue; // VGPRs are legal // We can use one SGPR in each VOP3 instruction prior to GFX10 @@ -5165,7 +5171,7 @@ unsigned Opcode = Inst.getOpcode(); unsigned NewOpcode = getVALUOp(Inst); - + Register CondReg = RI.getVCC(); // Handle some special cases switch (Opcode) { default: @@ -5284,18 +5290,18 @@ continue; case AMDGPU::S_CBRANCH_SCC0: - case AMDGPU::S_CBRANCH_SCC1: + case AMDGPU::S_CBRANCH_SCC1: { // Clear unused bits of vcc - if (ST.isWave32()) - BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B32), - AMDGPU::VCC_LO) - .addReg(AMDGPU::EXEC_LO) - 
.addReg(AMDGPU::VCC_LO); - else - BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64), - AMDGPU::VCC) - .addReg(AMDGPU::EXEC) - .addReg(AMDGPU::VCC); + Register CondReg = Inst.getOperand(1).getReg(); + bool IsSCC = CondReg == AMDGPU::SCC; + Register VCC = RI.getVCC(); + Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; + BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC) + .addReg(EXEC) + .addReg(IsSCC ? VCC : CondReg); + Inst.RemoveOperand(1); + } break; case AMDGPU::S_BFE_U64: @@ -5397,6 +5403,33 @@ Inst.eraseFromParent(); } continue; + case AMDGPU::S_CMP_EQ_I32: + case AMDGPU::S_CMP_LG_I32: + case AMDGPU::S_CMP_GT_I32: + case AMDGPU::S_CMP_GE_I32: + case AMDGPU::S_CMP_LT_I32: + case AMDGPU::S_CMP_LE_I32: + case AMDGPU::S_CMP_EQ_U32: + case AMDGPU::S_CMP_LG_U32: + case AMDGPU::S_CMP_GT_U32: + case AMDGPU::S_CMP_GE_U32: + case AMDGPU::S_CMP_LT_U32: + case AMDGPU::S_CMP_LE_U32: + case AMDGPU::S_CMP_EQ_U64: + case AMDGPU::S_CMP_LG_U64: { + const MCInstrDesc &NewDesc = get(NewOpcode); + CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass()); + MachineInstr *NewInstr = + BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg) + .add(Inst.getOperand(0)) + .add(Inst.getOperand(1)); + legalizeOperands(*NewInstr, MDT); + int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC); + MachineOperand SCCOp = Inst.getOperand(SCCIdx); + addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); + Inst.eraseFromParent(); + } + continue; } if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { @@ -5418,7 +5451,7 @@ if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { // Only propagate through live-def of SCC. 
if (Op.isDef() && !Op.isDead()) - addSCCDefUsersToVALUWorklist(Op, Inst, Worklist); + addSCCDefUsersToVALUWorklist(Op, Inst, Worklist, RI.getVCC()); Inst.RemoveOperand(i); } } @@ -5833,7 +5866,7 @@ const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); const TargetRegisterClass *Src1RC = Src1.isReg() ? - MRI.getRegClass(Src1.getReg()) : + RI.getRegClassForReg(MRI, Src1.getReg()) : &AMDGPU::SGPR_32RegClass; const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); @@ -6117,7 +6150,8 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, MachineInstr &SCCDefInst, - SetVectorType &Worklist) const { + SetVectorType &Worklist, + Register NewCond) const { // Ensure that def inst defines SCC, which is still live. assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() && !Op.isDead() && Op.getParent() == &SCCDefInst); @@ -6128,23 +6162,18 @@ make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)), SCCDefInst.getParent()->end())) { // Check if SCC is used first. - if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) { + int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI); + if (SCCIdx != -1) { if (MI.isCopy()) { MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); unsigned DestReg = MI.getOperand(0).getReg(); - SmallVector Users; - for (auto &User : MRI.use_nodbg_instructions(DestReg)) { - if ((User.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) || - (User.getOpcode() == AMDGPU::S_SUB_CO_PSEUDO)) { - Users.push_back(&User); - Worklist.insert(&User); - } - } - for (auto &U : Users) - U->getOperand(4).setReg(RI.getVCC()); + MRI.replaceRegWith(DestReg, NewCond); CopyToDelete.push_back(&MI); - } else + } else { + if (NewCond.isValid()) + MI.getOperand(SCCIdx).setReg(NewCond); Worklist.insert(&MI); + } } // Exit if we find another SCC def. 
if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -571,15 +571,7 @@ def si_setcc_uniform : PatFrag < (ops node:$lhs, node:$rhs, node:$cond), (setcc node:$lhs, node:$rhs, node:$cond), [{ - for (SDNode *Use : N->uses()) { - if (Use->isMachineOpcode() || Use->getOpcode() != ISD::CopyToReg) - return false; - - unsigned Reg = cast(Use->getOperand(1))->getReg(); - if (Reg != AMDGPU::SCC) - return false; - } - return true; + return !N->isDivergent(); }]>; //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll --- a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll @@ -57,8 +57,8 @@ } ; FUNC-LABEL: {{^}}null_32bit_lds_ptr: -; SI: v_cmp_ne_u32 -; SI-NOT: v_cmp_ne_u32 +; SI: s_cmp_lg_u32 +; SI: s_cselect_b64 vcc, 1, 0 ; SI: v_cndmask_b32 define amdgpu_kernel void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind { %cmp = icmp ne i32 addrspace(3)* %lds, null diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -10,7 +10,8 @@ ; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}} ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}} ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] -; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1 +; CI-DAG: s_cmp_lg_u32 [[PTR]], -1 +; CI-DAG: s_cselect_b64 vcc, 1, 0 ; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc ; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] @@ -22,7 
+23,8 @@ ; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]] ; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base -; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], -1 +; GFX9: s_cmp_lg_u32 [[PTR]], -1 +; GFX9: s_cselect_b64 vcc, 1, 0 ; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc ; GFX9-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] @@ -76,7 +78,8 @@ ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] ; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 -; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1 +; CI-DAG: s_cmp_lg_u32 [[PTR]], -1 +; CI-DAG: s_cselect_b64 vcc, 1, 0 ; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc ; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] @@ -89,7 +92,8 @@ ; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base ; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 -; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], -1 +; GFX9: s_cmp_lg_u32 [[PTR]], -1 +; GFX9: s_cselect_b64 vcc, 1, 0 ; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc ; GFX9: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] @@ -148,7 +152,8 @@ ; HSA: enable_sgpr_queue_ptr = 0 ; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}} -; HSA-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}} +; CI-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}} +; GFX9-DAG: s_cmp_lg_u64 s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}} ; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]] ; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]] ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}} @@ -165,7 +170,8 @@ ; HSA: enable_sgpr_queue_ptr = 0 ; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}} -; HSA-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}} +; CI-DAG: v_cmp_ne_u64_e64 vcc, 
s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}} +; GFX9-DAG: s_cmp_lg_u64 s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}} ; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]] ; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]] ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll --- a/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll @@ -18,7 +18,8 @@ ; GCN-ALLOCA: v_add_{{[iu]}}32_e32 [[RESULT:v[0-9]+]], vcc, v{{[0-9]+}}, v0 -; GCN-PROMOTE: v_cmp_eq_u32_e64 vcc, [[IN]], 1 +; GCN-PROMOTE: s_cmp_eq_u32 [[IN]], 1 +; GCN-PROMOTE: s_cselect_b64 vcc, 1, 0 ; GCN-PROMOTE-NEXT: v_addc_u32_e32 [[RESULT:v[0-9]+]], vcc, 0, v0, vcc ; GCN: buffer_store_dword [[RESULT]] diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -42,7 +42,7 @@ ; CHECK-NEXT: [[TMP35:%.*]] = sub i32 [[TMP28]], 1 ; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP28]] ; CHECK-NEXT: [[TMP37:%.*]] = select i1 [[TMP32]], i32 [[TMP36]], i32 [[TMP35]] -; CHECK-NEXT: store i32 [[TMP37]], i32 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i32 [[TMP37]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_i32: @@ -121,7 +121,7 @@ ; CHECK-NEXT: [[TMP35:%.*]] = add i32 [[TMP30]], [[Y]] ; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP30]] ; CHECK-NEXT: [[TMP37:%.*]] = select i1 [[TMP32]], i32 [[TMP36]], i32 [[TMP35]] -; CHECK-NEXT: store i32 [[TMP37]], i32 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i32 [[TMP37]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: urem_i32: @@ -209,7 +209,7 @@ ; CHECK-NEXT: [[TMP44:%.*]] = select i1 [[TMP39]], 
i32 [[TMP43]], i32 [[TMP42]] ; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP44]], [[TMP3]] ; CHECK-NEXT: [[TMP46:%.*]] = sub i32 [[TMP45]], [[TMP3]] -; CHECK-NEXT: store i32 [[TMP46]], i32 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i32 [[TMP46]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: sdiv_i32: @@ -221,17 +221,17 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i32 s8, s3, 31 ; GCN-NEXT: s_add_i32 s3, s3, s8 -; GCN-NEXT: s_xor_b32 s9, s3, s8 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-NEXT: s_add_i32 s2, s2, s3 -; GCN-NEXT: s_xor_b32 s2, s2, s3 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: s_xor_b32 s3, s3, s8 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GCN-NEXT: s_ashr_i32 s9, s2, 31 +; GCN-NEXT: s_add_i32 s2, s2, s9 +; GCN-NEXT: s_xor_b32 s2, s2, s9 +; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: s_xor_b32 s8, s9, s8 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s9 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s9 +; GCN-NEXT: v_mul_lo_u32 v1, v0, s3 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s3 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] @@ -240,17 +240,17 @@ ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GCN-NEXT: v_mul_hi_u32 v0, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s9 +; GCN-NEXT: v_mul_lo_u32 v1, v0, s3 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, s2, v1 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, s2, v1 -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4 ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], vcc ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GCN-NEXT: v_xor_b32_e32 v0, s3, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 +; GCN-NEXT: 
v_xor_b32_e32 v0, s8, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm %r = sdiv i32 %x, %y @@ -305,7 +305,7 @@ ; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP38]], i32 [[TMP42]], i32 [[TMP41]] ; CHECK-NEXT: [[TMP44:%.*]] = xor i32 [[TMP43]], [[TMP1]] ; CHECK-NEXT: [[TMP45:%.*]] = sub i32 [[TMP44]], [[TMP1]] -; CHECK-NEXT: store i32 [[TMP45]], i32 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i32 [[TMP45]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: srem_i32: @@ -316,17 +316,17 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i32 s2, s5, 31 ; GCN-NEXT: s_add_i32 s3, s5, s2 -; GCN-NEXT: s_xor_b32 s10, s3, s2 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 -; GCN-NEXT: s_ashr_i32 s8, s4, 31 -; GCN-NEXT: s_add_i32 s4, s4, s8 -; GCN-NEXT: s_xor_b32 s9, s4, s8 +; GCN-NEXT: s_xor_b32 s8, s3, s2 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GCN-NEXT: s_ashr_i32 s9, s4, 31 +; GCN-NEXT: s_add_i32 s4, s4, s9 +; GCN-NEXT: s_xor_b32 s10, s4, s9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s10 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s10 +; GCN-NEXT: v_mul_lo_u32 v1, v0, s8 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s8 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[2:3] @@ -334,18 +334,18 @@ ; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s9 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s10 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s9, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s9, v0 -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s10, v1 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s10, v1 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s10 +; 
GCN-NEXT: v_mul_lo_u32 v0, v0, s8 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, s10, v0 +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s10, v0 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s8, v1 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s8, v1 ; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[2:3] -; GCN-NEXT: v_xor_b32_e32 v0, s8, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 +; GCN-NEXT: v_xor_b32_e32 v0, s9, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s9, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -373,7 +373,7 @@ ; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] ; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 65535 ; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16 -; CHECK-NEXT: store i16 [[TMP17]], i16 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i16 [[TMP17]], i16 addrspace(1)* [[OUT:%.*]], align 2 ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_i16: @@ -422,7 +422,7 @@ ; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] ; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 ; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 -; CHECK-NEXT: store i16 [[TMP19]], i16 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i16 [[TMP19]], i16 addrspace(1)* [[OUT:%.*]], align 2 ; CHECK-NEXT: ret void ; ; GCN-LABEL: urem_i16: @@ -475,7 +475,7 @@ ; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 16 ; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 16 ; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 -; CHECK-NEXT: store i16 [[TMP21]], i16 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i16 [[TMP21]], i16 addrspace(1)* [[OUT:%.*]], align 2 ; CHECK-NEXT: ret void ; ; GCN-LABEL: sdiv_i16: @@ -533,7 +533,7 @@ ; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 ; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 ; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 -; CHECK-NEXT: store 
i16 [[TMP23]], i16 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i16 [[TMP23]], i16 addrspace(1)* [[OUT:%.*]], align 2 ; CHECK-NEXT: ret void ; ; GCN-LABEL: srem_i16: @@ -587,7 +587,7 @@ ; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] ; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 255 ; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 -; CHECK-NEXT: store i8 [[TMP17]], i8 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i8 [[TMP17]], i8 addrspace(1)* [[OUT:%.*]], align 1 ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_i8: @@ -634,7 +634,7 @@ ; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] ; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 255 ; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8 -; CHECK-NEXT: store i8 [[TMP19]], i8 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i8 [[TMP19]], i8 addrspace(1)* [[OUT:%.*]], align 1 ; CHECK-NEXT: ret void ; ; GCN-LABEL: urem_i8: @@ -686,7 +686,7 @@ ; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 24 ; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 24 ; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8 -; CHECK-NEXT: store i8 [[TMP21]], i8 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i8 [[TMP21]], i8 addrspace(1)* [[OUT:%.*]], align 1 ; CHECK-NEXT: ret void ; ; GCN-LABEL: sdiv_i8: @@ -744,7 +744,7 @@ ; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 24 ; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 24 ; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8 -; CHECK-NEXT: store i8 [[TMP23]], i8 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i8 [[TMP23]], i8 addrspace(1)* [[OUT:%.*]], align 1 ; CHECK-NEXT: ret void ; ; GCN-LABEL: srem_i8: @@ -942,7 +942,7 @@ ; CHECK-NEXT: [[TMP158:%.*]] = select i1 [[TMP155]], i32 [[TMP156]], i32 [[TMP150]] ; CHECK-NEXT: [[TMP159:%.*]] = select i1 [[TMP154]], i32 [[TMP158]], i32 [[TMP157]] ; CHECK-NEXT: [[TMP160:%.*]] = insertelement <4 x i32> [[TMP120]], i32 [[TMP159]], i64 3 -; CHECK-NEXT: store <4 x i32> [[TMP160]], <4 x i32> addrspace(1)* [[OUT:%.*]] +; 
CHECK-NEXT: store <4 x i32> [[TMP160]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_v4i32: @@ -1214,7 +1214,7 @@ ; CHECK-NEXT: [[TMP158:%.*]] = select i1 [[TMP155]], i32 [[TMP156]], i32 [[TMP152]] ; CHECK-NEXT: [[TMP159:%.*]] = select i1 [[TMP154]], i32 [[TMP158]], i32 [[TMP157]] ; CHECK-NEXT: [[TMP160:%.*]] = insertelement <4 x i32> [[TMP120]], i32 [[TMP159]], i64 3 -; CHECK-NEXT: store <4 x i32> [[TMP160]], <4 x i32> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <4 x i32> [[TMP160]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 ; CHECK-NEXT: ret void ; ; GCN-LABEL: urem_v4i32: @@ -1522,55 +1522,55 @@ ; CHECK-NEXT: [[TMP194:%.*]] = xor i32 [[TMP193]], [[TMP152]] ; CHECK-NEXT: [[TMP195:%.*]] = sub i32 [[TMP194]], [[TMP152]] ; CHECK-NEXT: [[TMP196:%.*]] = insertelement <4 x i32> [[TMP147]], i32 [[TMP195]], i64 3 -; CHECK-NEXT: store <4 x i32> [[TMP196]], <4 x i32> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <4 x i32> [[TMP196]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 ; CHECK-NEXT: ret void ; ; GCN-LABEL: sdiv_v4i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0xd +; GCN-NEXT: s_mov_b32 s20, 0x4f800000 ; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i32 s2, s16, 31 ; GCN-NEXT: s_add_i32 s3, s16, s2 -; GCN-NEXT: s_xor_b32 s5, s3, s2 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GCN-NEXT: s_mov_b32 s16, 0x4f800000 +; GCN-NEXT: s_xor_b32 s3, s3, s2 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GCN-NEXT: s_ashr_i32 s6, s17, 31 ; GCN-NEXT: s_add_i32 s0, s17, s6 +; GCN-NEXT: s_xor_b32 s7, s0, s6 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: s_xor_b32 s17, s0, s6 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, s17 -; GCN-NEXT: s_ashr_i32 s3, s12, 31 -; GCN-NEXT: v_mul_f32_e32 v0, s16, v0 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, s7 +; GCN-NEXT: s_ashr_i32 s4, s12, 31 +; GCN-NEXT: s_add_i32 s5, s12, s4 +; 
GCN-NEXT: v_mul_f32_e32 v0, s20, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: s_add_i32 s4, s12, s3 -; GCN-NEXT: s_xor_b32 s4, s4, s3 -; GCN-NEXT: s_xor_b32 s7, s3, s2 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s5 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s5 -; GCN-NEXT: s_ashr_i32 s12, s13, 31 -; GCN-NEXT: s_add_i32 s13, s13, s12 +; GCN-NEXT: s_xor_b32 s5, s5, s4 +; GCN-NEXT: s_xor_b32 s12, s4, s2 +; GCN-NEXT: s_ashr_i32 s16, s13, 31 +; GCN-NEXT: v_mul_lo_u32 v1, v0, s3 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s3 +; GCN-NEXT: s_add_i32 s13, s13, s16 +; GCN-NEXT: s_xor_b32 s13, s13, s16 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] ; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v3 -; GCN-NEXT: s_xor_b32 s13, s13, s12 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v1, v0 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s4 -; GCN-NEXT: v_mul_f32_e32 v1, s16, v2 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s5 +; GCN-NEXT: v_mul_f32_e32 v1, s20, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s5 +; GCN-NEXT: v_mul_lo_u32 v2, v0, s3 ; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 -; GCN-NEXT: v_mul_hi_u32 v5, v1, s17 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s4, v2 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v1, s17 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s4, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, s7 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, s5, v2 +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s3, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, s7 +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s5, v2 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v4 @@ -1584,55 +1584,55 @@ ; GCN-NEXT: s_ashr_i32 s5, s18, 31 ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] ; GCN-NEXT: s_add_i32 s0, s18, s5 -; GCN-NEXT: s_xor_b32 s4, s12, s6 -; GCN-NEXT: 
s_xor_b32 s12, s0, s5 -; GCN-NEXT: v_cvt_f32_u32_e32 v4, s12 +; GCN-NEXT: s_xor_b32 s4, s16, s6 +; GCN-NEXT: s_xor_b32 s6, s0, s5 +; GCN-NEXT: v_cvt_f32_u32_e32 v4, s6 ; GCN-NEXT: v_mul_hi_u32 v1, v1, s13 -; GCN-NEXT: v_xor_b32_e32 v0, s7, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s7, v0 +; GCN-NEXT: v_xor_b32_e32 v0, s12, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GCN-NEXT: v_mul_lo_u32 v2, v1, s17 -; GCN-NEXT: s_ashr_i32 s6, s19, 31 -; GCN-NEXT: v_mul_f32_e32 v4, s16, v4 +; GCN-NEXT: v_mul_lo_u32 v2, v1, s7 +; GCN-NEXT: v_mul_f32_e32 v4, s20, v4 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, s13, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v3 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v3 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s13, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v1 ; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[2:3] -; GCN-NEXT: v_mul_lo_u32 v2, v4, s12 -; GCN-NEXT: v_mul_hi_u32 v3, v4, s12 +; GCN-NEXT: v_mul_lo_u32 v2, v4, s6 +; GCN-NEXT: v_mul_hi_u32 v3, v4, s6 +; GCN-NEXT: s_ashr_i32 s7, s19, 31 ; GCN-NEXT: s_ashr_i32 s2, s14, 31 -; GCN-NEXT: s_add_i32 s3, s14, s2 ; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GCN-NEXT: v_mul_hi_u32 v2, v2, v4 +; GCN-NEXT: s_add_i32 s3, s14, s2 ; GCN-NEXT: s_xor_b32 s3, s3, s2 ; GCN-NEXT: v_xor_b32_e32 v1, s4, v1 -; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s4, v1 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v2, v4 ; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: s_add_i32 s0, s19, s6 -; GCN-NEXT: s_xor_b32 s14, s0, s6 -; GCN-NEXT: v_cvt_f32_u32_e32 v4, s14 +; GCN-NEXT: s_add_i32 s0, s19, s7 +; GCN-NEXT: s_xor_b32 s12, s0, s7 +; GCN-NEXT: v_cvt_f32_u32_e32 v4, s12 ; GCN-NEXT: v_mul_hi_u32 v2, 
v2, s3 -; GCN-NEXT: s_xor_b32 s7, s2, s5 +; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s4, v1 +; GCN-NEXT: s_xor_b32 s13, s2, s5 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GCN-NEXT: v_mul_lo_u32 v3, v2, s12 -; GCN-NEXT: v_mul_f32_e32 v4, s16, v4 +; GCN-NEXT: v_mul_lo_u32 v3, v2, s6 +; GCN-NEXT: v_mul_f32_e32 v4, s20, v4 ; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GCN-NEXT: v_sub_i32_e32 v5, vcc, s3, v3 -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v5 -; GCN-NEXT: s_ashr_i32 s12, s15, 31 -; GCN-NEXT: v_mul_lo_u32 v6, v4, s14 -; GCN-NEXT: v_mul_hi_u32 v7, v4, s14 -; GCN-NEXT: s_add_i32 s13, s15, s12 -; GCN-NEXT: s_xor_b32 s13, s13, s12 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v5 +; GCN-NEXT: s_ashr_i32 s6, s15, 31 +; GCN-NEXT: v_mul_lo_u32 v6, v4, s12 +; GCN-NEXT: v_mul_hi_u32 v7, v4, s12 +; GCN-NEXT: s_add_i32 s14, s15, s6 +; GCN-NEXT: s_xor_b32 s14, s14, s6 ; GCN-NEXT: v_sub_i32_e32 v8, vcc, 0, v6 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v7 ; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[4:5] @@ -1643,17 +1643,17 @@ ; GCN-NEXT: v_add_i32_e32 v7, vcc, v6, v4 ; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v6, v4 ; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] -; GCN-NEXT: v_mul_hi_u32 v4, v4, s13 +; GCN-NEXT: v_mul_hi_u32 v4, v4, s14 ; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[2:3] -; GCN-NEXT: v_mul_lo_u32 v3, v4, s14 -; GCN-NEXT: v_xor_b32_e32 v2, s7, v2 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: s_xor_b32 s4, s12, s6 -; GCN-NEXT: v_sub_i32_e32 v5, vcc, s13, v3 -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v5 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s13, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v4, s12 +; GCN-NEXT: v_xor_b32_e32 v2, s13, v2 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s13, v2 +; GCN-NEXT: s_xor_b32 s4, s6, s7 +; GCN-NEXT: v_sub_i32_e32 v5, vcc, s14, v3 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v5 +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s14, v3 ; GCN-NEXT: v_add_i32_e32 v5, vcc, -1, v4 ; 
GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v4 ; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] @@ -1862,55 +1862,55 @@ ; CHECK-NEXT: [[TMP190:%.*]] = xor i32 [[TMP189]], [[TMP147]] ; CHECK-NEXT: [[TMP191:%.*]] = sub i32 [[TMP190]], [[TMP147]] ; CHECK-NEXT: [[TMP192:%.*]] = insertelement <4 x i32> [[TMP144]], i32 [[TMP191]], i64 3 -; CHECK-NEXT: store <4 x i32> [[TMP192]], <4 x i32> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <4 x i32> [[TMP192]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 ; CHECK-NEXT: ret void ; ; GCN-LABEL: srem_v4i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0xd +; GCN-NEXT: s_mov_b32 s20, 0x4f800000 ; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i32 s2, s16, 31 ; GCN-NEXT: s_add_i32 s3, s16, s2 -; GCN-NEXT: s_xor_b32 s5, s3, s2 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GCN-NEXT: s_mov_b32 s16, 0x4f800000 +; GCN-NEXT: s_xor_b32 s4, s3, s2 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-NEXT: s_ashr_i32 s6, s12, 31 +; GCN-NEXT: s_add_i32 s0, s12, s6 ; GCN-NEXT: s_ashr_i32 s2, s17, 31 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: s_add_i32 s0, s12, s6 ; GCN-NEXT: s_add_i32 s3, s17, s2 -; GCN-NEXT: s_xor_b32 s4, s0, s6 -; GCN-NEXT: v_mul_f32_e32 v0, s16, v0 +; GCN-NEXT: s_xor_b32 s5, s0, s6 +; GCN-NEXT: s_xor_b32 s7, s3, s2 +; GCN-NEXT: v_mul_f32_e32 v0, s20, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: s_xor_b32 s17, s3, s2 -; GCN-NEXT: s_ashr_i32 s7, s13, 31 -; GCN-NEXT: s_add_i32 s12, s13, s7 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s5 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s5 -; GCN-NEXT: s_xor_b32 s12, s12, s7 +; GCN-NEXT: s_ashr_i32 s12, s13, 31 +; GCN-NEXT: s_add_i32 s13, s13, s12 +; GCN-NEXT: s_xor_b32 s13, s13, s12 +; GCN-NEXT: v_mul_lo_u32 v1, v0, s4 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s4 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, 
s[0:1] ; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s17 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s7 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v1, v0 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v2 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s4 -; GCN-NEXT: v_mul_f32_e32 v1, s16, v1 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s5 +; GCN-NEXT: v_mul_f32_e32 v1, s20, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s5 -; GCN-NEXT: v_mul_lo_u32 v4, v1, s17 -; GCN-NEXT: v_mul_hi_u32 v5, v1, s17 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s4, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s4, v0 -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s5, v2 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s5, v2 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, s7 +; GCN-NEXT: v_mul_hi_u32 v5, v1, s7 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s5, v0 +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s5, v0 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s4, v2 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s4, v2 ; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v4 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 ; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5] @@ -1920,63 +1920,63 @@ ; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] ; GCN-NEXT: s_ashr_i32 s0, s18, 31 ; GCN-NEXT: s_add_i32 s1, s18, s0 -; GCN-NEXT: s_xor_b32 s13, s1, s0 +; GCN-NEXT: s_xor_b32 s16, s1, s0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s13 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s16 ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5] -; GCN-NEXT: v_mul_hi_u32 v1, v1, s12 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s13 ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[2:3] ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-NEXT: v_xor_b32_e32 v0, s6, v0 -; GCN-NEXT: v_mul_lo_u32 v1, v1, s17 +; GCN-NEXT: v_mul_lo_u32 v1, v1, s7 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: 
v_mul_f32_e32 v2, s16, v2 +; GCN-NEXT: v_mul_f32_e32 v2, s20, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, s12, v1 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s12, v1 -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v3 -; GCN-NEXT: v_mul_lo_u32 v5, v2, s13 -; GCN-NEXT: v_mul_hi_u32 v6, v2, s13 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s17, v3 -; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s17, v3 +; GCN-NEXT: v_sub_i32_e32 v3, vcc, s13, v1 +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s13, v1 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v3 +; GCN-NEXT: v_mul_lo_u32 v5, v2, s16 +; GCN-NEXT: v_mul_hi_u32 v6, v2, s16 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v3 +; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s7, v3 ; GCN-NEXT: v_sub_i32_e32 v7, vcc, 0, v5 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6 ; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] ; GCN-NEXT: v_mul_hi_u32 v5, v5, v2 ; GCN-NEXT: s_ashr_i32 s6, s14, 31 -; GCN-NEXT: s_add_i32 s12, s14, s6 -; GCN-NEXT: s_xor_b32 s12, s12, s6 +; GCN-NEXT: s_add_i32 s7, s14, s6 +; GCN-NEXT: s_xor_b32 s7, s7, s6 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v5, v2 ; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v5, v2 ; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] ; GCN-NEXT: s_ashr_i32 s0, s19, 31 ; GCN-NEXT: s_add_i32 s1, s19, s0 -; GCN-NEXT: s_xor_b32 s14, s1, s0 +; GCN-NEXT: s_xor_b32 s13, s1, s0 ; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GCN-NEXT: v_cvt_f32_u32_e32 v3, s14 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, s13 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] -; GCN-NEXT: v_mul_hi_u32 v2, v2, s12 +; GCN-NEXT: v_mul_hi_u32 v2, v2, s7 ; GCN-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[2:3] ; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GCN-NEXT: v_xor_b32_e32 v1, s7, v1 -; GCN-NEXT: v_mul_lo_u32 v2, v2, s13 -; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_mul_f32_e32 v3, s16, v3 +; GCN-NEXT: v_xor_b32_e32 v1, s12, v1 +; GCN-NEXT: v_mul_lo_u32 v2, v2, s16 +; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s12, v1 +; GCN-NEXT: v_mul_f32_e32 v3, s20, v3 ; GCN-NEXT: 
v_cvt_u32_f32_e32 v3, v3 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, s7, v2 +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s7, v2 ; GCN-NEXT: s_ashr_i32 s7, s15, 31 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s12, v2 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s12, v2 -; GCN-NEXT: v_mul_lo_u32 v6, v3, s14 -; GCN-NEXT: v_mul_hi_u32 v7, v3, s14 +; GCN-NEXT: v_mul_lo_u32 v6, v3, s13 +; GCN-NEXT: v_mul_hi_u32 v7, v3, s13 ; GCN-NEXT: s_add_i32 s12, s15, s7 ; GCN-NEXT: s_xor_b32 s12, s12, s7 ; GCN-NEXT: v_sub_i32_e32 v8, vcc, 0, v6 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v7 ; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[4:5] ; GCN-NEXT: v_mul_hi_u32 v6, v6, v3 -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s13, v4 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s13, v4 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s16, v4 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s16, v4 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v6, v3 ; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v6, v3 ; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[4:5] @@ -1984,14 +1984,14 @@ ; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] ; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[2:3] -; GCN-NEXT: v_mul_lo_u32 v3, v3, s14 +; GCN-NEXT: v_mul_lo_u32 v3, v3, s13 ; GCN-NEXT: v_xor_b32_e32 v2, s6, v2 ; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s6, v2 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, s12, v3 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s12, v3 -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s14, v4 -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s14, v4 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s13, v4 +; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s13, v4 ; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] ; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GCN-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[2:3] @@ -2086,7 +2086,7 @@ ; CHECK-NEXT: [[TMP78:%.*]] = and i32 [[TMP77]], 65535 ; CHECK-NEXT: [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16 ; 
CHECK-NEXT: [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3 -; CHECK-NEXT: store <4 x i16> [[TMP80]], <4 x i16> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <4 x i16> [[TMP80]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_v4i16: @@ -2244,7 +2244,7 @@ ; CHECK-NEXT: [[TMP86:%.*]] = and i32 [[TMP85]], 65535 ; CHECK-NEXT: [[TMP87:%.*]] = trunc i32 [[TMP86]] to i16 ; CHECK-NEXT: [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3 -; CHECK-NEXT: store <4 x i16> [[TMP88]], <4 x i16> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <4 x i16> [[TMP88]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GCN-LABEL: urem_v4i16: @@ -2418,7 +2418,7 @@ ; CHECK-NEXT: [[TMP94:%.*]] = ashr i32 [[TMP93]], 16 ; CHECK-NEXT: [[TMP95:%.*]] = trunc i32 [[TMP94]] to i16 ; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3 -; CHECK-NEXT: store <4 x i16> [[TMP96]], <4 x i16> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <4 x i16> [[TMP96]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GCN-LABEL: sdiv_v4i16: @@ -2612,7 +2612,7 @@ ; CHECK-NEXT: [[TMP102:%.*]] = ashr i32 [[TMP101]], 16 ; CHECK-NEXT: [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16 ; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3 -; CHECK-NEXT: store <4 x i16> [[TMP104]], <4 x i16> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <4 x i16> [[TMP104]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GCN-LABEL: srem_v4i16: @@ -2727,7 +2727,7 @@ ; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] ; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 7 ; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3 -; CHECK-NEXT: store i3 [[TMP17]], i3 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i3 [[TMP17]], i3 addrspace(1)* [[OUT:%.*]], align 1 ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_i3: @@ -2777,7 +2777,7 
@@ ; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] ; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 7 ; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3 -; CHECK-NEXT: store i3 [[TMP19]], i3 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i3 [[TMP19]], i3 addrspace(1)* [[OUT:%.*]], align 1 ; CHECK-NEXT: ret void ; ; GCN-LABEL: urem_i3: @@ -2832,7 +2832,7 @@ ; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 29 ; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 29 ; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3 -; CHECK-NEXT: store i3 [[TMP21]], i3 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i3 [[TMP21]], i3 addrspace(1)* [[OUT:%.*]], align 1 ; CHECK-NEXT: ret void ; ; GCN-LABEL: sdiv_i3: @@ -2891,7 +2891,7 @@ ; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 29 ; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 29 ; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3 -; CHECK-NEXT: store i3 [[TMP23]], i3 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i3 [[TMP23]], i3 addrspace(1)* [[OUT:%.*]], align 1 ; CHECK-NEXT: ret void ; ; GCN-LABEL: srem_i3: @@ -2990,7 +2990,7 @@ ; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535 ; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2 -; CHECK-NEXT: store <3 x i16> [[TMP60]], <3 x i16> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <3 x i16> [[TMP60]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_v3i16: @@ -3114,7 +3114,7 @@ ; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 ; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 ; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i16> [[TMP44]], i16 [[TMP65]], i64 2 -; CHECK-NEXT: store <3 x i16> [[TMP66]], <3 x i16> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <3 x i16> [[TMP66]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GCN-LABEL: urem_v3i16: @@ -3254,7 +3254,7 @@ ; CHECK-NEXT: 
[[TMP70:%.*]] = ashr i32 [[TMP69]], 16 ; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 ; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2 -; CHECK-NEXT: store <3 x i16> [[TMP72]], <3 x i16> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <3 x i16> [[TMP72]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GCN-LABEL: sdiv_v3i16: @@ -3404,7 +3404,7 @@ ; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16 ; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16 ; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2 -; CHECK-NEXT: store <3 x i16> [[TMP78]], <3 x i16> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <3 x i16> [[TMP78]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GCN-LABEL: srem_v3i16: @@ -3545,7 +3545,7 @@ ; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 32767 ; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2 -; CHECK-NEXT: store <3 x i15> [[TMP60]], <3 x i15> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <3 x i15> [[TMP60]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_v3i15: @@ -3677,7 +3677,7 @@ ; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 32767 ; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15 ; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2 -; CHECK-NEXT: store <3 x i15> [[TMP66]], <3 x i15> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <3 x i15> [[TMP66]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GCN-LABEL: urem_v3i15: @@ -3823,7 +3823,7 @@ ; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 17 ; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15 ; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2 -; CHECK-NEXT: store <3 x i15> [[TMP72]], <3 x i15> addrspace(1)* [[OUT:%.*]] +; 
CHECK-NEXT: store <3 x i15> [[TMP72]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GCN-LABEL: sdiv_v3i15: @@ -3981,7 +3981,7 @@ ; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 17 ; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15 ; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2 -; CHECK-NEXT: store <3 x i15> [[TMP78]], <3 x i15> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <3 x i15> [[TMP78]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GCN-LABEL: srem_v3i15: @@ -4076,7 +4076,7 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { ; CHECK-LABEL: @udiv_i32_oddk_denom( ; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 1235195 -; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_i32_oddk_denom: @@ -4102,7 +4102,7 @@ define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { ; CHECK-LABEL: @udiv_i32_pow2k_denom( ; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 4096 -; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_i32_pow2k_denom: @@ -4125,7 +4125,7 @@ ; CHECK-LABEL: @udiv_i32_pow2_shl_denom( ; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]] -; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_i32_pow2_shl_denom: @@ -4154,7 +4154,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 ; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 -; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* 
[[OUT:%.*]] +; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_v2i32_pow2k_denom: @@ -4183,7 +4183,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 ; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 -; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_v2i32_mixed_pow2k_denom: @@ -4291,7 +4291,7 @@ ; CHECK-NEXT: [[TMP78:%.*]] = select i1 [[TMP75]], i32 [[TMP76]], i32 [[TMP70]] ; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP74]], i32 [[TMP78]], i32 [[TMP77]] ; CHECK-NEXT: [[TMP80:%.*]] = insertelement <2 x i32> [[TMP40]], i32 [[TMP79]], i64 1 -; CHECK-NEXT: store <2 x i32> [[TMP80]], <2 x i32> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <2 x i32> [[TMP80]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_v2i32_pow2_shl_denom: @@ -4364,7 +4364,7 @@ define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { ; CHECK-LABEL: @urem_i32_oddk_denom( ; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 1235195 -; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: urem_i32_oddk_denom: @@ -4392,7 +4392,7 @@ define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { ; CHECK-LABEL: @urem_i32_pow2k_denom( ; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 4096 -; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: urem_i32_pow2k_denom: @@ -4415,7 +4415,7 @@ ; CHECK-LABEL: @urem_i32_pow2_shl_denom( ; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, 
[[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]] -; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: urem_i32_pow2_shl_denom: @@ -4445,7 +4445,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 ; CHECK-NEXT: [[TMP5:%.*]] = urem i32 [[TMP4]], 4096 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 -; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GCN-LABEL: urem_v2i32_pow2k_denom: @@ -4550,7 +4550,7 @@ ; CHECK-NEXT: [[TMP78:%.*]] = select i1 [[TMP75]], i32 [[TMP76]], i32 [[TMP72]] ; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP74]], i32 [[TMP78]], i32 [[TMP77]] ; CHECK-NEXT: [[TMP80:%.*]] = insertelement <2 x i32> [[TMP40]], i32 [[TMP79]], i64 1 -; CHECK-NEXT: store <2 x i32> [[TMP80]], <2 x i32> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <2 x i32> [[TMP80]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GCN-LABEL: urem_v2i32_pow2_shl_denom: @@ -4623,7 +4623,7 @@ define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { ; CHECK-LABEL: @sdiv_i32_oddk_denom( ; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195 -; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: sdiv_i32_oddk_denom: @@ -4649,7 +4649,7 @@ define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { ; CHECK-LABEL: @sdiv_i32_pow2k_denom( ; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 4096 -; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: sdiv_i32_pow2k_denom: @@ -4675,7 
+4675,7 @@ ; CHECK-LABEL: @sdiv_i32_pow2_shl_denom( ; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]] -; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: sdiv_i32_pow2_shl_denom: @@ -4688,17 +4688,17 @@ ; GCN-NEXT: s_lshl_b32 s3, 0x1000, s3 ; GCN-NEXT: s_ashr_i32 s8, s3, 31 ; GCN-NEXT: s_add_i32 s3, s3, s8 -; GCN-NEXT: s_xor_b32 s9, s3, s8 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-NEXT: s_add_i32 s2, s2, s3 -; GCN-NEXT: s_xor_b32 s2, s2, s3 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: s_xor_b32 s3, s3, s8 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GCN-NEXT: s_ashr_i32 s9, s2, 31 +; GCN-NEXT: s_add_i32 s2, s2, s9 +; GCN-NEXT: s_xor_b32 s2, s2, s9 +; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: s_xor_b32 s8, s9, s8 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s9 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s9 +; GCN-NEXT: v_mul_lo_u32 v1, v0, s3 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s3 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] @@ -4707,17 +4707,17 @@ ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GCN-NEXT: v_mul_hi_u32 v0, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s9 +; GCN-NEXT: v_mul_lo_u32 v1, v0, s3 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s2, v1 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 -; GCN-NEXT: s_and_b64 vcc, vcc, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] -; GCN-NEXT: v_xor_b32_e32 v0, s3, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 +; GCN-NEXT: 
v_sub_i32_e32 v4, vcc, s2, v1 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, s2, v1 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4 +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GCN-NEXT: v_xor_b32_e32 v0, s8, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm %shl.y = shl i32 4096, %y @@ -4734,7 +4734,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 ; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 -; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GCN-LABEL: sdiv_v2i32_pow2k_denom: @@ -4769,7 +4769,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 ; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 -; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GCN-LABEL: ssdiv_v2i32_mixed_pow2k_denom: @@ -4898,7 +4898,7 @@ ; CHECK-NEXT: [[TMP96:%.*]] = xor i32 [[TMP95]], [[TMP54]] ; CHECK-NEXT: [[TMP97:%.*]] = sub i32 [[TMP96]], [[TMP54]] ; CHECK-NEXT: [[TMP98:%.*]] = insertelement <2 x i32> [[TMP49]], i32 [[TMP97]], i64 1 -; CHECK-NEXT: store <2 x i32> [[TMP98]], <2 x i32> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <2 x i32> [[TMP98]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GCN-LABEL: sdiv_v2i32_pow2_shl_denom: @@ -4913,48 +4913,48 @@ ; GCN-NEXT: s_lshl_b32 s2, s4, s2 ; GCN-NEXT: s_ashr_i32 s5, s2, 31 ; GCN-NEXT: s_add_i32 s2, s2, s5 -; GCN-NEXT: s_xor_b32 s13, s2, s5 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s13 
-; GCN-NEXT: s_ashr_i32 s2, s6, 31 +; GCN-NEXT: s_xor_b32 s2, s2, s5 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-NEXT: s_lshl_b32 s0, s4, s3 -; GCN-NEXT: s_add_i32 s1, s6, s2 +; GCN-NEXT: s_ashr_i32 s3, s6, 31 +; GCN-NEXT: s_add_i32 s1, s6, s3 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: s_ashr_i32 s6, s0, 31 -; GCN-NEXT: s_add_i32 s4, s0, s6 -; GCN-NEXT: s_xor_b32 s3, s1, s2 +; GCN-NEXT: s_add_i32 s10, s0, s6 +; GCN-NEXT: s_xor_b32 s4, s1, s3 ; GCN-NEXT: v_mul_f32_e32 v0, s14, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: s_xor_b32 s15, s4, s6 -; GCN-NEXT: s_xor_b32 s12, s2, s5 +; GCN-NEXT: s_xor_b32 s12, s10, s6 +; GCN-NEXT: s_xor_b32 s13, s3, s5 ; GCN-NEXT: s_mov_b32 s10, -1 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s13 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s13 +; GCN-NEXT: v_mul_lo_u32 v1, v0, s2 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s2 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s15 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s12 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v1, v0 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v2 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s3 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s4 ; GCN-NEXT: v_mul_f32_e32 v1, s14, v1 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s13 +; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v1, s15 -; GCN-NEXT: v_mul_hi_u32 v5, v1, s15 -; GCN-NEXT: s_ashr_i32 s13, s7, 31 -; GCN-NEXT: s_add_i32 s7, s7, s13 +; GCN-NEXT: s_ashr_i32 s14, s7, 31 +; GCN-NEXT: s_add_i32 s7, s7, s14 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, s4, v2 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, s12 +; GCN-NEXT: v_mul_hi_u32 
v5, v1, s12 +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s4, v2 +; GCN-NEXT: s_xor_b32 s7, s7, s14 ; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v4 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 ; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5] ; GCN-NEXT: v_mul_hi_u32 v4, v4, v1 -; GCN-NEXT: s_xor_b32 s7, s7, s13 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s3, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v4, v1 ; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v4, v1 @@ -4963,12 +4963,12 @@ ; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[2:3] -; GCN-NEXT: v_mul_lo_u32 v2, v1, s15 -; GCN-NEXT: v_xor_b32_e32 v0, s12, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 -; GCN-NEXT: s_xor_b32 s4, s13, s6 +; GCN-NEXT: v_mul_lo_u32 v2, v1, s12 +; GCN-NEXT: v_xor_b32_e32 v0, s13, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s13, v0 +; GCN-NEXT: s_xor_b32 s4, s14, s6 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, s7, v2 -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v3 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v3 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s7, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v1 @@ -4988,7 +4988,7 @@ define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { ; CHECK-LABEL: @srem_i32_oddk_denom( ; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 1235195 -; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: srem_i32_oddk_denom: @@ -5016,7 +5016,7 @@ define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { ; CHECK-LABEL: @srem_i32_pow2k_denom( ; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 4096 -; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; 
GCN-LABEL: srem_i32_pow2k_denom: @@ -5043,7 +5043,7 @@ ; CHECK-LABEL: @srem_i32_pow2_shl_denom( ; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]] -; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: srem_i32_pow2_shl_denom: @@ -5055,17 +5055,17 @@ ; GCN-NEXT: s_lshl_b32 s2, 0x1000, s5 ; GCN-NEXT: s_ashr_i32 s3, s2, 31 ; GCN-NEXT: s_add_i32 s2, s2, s3 -; GCN-NEXT: s_xor_b32 s10, s2, s3 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 -; GCN-NEXT: s_ashr_i32 s8, s4, 31 -; GCN-NEXT: s_add_i32 s4, s4, s8 -; GCN-NEXT: s_xor_b32 s9, s4, s8 +; GCN-NEXT: s_xor_b32 s8, s2, s3 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GCN-NEXT: s_ashr_i32 s9, s4, 31 +; GCN-NEXT: s_add_i32 s4, s4, s9 +; GCN-NEXT: s_xor_b32 s10, s4, s9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s10 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s10 +; GCN-NEXT: v_mul_lo_u32 v1, v0, s8 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s8 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[2:3] @@ -5073,18 +5073,18 @@ ; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s9 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s10 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s9, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s9, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s10, v1 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s10, v1 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s10, v1 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s10 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s8 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, s10, v0 +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s10, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s8, 
v1 +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v1 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s8, v1 ; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] -; GCN-NEXT: v_xor_b32_e32 v0, s8, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 +; GCN-NEXT: v_xor_b32_e32 v0, s9, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s9, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -5102,7 +5102,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 ; CHECK-NEXT: [[TMP5:%.*]] = srem i32 [[TMP4]], 4096 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 -; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GCN-LABEL: srem_v2i32_pow2k_denom: @@ -5231,7 +5231,7 @@ ; CHECK-NEXT: [[TMP94:%.*]] = xor i32 [[TMP93]], [[TMP51]] ; CHECK-NEXT: [[TMP95:%.*]] = sub i32 [[TMP94]], [[TMP51]] ; CHECK-NEXT: [[TMP96:%.*]] = insertelement <2 x i32> [[TMP48]], i32 [[TMP95]], i64 1 -; CHECK-NEXT: store <2 x i32> [[TMP96]], <2 x i32> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <2 x i32> [[TMP96]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; ; GCN-LABEL: srem_v2i32_pow2_shl_denom: @@ -5240,54 +5240,54 @@ ; GCN-NEXT: s_movk_i32 s4, 0x1000 ; GCN-NEXT: s_mov_b32 s14, 0x4f800000 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s2, s4, s2 ; GCN-NEXT: s_ashr_i32 s5, s2, 31 ; GCN-NEXT: s_add_i32 s2, s2, s5 -; GCN-NEXT: s_xor_b32 s13, s2, s5 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s13 +; GCN-NEXT: s_xor_b32 s5, s2, s5 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GCN-NEXT: s_lshl_b32 s2, s4, s3 ; GCN-NEXT: s_ashr_i32 s12, s6, 31 ; GCN-NEXT: 
s_add_i32 s3, s6, s12 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: s_ashr_i32 s4, s2, 31 -; GCN-NEXT: s_add_i32 s6, s2, s4 -; GCN-NEXT: s_xor_b32 s5, s3, s12 +; GCN-NEXT: s_add_i32 s8, s2, s4 +; GCN-NEXT: s_xor_b32 s6, s3, s12 ; GCN-NEXT: v_mul_f32_e32 v0, s14, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: s_xor_b32 s15, s6, s4 -; GCN-NEXT: s_ashr_i32 s6, s7, 31 -; GCN-NEXT: s_add_i32 s7, s7, s6 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s13 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s13 -; GCN-NEXT: s_xor_b32 s7, s7, s6 -; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_xor_b32 s13, s8, s4 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: v_mul_lo_u32 v1, v0, s5 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s5 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[2:3] ; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s15 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s13 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v1, v0 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v2 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[2:3] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s5 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s6 ; GCN-NEXT: v_mul_f32_e32 v1, s14, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s13 -; GCN-NEXT: v_mul_lo_u32 v4, v1, s15 -; GCN-NEXT: v_mul_hi_u32 v5, v1, s15 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s5, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s5, v0 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s5 +; GCN-NEXT: v_mul_lo_u32 v4, v1, s13 +; GCN-NEXT: v_mul_hi_u32 v5, v1, s13 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s6, v0 +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s6, v0 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s5, v2 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s5, v2 ; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v4 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 ; GCN-NEXT: v_cndmask_b32_e64 
v4, v4, v6, s[4:5] ; GCN-NEXT: v_mul_hi_u32 v4, v4, v1 -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s13, v2 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s13, v2 +; GCN-NEXT: s_ashr_i32 s6, s7, 31 +; GCN-NEXT: s_add_i32 s7, s7, s6 +; GCN-NEXT: s_xor_b32 s7, s7, s6 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v4, v1 ; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v4, v1 ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5] @@ -5295,19 +5295,20 @@ ; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[2:3] -; GCN-NEXT: v_mul_lo_u32 v1, v1, s15 +; GCN-NEXT: v_mul_lo_u32 v1, v1, s13 ; GCN-NEXT: v_xor_b32_e32 v0, s12, v0 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s7, v1 -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s15, v2 -; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s15, v2 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s13, v2 +; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s13, v2 ; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[2:3] ; GCN-NEXT: v_xor_b32_e32 v1, s6, v1 ; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s6, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GCN-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -5319,7 +5320,7 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; CHECK-LABEL: @udiv_i64_oddk_denom( ; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943 -; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_i64_oddk_denom: @@ -5353,7 +5354,7 @@ ; GCN-NEXT: v_mul_hi_u32 v6, v0, v3 ; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 ; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; 
GCN-NEXT: s_movk_i32 s4, 0x11e +; GCN-NEXT: s_mov_b32 s4, 0x976a7376 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 @@ -5369,7 +5370,7 @@ ; GCN-NEXT: v_mul_hi_u32 v5, v0, s3 ; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] ; GCN-NEXT: v_mul_lo_u32 v6, v2, s3 -; GCN-NEXT: s_mov_b32 s2, 0x976a7377 +; GCN-NEXT: s_movk_i32 s2, 0x11f ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, s3 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 @@ -5377,7 +5378,7 @@ ; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v9, v0, v5 ; GCN-NEXT: v_mul_hi_u32 v11, v2, v4 -; GCN-NEXT: s_movk_i32 s3, 0x11f +; GCN-NEXT: s_mov_b32 s3, 0x976a7377 ; GCN-NEXT: s_mov_b32 s9, s5 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc @@ -5407,24 +5408,24 @@ ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v0, s3 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v4, v1, s2 -; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s3 +; GCN-NEXT: v_mul_lo_u32 v4, v1, s3 +; GCN-NEXT: v_mov_b32_e32 v5, s2 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_mul_lo_u32 v3, v0, s2 +; GCN-NEXT: v_mul_lo_u32 v3, v0, s3 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, s7, v2 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 ; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 +; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s3, v3 ; GCN-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s4, v4 -; GCN-NEXT: s_mov_b32 s2, 0x976a7376 +; GCN-NEXT: s_movk_i32 s3, 0x11e +; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s3, v4 ; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s2, v5 
+; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s4, v5 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 +; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v4 ; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GCN-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 ; GCN-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] @@ -5434,11 +5435,11 @@ ; GCN-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v6, s7 ; GCN-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2 +; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s3, v2 ; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s2, v3 +; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s4, v3 ; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] @@ -5454,7 +5455,7 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { ; CHECK-LABEL: @udiv_i64_pow2k_denom( ; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 4096 -; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_i64_pow2k_denom: @@ -5479,7 +5480,7 @@ ; CHECK-LABEL: @udiv_i64_pow2_shl_denom( ; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]] -; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_i64_pow2_shl_denom: @@ -5511,7 +5512,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 ; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 -; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> 
addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_v2i64_pow2k_denom: @@ -5542,7 +5543,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 ; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 -; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_v2i64_mixed_pow2k_denom: @@ -5550,9 +5551,9 @@ ; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 ; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_movk_i32 s6, 0xf001 +; GCN-NEXT: s_movk_i32 s2, 0xf001 +; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 @@ -5561,96 +5562,96 @@ ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-NEXT: s_movk_i32 s0, 0xfff -; GCN-NEXT: v_mul_hi_u32 v3, v0, s6 -; GCN-NEXT: v_mul_lo_u32 v5, v1, s6 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s6 ; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v0, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s2 +; GCN-NEXT: v_mul_lo_u32 v3, v1, s2 +; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v2 +; GCN-NEXT: 
v_mul_hi_u32 v9, v1, v2 +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v7, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v8, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 ; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e64 v0, s[2:3], v0, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc -; GCN-NEXT: v_mul_hi_u32 v5, v0, s6 -; GCN-NEXT: v_addc_u32_e64 v3, vcc, v1, v4, s[2:3] -; GCN-NEXT: v_mul_lo_u32 v6, v3, s6 -; GCN-NEXT: v_mul_lo_u32 v8, v0, s6 -; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v0, v5 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v8 -; GCN-NEXT: v_mul_hi_u32 v10, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v11, v3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v7, v10, vcc -; GCN-NEXT: v_mul_lo_u32 v10, v3, v8 -; GCN-NEXT: v_mul_hi_u32 v8, v3, v8 -; GCN-NEXT: v_mul_lo_u32 v3, v3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v9, v8, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc +; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc +; GCN-NEXT: v_mul_hi_u32 v4, v0, s2 +; GCN-NEXT: v_addc_u32_e64 v2, 
vcc, v1, v3, s[0:1] +; GCN-NEXT: v_mul_lo_u32 v5, v2, s2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, s2 +; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v3, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v4, s10, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v6, s11, v1 +; GCN-NEXT: s_lshr_b64 s[2:3], s[8:9], 12 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v9, v0, v6 +; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v11, v2, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc +; GCN-NEXT: v_mul_lo_u32 v10, v2, v6 +; GCN-NEXT: v_mul_hi_u32 v6, v2, v6 +; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s10, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s11, v1 ; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 ; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 -; GCN-NEXT: s_lshr_b64 s[2:3], s[8:9], 12 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v6, v2, vcc +; GCN-NEXT: s_movk_i32 s0, 0xfff +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc +; GCN-NEXT: 
v_addc_u32_e32 v2, vcc, v5, v7, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v7, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, s0 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s0 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s10, v4 -; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s0, v4 -; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc +; GCN-NEXT: v_mul_lo_u32 v4, v1, s0 +; GCN-NEXT: v_mul_hi_u32 v5, v0, s0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 2, v0 +; GCN-NEXT: v_mul_lo_u32 v8, v0, s0 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v0 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NEXT: v_sub_i32_e32 v8, vcc, s10, v8 +; GCN-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc +; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s0, v8 +; GCN-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc ; GCN-NEXT: s_movk_i32 s0, 0xffe -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v4 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 +; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; GCN-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 
+; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GCN-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc ; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -5672,7 +5673,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 ; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 -; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 ; CHECK-NEXT: ret void ; ; GCN-LABEL: udiv_v2i64_pow2_shl_denom: @@ -5702,7 +5703,7 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; CHECK-LABEL: @urem_i64_oddk_denom( ; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993 -; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: urem_i64_oddk_denom: @@ -5725,8 +5726,9 @@ ; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 ; GCN-NEXT: v_mul_hi_u32 v3, v0, s3 ; GCN-NEXT: v_mul_lo_u32 v4, v1, s3 -; GCN-NEXT: s_movk_i32 s12, 0x11f -; GCN-NEXT: s_mov_b32 s13, 0x9761f7c9 +; GCN-NEXT: s_mov_b32 s12, 0x9761f7c9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s4 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_mul_lo_u32 v3, v0, s3 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 @@ -5735,13 +5737,12 @@ ; GCN-NEXT: v_mul_hi_u32 v6, v0, v3 ; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 ; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; 
GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s9, s5 +; GCN-NEXT: s_movk_i32 s4, 0x11f ; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GCN-NEXT: s_movk_i32 s5, 0x11e +; GCN-NEXT: s_mov_b32 s9, s5 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc @@ -5752,7 +5753,7 @@ ; GCN-NEXT: v_mul_hi_u32 v5, v0, s3 ; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] ; GCN-NEXT: v_mul_lo_u32 v6, v2, s3 -; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_movk_i32 s5, 0x11e ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, s3 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 @@ -5760,14 +5761,13 @@ ; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v9, v0, v5 ; GCN-NEXT: v_mul_hi_u32 v11, v2, v4 -; GCN-NEXT: s_mov_b32 s4, 0x9761f7c8 ; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc ; GCN-NEXT: v_mul_lo_u32 v10, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 -; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc @@ -5791,25 +5791,26 @@ ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v0, s12 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 -; GCN-NEXT: v_mul_lo_u32 v1, v1, s13 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s13 +; GCN-NEXT: v_mul_lo_u32 v2, v0, s4 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s12 +; GCN-NEXT: v_mul_lo_u32 v1, v1, s12 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s12 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 -; GCN-NEXT: 
v_mov_b32_e32 v3, s12 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_mov_b32_e32 v3, s4 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc -; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s13, v0 +; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 ; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] ; GCN-NEXT: v_cmp_lt_u32_e64 s[2:3], s5, v5 +; GCN-NEXT: s_mov_b32 s6, 0x9761f7c8 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v4 -; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s13, v4 +; GCN-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v4 +; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 ; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, v5 +; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v5 ; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] ; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 @@ -5818,9 +5819,9 @@ ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc ; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s5, v1 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0 +; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0 ; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s12, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1 ; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -5836,7 +5837,7 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { ; CHECK-LABEL: @urem_i64_pow2k_denom( ; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 4096 -; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: urem_i64_pow2k_denom: @@ -5861,7 +5862,7 @@ ; CHECK-LABEL: @urem_i64_pow2_shl_denom( ; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = 
urem i64 [[X:%.*]], [[SHL_Y]] -; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: urem_i64_pow2_shl_denom: @@ -5897,7 +5898,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 ; CHECK-NEXT: [[TMP5:%.*]] = urem i64 [[TMP4]], 4096 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 -; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 ; CHECK-NEXT: ret void ; ; GCN-LABEL: urem_v2i64_pow2k_denom: @@ -5932,7 +5933,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 ; CHECK-NEXT: [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 -; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 ; CHECK-NEXT: ret void ; ; GCN-LABEL: urem_v2i64_pow2_shl_denom: @@ -5968,7 +5969,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; CHECK-LABEL: @sdiv_i64_oddk_denom( ; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195 -; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: sdiv_i64_oddk_denom: @@ -6055,33 +6056,33 @@ ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, s3 -; GCN-NEXT: v_mul_hi_u32 v3, s3, v0 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s0, v4 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc -; GCN-NEXT: v_subrev_i32_e32 
v3, vcc, s3, v4 -; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc +; GCN-NEXT: v_mul_lo_u32 v4, v1, s3 +; GCN-NEXT: v_mul_hi_u32 v5, s3, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 2, v0 +; GCN-NEXT: v_mul_lo_u32 v8, v0, s3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v0 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_sub_i32_e32 v8, vcc, s0, v8 +; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc +; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s3, v8 +; GCN-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc ; GCN-NEXT: s_mov_b32 s0, 0x12d8fa -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v4 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc +; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 +; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; GCN-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc +; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] +; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GCN-NEXT: 
v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 ; GCN-NEXT: v_xor_b32_e32 v1, s2, v1 @@ -6098,7 +6099,7 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { ; CHECK-LABEL: @sdiv_i64_pow2k_denom( ; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 4096 -; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: sdiv_i64_pow2k_denom: @@ -6127,7 +6128,7 @@ ; CHECK-LABEL: @sdiv_i64_pow2_shl_denom( ; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]] -; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: sdiv_i64_pow2_shl_denom: @@ -6284,7 +6285,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 ; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 -; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 ; CHECK-NEXT: ret void ; ; GCN-LABEL: sdiv_v2i64_pow2k_denom: @@ -6323,7 +6324,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 ; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 -; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 ; CHECK-NEXT: ret void ; ; GCN-LABEL: ssdiv_v2i64_mixed_pow2k_denom: @@ -6415,33 +6416,33 @@ ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, s9 -; GCN-NEXT: v_mul_hi_u32 
v3, s9, v0 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s9 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s0, v4 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s9, v4 -; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc +; GCN-NEXT: v_mul_lo_u32 v4, v1, s9 +; GCN-NEXT: v_mul_hi_u32 v5, s9, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 2, v0 +; GCN-NEXT: v_mul_lo_u32 v8, v0, s9 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v0 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_sub_i32_e32 v8, vcc, s0, v8 +; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc +; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s9, v8 +; GCN-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc ; GCN-NEXT: s_movk_i32 s0, 0xffe -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v4 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc +; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 +; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; GCN-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc +; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] +; GCN-NEXT: 
v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GCN-NEXT: v_xor_b32_e32 v0, s8, v0 ; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s8, v0 @@ -6468,7 +6469,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 ; CHECK-NEXT: [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 -; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 ; CHECK-NEXT: ret void ; ; GCN-LABEL: sdiv_v2i64_pow2_shl_denom: @@ -6749,7 +6750,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; CHECK-LABEL: @srem_i64_oddk_denom( ; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 1235195 -; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: srem_i64_oddk_denom: @@ -6877,7 +6878,7 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { ; CHECK-LABEL: @srem_i64_pow2k_denom( ; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 4096 -; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: srem_i64_pow2k_denom: @@ -6908,7 +6909,7 @@ ; CHECK-LABEL: @srem_i64_pow2_shl_denom( ; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]] -; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GCN-LABEL: srem_i64_pow2_shl_denom: @@ -7063,7 +7064,7 @@ ; CHECK-NEXT: 
[[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 ; CHECK-NEXT: [[TMP5:%.*]] = srem i64 [[TMP4]], 4096 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 -; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 ; CHECK-NEXT: ret void ; ; GCN-LABEL: srem_v2i64_pow2k_denom: @@ -7110,7 +7111,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 ; CHECK-NEXT: [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 -; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]] +; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 ; CHECK-NEXT: ret void ; ; GCN-LABEL: srem_v2i64_pow2_shl_denom: diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -310,12 +310,16 @@ ; GCN-LABEL: {{^}}expand_requires_expand: ; GCN-NEXT: ; %bb.0: ; %bb0 ; GCN: s_load_dword -; GCN: {{s|v}}_cmp_lt_i32 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lt_i32 s0, 0 +; GCN-NEXT: s_cselect_b64 s[0:1], 1, 0 ; GCN: s_cbranch ; GCN: s_load_dword ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cmp_{{eq|ne}}_u32_e64 +; GCN-NEXT: s_cmp_lg_u32 s0, 3 +; GCN-NEXT: s_cselect_b64 s[0:1], 1, 0 + ; GCN: s_cbranch_vccz [[BB2:BB[0-9]_[0-9]+]] ; GCN-NEXT: [[LONGBB1:BB[0-9]+_[0-9]+]]: @@ -489,9 +493,11 @@ ; GCN: [[LONG_BR_DEST0]] ; GCN: s_cbranch_vccnz -; GCN-DAG: v_cmp_lt_i32 -; GCN-DAG: v_cmp_ge_i32 - +; GCN: s_cmp_lt_i32 [[SGPR1:s[0-9]+]], 1 +; GCN: s_cselect_b64 [[MASK1:s\[[0-9]+\:[0-9]+\]]], 1, 0 +; GCN: s_cmp_ge_i32 s{{[0-9]+}}, [[SGPR1]] +; GCN: s_cselect_b64 [[MASK2:s\[[0-9]+\:[0-9]+\]]], 1, 0 +; GCN: s_and_b64 s{{\[[0-9]+\:[0-9]+\]}}, [[MASK2]], [[MASK1]] ; GCN: 
s_cbranch_vccz ; GCN: s_setpc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll b/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll --- a/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll +++ b/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll @@ -5,7 +5,8 @@ ; Produces error after adding an implicit def to v_cndmask_b32 ; GCN-LABEL: {{^}}vcc_shrink_vcc_def: -; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} +; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}} +; GCN: s_cselect_b64 vcc, 1, 0 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc define amdgpu_kernel void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) { bb0: diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll b/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll --- a/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll @@ -10,7 +10,8 @@ ; GCN: s_branch ; GCN-DAG: v_cmp_lt_i32 -; GCN-DAG: v_cmp_gt_i32 +; GCN-DAG: s_cmp_gt_i32 +; GCN-DAG: s_cselect_b64 ; GCN: s_and_b64 ; GCN: s_mov_b64 exec diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -25,9 +25,10 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_flbit_i32_b32 s0, s2 +; SI-NEXT: s_cmp_lg_u32 s2, 0 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s2, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -40,8 +41,9 @@ ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_flbit_i32_b32 s1, s0 +; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s0, 0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -379,17 +381,19 @@ ; SI-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x13 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_flbit_i32_b32 s0, s2 ; SI-NEXT: s_flbit_i32_b32 s1, s3 ; SI-NEXT: s_add_i32 s0, s0, 32 -; SI-NEXT: s_or_b32 s2, s2, s3 +; SI-NEXT: s_cmp_eq_u32 s3, 0 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 +; SI-NEXT: s_or_b32 s0, s2, s3 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s2, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -403,14 +407,16 @@ ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_flbit_i32_b32 s2, s0 -; VI-NEXT: s_flbit_i32_b32 s3, s1 ; VI-NEXT: s_add_i32 s2, s2, 32 +; VI-NEXT: s_cmp_eq_u32 s1, 0 +; VI-NEXT: s_flbit_i32_b32 s3, s1 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 +; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0 -; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s0, 0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -444,17 +450,19 @@ ; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_flbit_i32_b32 s0, s2 ; SI-NEXT: s_flbit_i32_b32 s1, s3 ; SI-NEXT: s_add_i32 s0, s0, 32 -; SI-NEXT: s_or_b32 s2, s2, s3 +; SI-NEXT: s_cmp_eq_u32 s3, 0 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: 
v_mov_b32_e32 v1, s0 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 +; SI-NEXT: s_or_b32 s0, s2, s3 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s2, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -467,14 +475,16 @@ ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_flbit_i32_b32 s2, s0 -; VI-NEXT: s_flbit_i32_b32 s3, s1 ; VI-NEXT: s_add_i32 s2, s2, 32 +; VI-NEXT: s_cmp_eq_u32 s1, 0 +; VI-NEXT: s_flbit_i32_b32 s3, s1 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 +; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0 -; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s0, 0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -99,7 +99,7 @@ ; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64: ; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} -; GCN-DAG: v_cmp_eq_u32_e64 vcc, s[[HI]], 0{{$}} +; GCN-DAG: s_cmp_eq_u32 s[[HI]], 0{{$}} ; GCN-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]] ; GCN-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32 ; GCN-DAG: s_flbit_i32_b32 [[FFBH_HI:s[0-9]+]], s[[HI]] diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll --- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll @@ -1,385 +1,783 @@ +; NOTE: 
Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s -; GCN-LABEL: {{^}}float4_extelt: -; GCN-NOT: buffer_ -; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 -; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2 -; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3 -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1.0, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], 2.0, [[V1]], [[C2]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], 4.0, [[V2]], [[C3]] -; GCN: store_dword v[{{[0-9:]+}}], [[V3]] define amdgpu_kernel void @float4_extelt(float addrspace(1)* %out, i32 %sel) { +; GCN-LABEL: float4_extelt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s4, 1 +; GCN-NEXT: s_cselect_b64 s[0:1], 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 2 +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1] +; GCN-NEXT: s_cmp_lg_u32 s4, 3 +; GCN-NEXT: v_cndmask_b32_e32 v0, 2.0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v2, 4.0, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_endpgm entry: %ext = extractelement <4 x float> , i32 %sel store float %ext, float addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}int4_extelt: -; GCN-NOT: buffer_ -; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 -; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2 -; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3 -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], 2, [[V1]], [[C2]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], 4, [[V2]], [[C3]] -; GCN: store_dword v[{{[0-9:]+}}], [[V3]] define 
amdgpu_kernel void @int4_extelt(i32 addrspace(1)* %out, i32 %sel) { +; GCN-LABEL: int4_extelt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s4, 1 +; GCN-NEXT: s_cselect_b64 s[0:1], 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 2 +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GCN-NEXT: s_cmp_lg_u32 s4, 3 +; GCN-NEXT: v_cndmask_b32_e32 v0, 2, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v2, 4, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_endpgm entry: %ext = extractelement <4 x i32> , i32 %sel store i32 %ext, i32 addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}double4_extelt: -; GCN-NOT: buffer_ -; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 -; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2 -; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3 -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]] -; GCN: store_dwordx2 v[{{[0-9:]+}}] define amdgpu_kernel void @double4_extelt(double addrspace(1)* %out, i32 %sel) { +; GCN-LABEL: double4_extelt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: v_mov_b32_e32 v0, 0x3f847ae1 +; GCN-NEXT: v_mov_b32_e32 v1, 0x3ff028f5 +; GCN-NEXT: v_mov_b32_e32 v2, 0xc28f5c29 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s2, 1 +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_eq_u32 s2, 2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: s_cselect_b64 s[0:1], 1, 0 +; GCN-NEXT: s_cmp_eq_u32 s2, 3 +; GCN-NEXT: v_mov_b32_e32 v1, 0x4000147a +; GCN-NEXT: 
v_cndmask_b32_e64 v0, v0, v1, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v1, 0x40100a3d +; GCN-NEXT: s_cselect_b64 s[2:3], 1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GCN-NEXT: v_mov_b32_e32 v0, 0x47ae147b +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0xe147ae14 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, 0x70a3d70a +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN-NEXT: s_endpgm entry: %ext = extractelement <4 x double> , i32 %sel store double %ext, double addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}double5_extelt: -; GCN-NOT: buffer_ -; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 -; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2 -; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3 -; GCN-DAG: v_cmp_eq_u32_e64 [[C4:[^,]+]], [[IDX]], 4 -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C4]] -; GCN: store_dwordx2 v[{{[0-9:]+}}] define amdgpu_kernel void @double5_extelt(double addrspace(1)* %out, i32 %sel) { +; GCN-LABEL: double5_extelt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: v_mov_b32_e32 v0, 0x47ae147b +; GCN-NEXT: v_mov_b32_e32 v1, 0xc28f5c29 +; GCN-NEXT: v_mov_b32_e32 v2, 0x3ff028f5 +; GCN-NEXT: v_mov_b32_e32 v3, 0x70a3d70a +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s4, 1 +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_eq_u32 s4, 2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: s_cselect_b64 s[0:1], 1, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0xe147ae14 +; GCN-NEXT: 
v_cndmask_b32_e64 v0, v0, v1, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v1, 0x3f847ae1 +; GCN-NEXT: s_cmp_eq_u32 s4, 3 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x4000147a +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_eq_u32 s4, 4 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, 0x40100a3d +; GCN-NEXT: s_cselect_b64 s[0:1], 1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x40140a3d +; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN-NEXT: s_endpgm entry: %ext = extractelement <5 x double> , i32 %sel store double %ext, double addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}half4_extelt: -; GCN-NOT: buffer_ -; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00 -; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x44004200 -; GCN-DAG: s_lshl_b32 [[SEL:s[0-p]+]], s{{[0-9]+}}, 4 -; GCN: s_lshr_b64 s{{\[}}[[RL:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SL]]:[[SH]]], [[SEL]] -; GCN-DAG: v_mov_b32_e32 v[[VRL:[0-9]+]], s[[RL]] -; GCN: store_short v[{{[0-9:]+}}], v[[VRL]] define amdgpu_kernel void @half4_extelt(half addrspace(1)* %out, i32 %sel) { +; GCN-LABEL: half4_extelt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN-NEXT: s_mov_b32 s3, 0x44004200 +; GCN-NEXT: s_mov_b32 s2, 0x40003c00 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_lshl_b32 s0, s0, 4 +; GCN-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: flat_store_short v[0:1], v2 +; GCN-NEXT: s_endpgm entry: %ext = extractelement <4 x half> , i32 %sel store half %ext, half addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}float2_extelt: -; GCN-NOT: buffer_ -; 
GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1.0, [[C1]] -; GCN: store_dword v[{{[0-9:]+}}], [[V1]] define amdgpu_kernel void @float2_extelt(float addrspace(1)* %out, i32 %sel) { +; GCN-LABEL: float2_extelt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_cmp_eq_u32 s0, 1 +; GCN-NEXT: s_cselect_b64 s[0:1], 1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_endpgm entry: %ext = extractelement <2 x float> , i32 %sel store float %ext, float addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}double2_extelt: -; GCN-NOT: buffer_ -; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]] -; GCN: store_dwordx2 v[{{[0-9:]+}}] define amdgpu_kernel void @double2_extelt(double addrspace(1)* %out, i32 %sel) { +; GCN-LABEL: double2_extelt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN-NEXT: v_mov_b32_e32 v0, 0x3f847ae1 +; GCN-NEXT: v_mov_b32_e32 v1, 0x3ff028f5 +; GCN-NEXT: v_mov_b32_e32 v2, 0xc28f5c29 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s0, 1 +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v0, 0x47ae147b +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN-NEXT: s_endpgm entry: %ext = extractelement <2 x double> , i32 %sel store double %ext, double addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}half8_extelt: -; GCN-NOT: buffer_ -; 
GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 -; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2 -; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3 -; GCN-DAG: v_cmp_ne_u32_e64 [[C4:[^,]+]], [[IDX]], 4 -; GCN-DAG: v_cmp_ne_u32_e64 [[C5:[^,]+]], [[IDX]], 5 -; GCN-DAG: v_cmp_ne_u32_e64 [[C6:[^,]+]], [[IDX]], 6 -; GCN-DAG: v_cmp_ne_u32_e64 [[C7:[^,]+]], [[IDX]], 7 -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]] -; GCN: store_short v[{{[0-9:]+}}], [[V7]] define amdgpu_kernel void @half8_extelt(half addrspace(1)* %out, i32 %sel) { +; GCN-LABEL: half8_extelt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GCN-NEXT: v_mov_b32_e32 v1, 0x4000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s0, 1 +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0x4200 +; GCN-NEXT: s_cmp_lg_u32 s0, 3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0x4400 +; GCN-NEXT: s_cmp_lg_u32 s0, 4 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0x4500 +; GCN-NEXT: s_cmp_lg_u32 s0, 5 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0x4600 
+; GCN-NEXT: s_cmp_lg_u32 s0, 6 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0x4700 +; GCN-NEXT: s_cmp_lg_u32 s0, 7 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v1, 0x4800 +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: flat_store_short v[0:1], v2 +; GCN-NEXT: s_endpgm entry: %ext = extractelement <8 x half> , i32 %sel store half %ext, half addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}short8_extelt: -; GCN-NOT: buffer_ -; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 -; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2 -; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3 -; GCN-DAG: v_cmp_ne_u32_e64 [[C4:[^,]+]], [[IDX]], 4 -; GCN-DAG: v_cmp_ne_u32_e64 [[C5:[^,]+]], [[IDX]], 5 -; GCN-DAG: v_cmp_ne_u32_e64 [[C6:[^,]+]], [[IDX]], 6 -; GCN-DAG: v_cmp_ne_u32_e64 [[C7:[^,]+]], [[IDX]], 7 -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]] -; GCN: store_short v[{{[0-9:]+}}], [[V7]] define amdgpu_kernel void @short8_extelt(i16 addrspace(1)* %out, i32 %sel) { +; GCN-LABEL: short8_extelt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s4, 1 +; GCN-NEXT: s_cselect_b64 s[0:1], 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 2 +; 
GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 1, 2, s[0:1] +; GCN-NEXT: s_cmp_lg_u32 s4, 3 +; GCN-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 4 +; GCN-NEXT: v_cndmask_b32_e32 v0, 4, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 5 +; GCN-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 6 +; GCN-NEXT: v_cndmask_b32_e32 v0, 6, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 7 +; GCN-NEXT: v_cndmask_b32_e32 v0, 7, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v2, 8, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: flat_store_short v[0:1], v2 +; GCN-NEXT: s_endpgm entry: %ext = extractelement <8 x i16> , i32 %sel store i16 %ext, i16 addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}float8_extelt: -; GCN-NOT: buffer_ -; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 -; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2 -; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3 -; GCN-DAG: v_cmp_ne_u32_e64 [[C4:[^,]+]], [[IDX]], 4 -; GCN-DAG: v_cmp_ne_u32_e64 [[C5:[^,]+]], [[IDX]], 5 -; GCN-DAG: v_cmp_ne_u32_e64 [[C6:[^,]+]], [[IDX]], 6 -; GCN-DAG: v_cmp_ne_u32_e64 [[C7:[^,]+]], [[IDX]], 7 -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]] -; GCN: store_dword v[{{[0-9:]+}}], [[V7]] define amdgpu_kernel void 
@float8_extelt(float addrspace(1)* %out, i32 %sel) { +; GCN-LABEL: float8_extelt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: v_mov_b32_e32 v0, 0x40400000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s4, 1 +; GCN-NEXT: s_cselect_b64 s[0:1], 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 2 +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, s[0:1] +; GCN-NEXT: s_cmp_lg_u32 s4, 3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 4 +; GCN-NEXT: v_cndmask_b32_e32 v0, 4.0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0x40a00000 +; GCN-NEXT: s_cmp_lg_u32 s4, 5 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; GCN-NEXT: s_cmp_lg_u32 s4, 6 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0x40e00000 +; GCN-NEXT: s_cmp_lg_u32 s4, 7 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v1, 0x41000000 +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_endpgm entry: %ext = extractelement <8 x float> , i32 %sel store float %ext, float addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}double8_extelt: -; GCN-NOT: buffer_ -; GCN-NOT: s_or_b32 -; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0 -; GCN-DAG: v_mov_b32_e32 v[[#BASE:]], [[ZERO]] -; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]] -; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]] -; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]] -; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[RES_LO]]:[[RES_HI]]] define amdgpu_kernel void @double8_extelt(double addrspace(1)* %out, i32 %sel) { +; GCN-LABEL: 
double8_extelt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s18, s[0:1], 0x2c +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_mov_b32 s15, 0x40200000 +; GCN-NEXT: s_mov_b32 s13, 0x401c0000 +; GCN-NEXT: s_mov_b32 s11, 0x40180000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 s18, s18, 1 +; GCN-NEXT: s_mov_b32 s9, 0x40140000 +; GCN-NEXT: s_mov_b32 s7, 0x40100000 +; GCN-NEXT: s_mov_b32 s5, 0x40080000 +; GCN-NEXT: s_mov_b32 s3, 2.0 +; GCN-NEXT: s_mov_b32 s1, 0x3ff00000 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s6, s0 +; GCN-NEXT: s_mov_b32 s8, s0 +; GCN-NEXT: s_mov_b32 s10, s0 +; GCN-NEXT: s_mov_b32 s12, s0 +; GCN-NEXT: s_mov_b32 s14, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v15, s15 +; GCN-NEXT: s_mov_b32 m0, s18 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NEXT: v_mov_b32_e32 v6, s6 +; GCN-NEXT: v_mov_b32_e32 v7, s7 +; GCN-NEXT: v_mov_b32_e32 v8, s8 +; GCN-NEXT: v_mov_b32_e32 v9, s9 +; GCN-NEXT: v_mov_b32_e32 v10, s10 +; GCN-NEXT: v_mov_b32_e32 v11, s11 +; GCN-NEXT: v_mov_b32_e32 v12, s12 +; GCN-NEXT: v_mov_b32_e32 v13, s13 +; GCN-NEXT: v_mov_b32_e32 v14, s14 +; GCN-NEXT: v_movrels_b32_e32 v16, v1 +; GCN-NEXT: v_movrels_b32_e32 v15, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NEXT: flat_store_dwordx2 v[0:1], v[15:16] +; GCN-NEXT: s_endpgm entry: %ext = extractelement <8 x double> , i32 %sel store double %ext, double addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}double7_extelt: -; GCN-NOT: buffer_ -; GCN-NOT: s_or_b32 -; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0 -; GCN-DAG: v_mov_b32_e32 v[[#BASE:]], [[ZERO]] -; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]] -; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]] -; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]] -; 
GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[RES_LO]]:[[RES_HI]]] define amdgpu_kernel void @double7_extelt(double addrspace(1)* %out, i32 %sel) { +; GCN-LABEL: double7_extelt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s16, s[0:1], 0x2c +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_mov_b32 s13, 0x401c0000 +; GCN-NEXT: s_mov_b32 s11, 0x40180000 +; GCN-NEXT: s_mov_b32 s9, 0x40140000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 s16, s16, 1 +; GCN-NEXT: s_mov_b32 s7, 0x40100000 +; GCN-NEXT: s_mov_b32 s5, 0x40080000 +; GCN-NEXT: s_mov_b32 s3, 2.0 +; GCN-NEXT: s_mov_b32 s1, 0x3ff00000 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s6, s0 +; GCN-NEXT: s_mov_b32 s8, s0 +; GCN-NEXT: s_mov_b32 s10, s0 +; GCN-NEXT: s_mov_b32 s12, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v15, s15 +; GCN-NEXT: s_mov_b32 m0, s16 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NEXT: v_mov_b32_e32 v6, s6 +; GCN-NEXT: v_mov_b32_e32 v7, s7 +; GCN-NEXT: v_mov_b32_e32 v8, s8 +; GCN-NEXT: v_mov_b32_e32 v9, s9 +; GCN-NEXT: v_mov_b32_e32 v10, s10 +; GCN-NEXT: v_mov_b32_e32 v11, s11 +; GCN-NEXT: v_mov_b32_e32 v12, s12 +; GCN-NEXT: v_mov_b32_e32 v13, s13 +; GCN-NEXT: v_mov_b32_e32 v14, s14 +; GCN-NEXT: v_movrels_b32_e32 v16, v1 +; GCN-NEXT: v_movrels_b32_e32 v15, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NEXT: flat_store_dwordx2 v[0:1], v[15:16] +; GCN-NEXT: s_endpgm entry: %ext = extractelement <7 x double> , i32 %sel store double %ext, double addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}float16_extelt: -; GCN-NOT: buffer_ -; GCN-DAG: s_mov_b32 m0, -; GCN-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000 -; GCN-DAG: 
v_mov_b32_e32 v{{[0-9]+}}, 4.0 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40a00000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40c00000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40e00000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41000000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41100000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41200000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41300000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41400000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41500000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41600000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41700000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41800000 -; GCN-DAG: v_movrels_b32_e32 [[RES:v[0-9]+]], [[VLO]] -; GCN: store_dword v[{{[0-9:]+}}], [[RES]] define amdgpu_kernel void @float16_extelt(float addrspace(1)* %out, i32 %sel) { +; GCN-LABEL: float16_extelt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN-NEXT: v_mov_b32_e32 v0, 1.0 +; GCN-NEXT: v_mov_b32_e32 v1, 2.0 +; GCN-NEXT: v_mov_b32_e32 v2, 0x40400000 +; GCN-NEXT: v_mov_b32_e32 v3, 4.0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; GCN-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; GCN-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; GCN-NEXT: v_mov_b32_e32 v7, 0x41000000 +; GCN-NEXT: v_mov_b32_e32 v8, 0x41100000 +; GCN-NEXT: v_mov_b32_e32 v9, 0x41200000 +; GCN-NEXT: v_mov_b32_e32 v10, 0x41300000 +; GCN-NEXT: v_mov_b32_e32 v11, 0x41400000 +; GCN-NEXT: v_mov_b32_e32 v12, 0x41500000 +; GCN-NEXT: v_mov_b32_e32 v13, 0x41600000 +; GCN-NEXT: v_mov_b32_e32 v14, 0x41700000 +; GCN-NEXT: v_mov_b32_e32 v15, 0x41800000 +; GCN-NEXT: v_movrels_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_endpgm entry: %ext = extractelement <16 x float> , i32 %sel store float %ext, float addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}double15_extelt: -; 
GCN-NOT: buffer_ -; GCN-NOT: s_or_b32 -; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0 -; GCN-DAG: v_mov_b32_e32 v[[#BASE:]], [[ZERO]] -; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]] -; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]] -; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]] -; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[RES_LO]]:[[RES_HI]]] define amdgpu_kernel void @double15_extelt(double addrspace(1)* %out, i32 %sel) { +; GCN-LABEL: double15_extelt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_mov_b32 s36, 0 +; GCN-NEXT: s_mov_b32 s65, 0x402e0000 +; GCN-NEXT: s_mov_b32 s63, 0x402c0000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 s2, s2, 1 +; GCN-NEXT: s_mov_b32 s61, 0x402a0000 +; GCN-NEXT: s_mov_b32 s59, 0x40280000 +; GCN-NEXT: s_mov_b32 s57, 0x40260000 +; GCN-NEXT: s_mov_b32 s55, 0x40240000 +; GCN-NEXT: s_mov_b32 s53, 0x40220000 +; GCN-NEXT: s_mov_b32 s51, 0x40200000 +; GCN-NEXT: s_mov_b32 s49, 0x401c0000 +; GCN-NEXT: s_mov_b32 s47, 0x40180000 +; GCN-NEXT: s_mov_b32 s45, 0x40140000 +; GCN-NEXT: s_mov_b32 s43, 0x40100000 +; GCN-NEXT: s_mov_b32 s41, 0x40080000 +; GCN-NEXT: s_mov_b32 s39, 2.0 +; GCN-NEXT: s_mov_b32 s37, 0x3ff00000 +; GCN-NEXT: s_mov_b32 s38, s36 +; GCN-NEXT: s_mov_b32 s40, s36 +; GCN-NEXT: s_mov_b32 s42, s36 +; GCN-NEXT: s_mov_b32 s44, s36 +; GCN-NEXT: s_mov_b32 s46, s36 +; GCN-NEXT: s_mov_b32 s48, s36 +; GCN-NEXT: s_mov_b32 s50, s36 +; GCN-NEXT: s_mov_b32 s52, s36 +; GCN-NEXT: s_mov_b32 s54, s36 +; GCN-NEXT: s_mov_b32 s56, s36 +; GCN-NEXT: s_mov_b32 s58, s36 +; GCN-NEXT: s_mov_b32 s60, s36 +; GCN-NEXT: s_mov_b32 s62, s36 +; GCN-NEXT: s_mov_b32 s64, s36 +; GCN-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NEXT: v_mov_b32_e32 v31, s67 +; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s38 +; GCN-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NEXT: v_mov_b32_e32 v4, s40 +; GCN-NEXT: v_mov_b32_e32 v5, s41 +; 
GCN-NEXT: v_mov_b32_e32 v6, s42 +; GCN-NEXT: v_mov_b32_e32 v7, s43 +; GCN-NEXT: v_mov_b32_e32 v8, s44 +; GCN-NEXT: v_mov_b32_e32 v9, s45 +; GCN-NEXT: v_mov_b32_e32 v10, s46 +; GCN-NEXT: v_mov_b32_e32 v11, s47 +; GCN-NEXT: v_mov_b32_e32 v12, s48 +; GCN-NEXT: v_mov_b32_e32 v13, s49 +; GCN-NEXT: v_mov_b32_e32 v14, s50 +; GCN-NEXT: v_mov_b32_e32 v15, s51 +; GCN-NEXT: v_mov_b32_e32 v16, s52 +; GCN-NEXT: v_mov_b32_e32 v17, s53 +; GCN-NEXT: v_mov_b32_e32 v18, s54 +; GCN-NEXT: v_mov_b32_e32 v19, s55 +; GCN-NEXT: v_mov_b32_e32 v20, s56 +; GCN-NEXT: v_mov_b32_e32 v21, s57 +; GCN-NEXT: v_mov_b32_e32 v22, s58 +; GCN-NEXT: v_mov_b32_e32 v23, s59 +; GCN-NEXT: v_mov_b32_e32 v24, s60 +; GCN-NEXT: v_mov_b32_e32 v25, s61 +; GCN-NEXT: v_mov_b32_e32 v26, s62 +; GCN-NEXT: v_mov_b32_e32 v27, s63 +; GCN-NEXT: v_mov_b32_e32 v28, s64 +; GCN-NEXT: v_mov_b32_e32 v29, s65 +; GCN-NEXT: v_mov_b32_e32 v30, s66 +; GCN-NEXT: v_movrels_b32_e32 v32, v1 +; GCN-NEXT: v_movrels_b32_e32 v31, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: flat_store_dwordx2 v[0:1], v[31:32] +; GCN-NEXT: s_endpgm entry: %ext = extractelement <15 x double> , i32 %sel store double %ext, double addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}double16_extelt: -; GCN-NOT: buffer_ -; GCN-NOT: s_or_b32 -; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0 -; GCN-DAG: v_mov_b32_e32 v[[#BASE:]], [[ZERO]] -; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]] -; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]] -; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]] -; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[RES_LO]]:[[RES_HI]]] define amdgpu_kernel void @double16_extelt(double addrspace(1)* %out, i32 %sel) { +; GCN-LABEL: double16_extelt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_mov_b32 s36, 0 +; GCN-NEXT: s_mov_b32 s67, 0x40300000 +; GCN-NEXT: s_mov_b32 s65, 0x402e0000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; 
GCN-NEXT: s_lshl_b32 s2, s2, 1 +; GCN-NEXT: s_mov_b32 s63, 0x402c0000 +; GCN-NEXT: s_mov_b32 s61, 0x402a0000 +; GCN-NEXT: s_mov_b32 s59, 0x40280000 +; GCN-NEXT: s_mov_b32 s57, 0x40260000 +; GCN-NEXT: s_mov_b32 s55, 0x40240000 +; GCN-NEXT: s_mov_b32 s53, 0x40220000 +; GCN-NEXT: s_mov_b32 s51, 0x40200000 +; GCN-NEXT: s_mov_b32 s49, 0x401c0000 +; GCN-NEXT: s_mov_b32 s47, 0x40180000 +; GCN-NEXT: s_mov_b32 s45, 0x40140000 +; GCN-NEXT: s_mov_b32 s43, 0x40100000 +; GCN-NEXT: s_mov_b32 s41, 0x40080000 +; GCN-NEXT: s_mov_b32 s39, 2.0 +; GCN-NEXT: s_mov_b32 s37, 0x3ff00000 +; GCN-NEXT: s_mov_b32 s38, s36 +; GCN-NEXT: s_mov_b32 s40, s36 +; GCN-NEXT: s_mov_b32 s42, s36 +; GCN-NEXT: s_mov_b32 s44, s36 +; GCN-NEXT: s_mov_b32 s46, s36 +; GCN-NEXT: s_mov_b32 s48, s36 +; GCN-NEXT: s_mov_b32 s50, s36 +; GCN-NEXT: s_mov_b32 s52, s36 +; GCN-NEXT: s_mov_b32 s54, s36 +; GCN-NEXT: s_mov_b32 s56, s36 +; GCN-NEXT: s_mov_b32 s58, s36 +; GCN-NEXT: s_mov_b32 s60, s36 +; GCN-NEXT: s_mov_b32 s62, s36 +; GCN-NEXT: s_mov_b32 s64, s36 +; GCN-NEXT: s_mov_b32 s66, s36 +; GCN-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NEXT: v_mov_b32_e32 v31, s67 +; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s38 +; GCN-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NEXT: v_mov_b32_e32 v4, s40 +; GCN-NEXT: v_mov_b32_e32 v5, s41 +; GCN-NEXT: v_mov_b32_e32 v6, s42 +; GCN-NEXT: v_mov_b32_e32 v7, s43 +; GCN-NEXT: v_mov_b32_e32 v8, s44 +; GCN-NEXT: v_mov_b32_e32 v9, s45 +; GCN-NEXT: v_mov_b32_e32 v10, s46 +; GCN-NEXT: v_mov_b32_e32 v11, s47 +; GCN-NEXT: v_mov_b32_e32 v12, s48 +; GCN-NEXT: v_mov_b32_e32 v13, s49 +; GCN-NEXT: v_mov_b32_e32 v14, s50 +; GCN-NEXT: v_mov_b32_e32 v15, s51 +; GCN-NEXT: v_mov_b32_e32 v16, s52 +; GCN-NEXT: v_mov_b32_e32 v17, s53 +; GCN-NEXT: v_mov_b32_e32 v18, s54 +; GCN-NEXT: v_mov_b32_e32 v19, s55 +; GCN-NEXT: v_mov_b32_e32 v20, s56 +; GCN-NEXT: v_mov_b32_e32 v21, s57 +; GCN-NEXT: v_mov_b32_e32 v22, s58 +; GCN-NEXT: v_mov_b32_e32 v23, s59 +; GCN-NEXT: v_mov_b32_e32 
v24, s60 +; GCN-NEXT: v_mov_b32_e32 v25, s61 +; GCN-NEXT: v_mov_b32_e32 v26, s62 +; GCN-NEXT: v_mov_b32_e32 v27, s63 +; GCN-NEXT: v_mov_b32_e32 v28, s64 +; GCN-NEXT: v_mov_b32_e32 v29, s65 +; GCN-NEXT: v_mov_b32_e32 v30, s66 +; GCN-NEXT: v_movrels_b32_e32 v32, v1 +; GCN-NEXT: v_movrels_b32_e32 v31, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: flat_store_dwordx2 v[0:1], v[31:32] +; GCN-NEXT: s_endpgm entry: %ext = extractelement <16 x double> , i32 %sel store double %ext, double addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}float32_extelt: -; GCN-NOT: buffer_ -; GCN-DAG: s_mov_b32 m0, -; GCN-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40a00000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40c00000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40e00000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41000000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41100000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41200000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41300000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41400000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41500000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41600000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41700000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41800000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41880000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41980000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41a00000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41a80000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41b00000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41b80000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41c00000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41c80000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41d00000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41d80000 -; GCN-DAG: v_mov_b32_e32 
v{{[0-9]+}}, 0x41e00000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41e80000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41f00000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41f80000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x42000000 -; GCN-DAG: v_movrels_b32_e32 [[RES:v[0-9]+]], [[VLO]] -; GCN: store_dword v[{{[0-9:]+}}], [[RES]] define amdgpu_kernel void @float32_extelt(float addrspace(1)* %out, i32 %sel) { +; GCN-LABEL: float32_extelt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 1.0 +; GCN-NEXT: v_mov_b32_e32 v1, 2.0 +; GCN-NEXT: v_mov_b32_e32 v2, 0x40400000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: v_mov_b32_e32 v3, 4.0 +; GCN-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; GCN-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; GCN-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; GCN-NEXT: v_mov_b32_e32 v7, 0x41000000 +; GCN-NEXT: v_mov_b32_e32 v8, 0x41100000 +; GCN-NEXT: v_mov_b32_e32 v9, 0x41200000 +; GCN-NEXT: v_mov_b32_e32 v10, 0x41300000 +; GCN-NEXT: v_mov_b32_e32 v11, 0x41400000 +; GCN-NEXT: v_mov_b32_e32 v12, 0x41500000 +; GCN-NEXT: v_mov_b32_e32 v13, 0x41600000 +; GCN-NEXT: v_mov_b32_e32 v14, 0x41700000 +; GCN-NEXT: v_mov_b32_e32 v15, 0x41800000 +; GCN-NEXT: v_mov_b32_e32 v16, 0x41880000 +; GCN-NEXT: v_mov_b32_e32 v17, 0x41900000 +; GCN-NEXT: v_mov_b32_e32 v18, 0x41980000 +; GCN-NEXT: v_mov_b32_e32 v19, 0x41a00000 +; GCN-NEXT: v_mov_b32_e32 v20, 0x41a80000 +; GCN-NEXT: v_mov_b32_e32 v21, 0x41b00000 +; GCN-NEXT: v_mov_b32_e32 v22, 0x41b80000 +; GCN-NEXT: v_mov_b32_e32 v23, 0x41c00000 +; GCN-NEXT: v_mov_b32_e32 v24, 0x41c80000 +; GCN-NEXT: v_mov_b32_e32 v25, 0x41d00000 +; GCN-NEXT: v_mov_b32_e32 v26, 0x41d80000 +; GCN-NEXT: v_mov_b32_e32 v27, 0x41e00000 +; GCN-NEXT: v_mov_b32_e32 v28, 0x41e80000 +; GCN-NEXT: v_mov_b32_e32 v29, 0x41f00000 +; GCN-NEXT: v_mov_b32_e32 v30, 0x41f80000 +; GCN-NEXT: v_mov_b32_e32 v31, 0x42000000 +; GCN-NEXT: v_movrels_b32_e32 v2, v0 
+; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_endpgm entry: %ext = extractelement <32 x float> , i32 %sel store float %ext, float addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}byte8_extelt: -; GCN-NOT: buffer_ -; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x4030201 -; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x8070605 -; GCN-DAG: s_lshl_b32 [[SEL:s[0-p]+]], s{{[0-9]+}}, 3 -; GCN: s_lshr_b64 s{{\[}}[[RL:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SL]]:[[SH]]], [[SEL]] -; GCN-DAG: v_mov_b32_e32 v[[VRL:[0-9]+]], s[[RL]] -; GCN: store_byte v[{{[0-9:]+}}], v[[VRL]] define amdgpu_kernel void @byte8_extelt(i8 addrspace(1)* %out, i32 %sel) { +; GCN-LABEL: byte8_extelt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN-NEXT: s_mov_b32 s3, 0x8070605 +; GCN-NEXT: s_mov_b32 s2, 0x4030201 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_lshl_b32 s0, s0, 3 +; GCN-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: flat_store_byte v[0:1], v2 +; GCN-NEXT: s_endpgm entry: %ext = extractelement <8 x i8> , i32 %sel store i8 %ext, i8 addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}byte16_extelt: -; GCN-NOT: buffer_ -; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 -; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2 -; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3 -; GCN-DAG: v_cmp_ne_u32_e64 [[C4:[^,]+]], [[IDX]], 4 -; GCN-DAG: v_cmp_ne_u32_e64 [[C5:[^,]+]], [[IDX]], 5 -; GCN-DAG: v_cmp_ne_u32_e64 [[C6:[^,]+]], [[IDX]], 6 -; GCN-DAG: v_cmp_ne_u32_e64 [[C7:[^,]+]], [[IDX]], 7 -; GCN-DAG: v_cmp_ne_u32_e64 [[C8:[^,]+]], [[IDX]], 8 -; GCN-DAG: v_cmp_ne_u32_e64 [[C9:[^,]+]], [[IDX]], 9 -; GCN-DAG: v_cmp_ne_u32_e64 [[C10:[^,]+]], [[IDX]], 10 -; GCN-DAG: v_cmp_ne_u32_e64 [[C11:[^,]+]], [[IDX]], 11 -; GCN-DAG: v_cmp_ne_u32_e64 [[C12:[^,]+]], 
[[IDX]], 12 -; GCN-DAG: v_cmp_ne_u32_e64 [[C13:[^,]+]], [[IDX]], 13 -; GCN-DAG: v_cmp_ne_u32_e64 [[C14:[^,]+]], [[IDX]], 14 -; GCN-DAG: v_cmp_ne_u32_e64 [[C15:[^,]+]], [[IDX]], 15 -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V8:v[0-9]+]], {{[^,]+}}, [[V7]], [[C8]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V9:v[0-9]+]], {{[^,]+}}, [[V8]], [[C8]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V10:v[0-9]+]], {{[^,]+}}, [[V9]], [[C10]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V11:v[0-9]+]], {{[^,]+}}, [[V10]], [[C11]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V12:v[0-9]+]], {{[^,]+}}, [[V11]], [[C12]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V13:v[0-9]+]], {{[^,]+}}, [[V12]], [[C13]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V14:v[0-9]+]], {{[^,]+}}, [[V13]], [[C14]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V15:v[0-9]+]], {{[^,]+}}, [[V14]], [[C15]] -; GCN: store_byte v[{{[0-9:]+}}], [[V15]] define amdgpu_kernel void @byte16_extelt(i8 addrspace(1)* %out, i32 %sel) { +; GCN-LABEL: byte16_extelt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s4, 1 +; GCN-NEXT: s_cselect_b64 s[0:1], 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 2 +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 1, 2, s[0:1] +; GCN-NEXT: s_cmp_lg_u32 s4, 3 +; GCN-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; 
GCN-NEXT: s_cmp_lg_u32 s4, 4 +; GCN-NEXT: v_cndmask_b32_e32 v0, 4, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 5 +; GCN-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 6 +; GCN-NEXT: v_cndmask_b32_e32 v0, 6, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 7 +; GCN-NEXT: v_cndmask_b32_e32 v0, 7, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 8 +; GCN-NEXT: v_cndmask_b32_e32 v0, 8, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 9 +; GCN-NEXT: v_cndmask_b32_e32 v0, 9, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 10 +; GCN-NEXT: v_cndmask_b32_e32 v0, 10, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 11 +; GCN-NEXT: v_cndmask_b32_e32 v0, 11, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 12 +; GCN-NEXT: v_cndmask_b32_e32 v0, 12, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 13 +; GCN-NEXT: v_cndmask_b32_e32 v0, 13, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 14 +; GCN-NEXT: v_cndmask_b32_e32 v0, 14, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 15 +; GCN-NEXT: v_cndmask_b32_e32 v0, 15, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v2, 16, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: flat_store_byte v[0:1], v2 +; GCN-NEXT: s_endpgm entry: %ext = extractelement <16 x i8> , i32 %sel store i8 %ext, i8 addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}bit4_extelt: -; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 -; GCN-DAG: buffer_store_byte [[ZERO]], -; GCN-DAG: buffer_store_byte [[ONE]], -; GCN-DAG: buffer_store_byte [[ZERO]], -; GCN-DAG: buffer_store_byte [[ONE]], -; GCN: buffer_load_ubyte [[LOAD:v[0-9]+]], -; GCN: 
v_and_b32_e32 [[RES:v[0-9]+]], 1, [[LOAD]] -; GCN: flat_store_dword v[{{[0-9:]+}}], [[RES]] define amdgpu_kernel void @bit4_extelt(i32 addrspace(1)* %out, i32 %sel) { +; GCN-LABEL: bit4_extelt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s7, 0xe80000 +; GCN-NEXT: s_add_u32 s4, s4, s3 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN-NEXT: s_addc_u32 s5, s5, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 4 +; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_and_b32 s0, s0, 3 +; GCN-NEXT: v_or_b32_e32 v0, s0, v0 +; GCN-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:7 +; GCN-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:6 +; GCN-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:5 +; GCN-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:4 +; GCN-NEXT: buffer_load_ubyte v0, v0, s[4:7], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v2, 1, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_endpgm entry: %ext = extractelement <4 x i1> , i32 %sel %zext = zext i1 %ext to i32 @@ -387,15 +785,398 @@ ret void } -; GCN-LABEL: {{^}}bit128_extelt: -; GCN-NOT: buffer_ -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1 -; GCN-DAG: v_mov_b32_e32 [[LASTIDX:v[0-9]+]], 0x7f -; GCN-DAG: v_cmp_ne_u32_e32 [[CL:[^,]+]], s{{[0-9]+}}, [[LASTIDX]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[VL:v[0-9]+]], 0, [[V1]], [[CL]] -; GCN: v_and_b32_e32 [[RES:v[0-9]+]], 1, [[VL]] -; GCN: store_dword v[{{[0-9:]+}}], [[RES]] define amdgpu_kernel void @bit128_extelt(i32 addrspace(1)* %out, i32 %sel) { +; GCN-LABEL: bit128_extelt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) 
+; GCN-NEXT: s_cmp_lg_u32 s0, 1 +; GCN-NEXT: s_cselect_b64 s[4:5], 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 2 +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GCN-NEXT: s_cmp_lg_u32 s0, 3 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 4 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 5 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 6 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 7 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 8 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 9 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 10 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 11 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 12 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 13 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 14 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 15 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 16 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 17 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 18 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, 
vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 19 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 20 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 21 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 22 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 23 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 24 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 25 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 26 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 27 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 28 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 29 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 30 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 31 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 32 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 33 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 34 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 35 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: 
s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 36 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 37 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 38 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 39 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 40 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 41 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 42 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 43 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 44 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 45 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 46 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 47 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 48 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 49 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 50 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 51 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 52 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; 
GCN-NEXT: s_cmp_lg_u32 s0, 53 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 54 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 55 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 56 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 57 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 58 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 59 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 60 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 61 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 62 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 63 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 64 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x41 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x42 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x43 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x44 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x45 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: 
s_cmpk_lg_i32 s0, 0x46 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x47 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x48 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x49 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4a +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4b +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4c +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4d +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4e +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4f +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x50 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x51 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x52 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x53 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x54 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x55 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x56 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 
vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x57 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x58 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x59 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5a +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5b +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5c +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5d +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5e +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5f +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x60 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x61 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x62 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x63 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x64 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x65 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x66 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x67 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; 
GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x68 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x69 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6a +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6b +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6c +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6d +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6e +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6f +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x70 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x71 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x72 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x73 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x74 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x75 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x76 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x77 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x78 +; GCN-NEXT: 
v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x79 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7a +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7b +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7c +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7d +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7e +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7f +; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: v_and_b32_e32 v2, 1, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_endpgm entry: %ext = extractelement <128 x i1> , i32 %sel %zext = zext i1 %ext to i32 @@ -403,29 +1184,177 @@ ret void } -; GCN-LABEL: {{^}}float32_extelt_vec: -; GCN-NOT: buffer_ -; GCN-DAG: v_cmp_eq_u32_e{{32|64}} [[CC1:[^,]+]], 1, v0 -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 1.0, 2.0, [[CC1]] -; GCN-DAG: v_mov_b32_e32 [[LASTVAL:v[0-9]+]], 0x42000000 -; GCN-DAG: v_cmp_ne_u32_e32 [[LASTCC:[^,]+]], 31, v0 -; GCN-DAG: v_cndmask_b32_e{{32|64}} v0, [[LASTVAL]], v{{[0-9]+}}, [[LASTCC]] define float @float32_extelt_vec(i32 %sel) { +; GCN-LABEL: float32_extelt_vec: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x40400000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: 
v_cmp_ne_u32_e32 vcc, 3, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, 4.0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x40a00000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 4, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x40c00000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x40e00000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 6, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41000000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 7, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41100000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 8, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41200000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41300000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 10, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41400000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 11, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41500000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 12, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41600000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 13, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41700000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 14, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41800000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 15, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41880000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 16, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41900000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 17, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41980000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 18, v0 +; 
GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41a00000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 19, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41a80000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 20, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41b00000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 21, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41b80000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 22, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41c00000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 23, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41c80000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 24, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41d00000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 25, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41d80000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 26, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41e00000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 27, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41e80000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 28, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41f00000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 29, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41f80000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 30, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 31, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <32 x float> , i32 %sel ret float %ext } -; GCN-LABEL: {{^}}double16_extelt_vec: -; GCN-NOT: buffer_ -; GCN-DAG: v_mov_b32_e32 [[V1HI:v[0-9]+]], 0x3ff19999 -; GCN-DAG: v_mov_b32_e32 
[[V1LO:v[0-9]+]], 0x9999999a -; GCN-DAG: v_mov_b32_e32 [[V2HI:v[0-9]+]], 0x4000cccc -; GCN-DAG: v_mov_b32_e32 [[V2LO:v[0-9]+]], 0xcccccccd -; GCN-DAG: v_cmp_eq_u32_e{{32|64}} [[CC1:[^,]+]], 1, v0 -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[R1HI:v[0-9]+]], [[V1HI]], [[V2HI]], [[CC1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[R1LO:v[0-9]+]], [[V1LO]], [[V2LO]], [[CC1]] define double @double16_extelt_vec(i32 %sel) { +; GCN-LABEL: double16_extelt_vec: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v3, 0x3ff19999 +; GCN-NEXT: v_mov_b32_e32 v4, 0x4000cccc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GCN-NEXT: v_mov_b32_e32 v4, 0x4008cccc +; GCN-NEXT: v_mov_b32_e32 v1, 0x9999999a +; GCN-NEXT: v_mov_b32_e32 v2, 0xcccccccd +; GCN-NEXT: s_or_b64 vcc, s[4:5], vcc +; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GCN-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v4, 0x40106666 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GCN-NEXT: v_mov_b32_e32 v4, 0x40146666 +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 +; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 +; GCN-NEXT: v_mov_b32_e32 v4, 0x40186666 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v5, 0x401c6666 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GCN-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v4, 0x66666666 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN-NEXT: v_mov_b32_e32 v4, 0x40203333 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GCN-NEXT: v_mov_b32_e32 v4, 0x40223333 +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 8, v0 +; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v4, 
s[4:5] +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 9, v0 +; GCN-NEXT: v_mov_b32_e32 v4, 0x40243333 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 10, v0 +; GCN-NEXT: v_mov_b32_e32 v4, 0x40263333 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 11, v0 +; GCN-NEXT: v_mov_b32_e32 v4, 0x40283333 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 12, v0 +; GCN-NEXT: v_mov_b32_e32 v4, 0x402a3333 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 13, v0 +; GCN-NEXT: v_mov_b32_e32 v4, 0x402c3333 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v5, 0x402e3333 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 14, v0 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GCN-NEXT: v_mov_b32_e32 v4, 0x33333333 +; GCN-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 15, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, 0x40301999 +; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <16 x double> , i32 %sel ret double %ext diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll @@ -14,8 +14,10 @@ ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3f64: ; GCN-NOT: buffer_load -; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 -; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2 +; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 +; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], 1, 0 +; GCN-DAG: 
s_cmp_eq_u32 [[IDX]], 2 +; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] @@ -29,9 +31,12 @@ ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4f64: ; GCN-NOT: buffer_load -; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 -; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2 -; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3 +; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 +; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], 1, 0 +; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2 +; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], 1, 0 +; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3 +; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll @@ -31,7 +31,8 @@ ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64: ; GCN-NOT: buffer_load -; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 +; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] ; GCN: store_dwordx2 v[{{[0-9:]+}}] @@ -44,7 +45,8 @@ ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64_2: ; GCN: buffer_load_dwordx4 ; GCN-NOT: buffer_load -; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 +; GCN-DAG: s_cselect_b64 
[[C1:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] ; GCN: store_dwordx2 v[{{[0-9:]+}}] @@ -58,8 +60,10 @@ ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3i64: ; GCN-NOT: buffer_load -; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 -; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2 +; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 +; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], 1, 0 +; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2 +; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] @@ -73,9 +77,12 @@ ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4i64: ; GCN-NOT: buffer_load -; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 -; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2 -; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3 +; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 +; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], 1, 0 +; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2 +; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], 1, 0 +; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3 +; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] diff --git a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll --- a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll @@ -147,23 +147,26 @@ ; GCN-LABEL: no_extract_volatile_load_dynextract: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s12, s[0:1], 
0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_load_dword s12, s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s10, s2 ; GCN-NEXT: s_mov_b32 s11, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s8, s6 ; GCN-NEXT: s_mov_b32 s9, s7 ; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NEXT: s_cmp_eq_u32 s12, 1 ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_eq_u32 s12, 2 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s12, 2 +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_eq_u32 s12, 3 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s12, 3 +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -18,11 +18,12 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_sub_i32 s3, 32, s2 ; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: s_and_b32 s1, s2, 31 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: s_cmp_eq_u32 s1, 0 ; SI-NEXT: v_alignbit_b32 v0, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -35,10 +36,11 @@ ; VI-NEXT: s_sub_i32 s3, 32, s2 ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: s_and_b32 s1, s2, 31 -; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_cmp_eq_u32 s1, 0 +; VI-NEXT: v_alignbit_b32 v0, s0, v0, v1 ; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_alignbit_b32 v0, s0, v0, v2 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0 +; VI-NEXT: 
s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -53,10 +55,11 @@ ; GFX9-NEXT: s_sub_i32 s3, 32, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_and_b32 s1, s2, 31 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_cmp_eq_u32 s1, 0 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0 +; GFX9-NEXT: s_cselect_b64 vcc, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 @@ -150,19 +153,21 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s9 ; SI-NEXT: s_sub_i32 s10, 32, s1 -; SI-NEXT: v_mov_b32_e32 v1, s10 ; SI-NEXT: s_and_b32 s1, s1, 31 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: s_cmp_eq_u32 s1, 0 ; SI-NEXT: v_alignbit_b32 v0, s3, v0, v1 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: s_sub_i32 s1, 32, s0 -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; SI-NEXT: s_and_b32 s0, s0, 31 +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; SI-NEXT: s_cmp_eq_u32 s0, 0 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v2, s1 ; SI-NEXT: v_alignbit_b32 v0, s2, v0, v2 ; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -176,19 +181,21 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: s_sub_i32 s8, 32, s1 -; VI-NEXT: v_mov_b32_e32 v1, s8 ; VI-NEXT: s_and_b32 s1, s1, 31 +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: s_cmp_eq_u32 s1, 0 ; VI-NEXT: v_alignbit_b32 v0, s5, v0, v1 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; 
VI-NEXT: s_sub_i32 s1, 32, s0 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; VI-NEXT: s_and_b32 s0, s0, 31 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; VI-NEXT: s_cmp_eq_u32 s0, 0 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: v_alignbit_b32 v0, s4, v0, v2 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -204,19 +211,21 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: s_sub_i32 s8, 32, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: s_and_b32 s1, s1, 31 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_cmp_eq_u32 s1, 0 ; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, v1 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0 +; GFX9-NEXT: s_cselect_b64 vcc, 1, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_sub_i32 s1, 32, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GFX9-NEXT: s_and_b32 s0, s0, 31 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0 +; GFX9-NEXT: s_cselect_b64 vcc, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 @@ -327,35 +336,39 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s15 ; SI-NEXT: s_sub_i32 s16, 32, s3 -; SI-NEXT: v_mov_b32_e32 v1, s16 ; SI-NEXT: s_and_b32 s3, s3, 31 +; SI-NEXT: v_mov_b32_e32 v1, s16 +; SI-NEXT: s_cmp_eq_u32 s3, 0 ; SI-NEXT: v_alignbit_b32 v0, s11, v0, v1 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v1, s11 ; SI-NEXT: s_sub_i32 s3, 32, s2 -; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; SI-NEXT: s_and_b32 s2, s2, 31 +; SI-NEXT: 
v_cndmask_b32_e32 v3, v0, v1, vcc +; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, s14 ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: v_alignbit_b32 v0, s10, v0, v1 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v1, s10 ; SI-NEXT: s_sub_i32 s2, 32, s1 -; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; SI-NEXT: s_and_b32 s1, s1, 31 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SI-NEXT: s_cmp_eq_u32 s1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s13 ; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_alignbit_b32 v0, s9, v0, v1 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: s_sub_i32 s1, 32, s0 -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; SI-NEXT: s_and_b32 s0, s0, 31 +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; SI-NEXT: s_cmp_eq_u32 s0, 0 ; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: v_mov_b32_e32 v4, s1 ; SI-NEXT: v_alignbit_b32 v0, s8, v0, v4 ; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -369,35 +382,39 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: s_sub_i32 s14, 32, s3 -; VI-NEXT: v_mov_b32_e32 v1, s14 ; VI-NEXT: s_and_b32 s3, s3, 31 +; VI-NEXT: v_mov_b32_e32 v1, s14 +; VI-NEXT: s_cmp_eq_u32 s3, 0 ; VI-NEXT: v_alignbit_b32 v0, s7, v0, v1 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: s_sub_i32 s3, 32, s2 -; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; VI-NEXT: s_and_b32 s2, s2, 31 +; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: v_mov_b32_e32 v0, s10 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_alignbit_b32 v0, s6, v0, v1 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 
v1, s6 ; VI-NEXT: s_sub_i32 s2, 32, s1 -; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; VI-NEXT: s_and_b32 s1, s1, 31 +; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; VI-NEXT: s_cmp_eq_u32 s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_alignbit_b32 v0, s5, v0, v1 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_sub_i32 s1, 32, s0 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; VI-NEXT: s_and_b32 s0, s0, 31 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; VI-NEXT: s_cmp_eq_u32 s0, 0 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: v_alignbit_b32 v0, s4, v0, v4 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v5, s13 @@ -413,35 +430,39 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-NEXT: s_sub_i32 s14, 32, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 ; GFX9-NEXT: s_and_b32 s3, s3, 31 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: s_cmp_eq_u32 s3, 0 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, v1 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; GFX9-NEXT: s_cselect_b64 vcc, 1, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_sub_i32 s3, 32, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s10 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_alignbit_b32 v0, s6, v0, v1 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX9-NEXT: s_cselect_b64 vcc, 1, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: s_sub_i32 s2, 32, s1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX9-NEXT: s_and_b32 s1, s1, 31 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GFX9-NEXT: s_cmp_eq_u32 s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 ; 
GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, v1 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0 +; GFX9-NEXT: s_cselect_b64 vcc, 1, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_sub_i32 s1, 32, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GFX9-NEXT: s_and_b32 s0, s0, 31 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0 +; GFX9-NEXT: s_cselect_b64 vcc, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, s12 ; GFX9-NEXT: v_mov_b32_e32 v5, s13 diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -140,14 +140,16 @@ ; SI-NEXT: v_mov_b32_e32 v0, s9 ; SI-NEXT: s_and_b32 s1, s1, 31 ; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: s_cmp_eq_u32 s1, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: s_and_b32 s0, s0, 31 ; SI-NEXT: v_alignbit_b32 v1, s3, v0, v1 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; SI-NEXT: s_cmp_eq_u32 s0, 0 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: v_alignbit_b32 v2, s2, v0, v2 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -162,14 +164,16 @@ ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: s_and_b32 s1, s1, 31 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_cmp_eq_u32 s1, 0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: s_and_b32 s0, s0, 31 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; VI-NEXT: s_cmp_eq_u32 s0, 0 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v2, s0 
; VI-NEXT: v_alignbit_b32 v2, s4, v0, v2 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -186,14 +190,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: s_and_b32 s1, s1, 31 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_cmp_eq_u32 s1, 0 +; GFX9-NEXT: s_cselect_b64 vcc, 1, 0 ; GFX9-NEXT: s_and_b32 s0, s0, 31 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_alignbit_b32 v2, s4, v0, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0 +; GFX9-NEXT: s_cselect_b64 vcc, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 @@ -303,26 +309,30 @@ ; SI-NEXT: v_mov_b32_e32 v0, s15 ; SI-NEXT: s_and_b32 s3, s3, 31 ; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_alignbit_b32 v1, s11, v0, v1 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; SI-NEXT: s_cmp_eq_u32 s3, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: s_and_b32 s2, s2, 31 +; SI-NEXT: v_alignbit_b32 v1, s11, v0, v1 ; SI-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc +; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, s14 ; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_alignbit_b32 v1, s10, v0, v1 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: s_and_b32 s1, s1, 31 +; SI-NEXT: v_alignbit_b32 v1, s10, v0, v1 ; SI-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; SI-NEXT: s_cmp_eq_u32 s1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s13 ; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: s_and_b32 s0, s0, 31 ; SI-NEXT: v_alignbit_b32 v1, s9, v0, v1 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; SI-NEXT: s_cmp_eq_u32 s0, 0 ; SI-NEXT: v_mov_b32_e32 v0, 
s12 ; SI-NEXT: v_mov_b32_e32 v4, s0 ; SI-NEXT: v_alignbit_b32 v4, s8, v0, v4 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -337,26 +347,30 @@ ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: s_and_b32 s3, s3, 31 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_alignbit_b32 v1, s7, v0, v1 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; VI-NEXT: s_cmp_eq_u32 s3, 0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: s_and_b32 s2, s2, 31 +; VI-NEXT: v_alignbit_b32 v1, s7, v0, v1 ; VI-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc +; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: v_mov_b32_e32 v0, s10 ; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_alignbit_b32 v1, s6, v0, v1 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: s_and_b32 s1, s1, 31 +; VI-NEXT: v_alignbit_b32 v1, s6, v0, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; VI-NEXT: s_cmp_eq_u32 s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: s_and_b32 s0, s0, 31 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; VI-NEXT: s_cmp_eq_u32 s0, 0 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_alignbit_b32 v4, s4, v0, v4 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v5, s13 @@ -373,26 +387,30 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-NEXT: s_and_b32 s3, s3, 31 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s7, v0, v1 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; GFX9-NEXT: s_cmp_eq_u32 s3, 0 +; GFX9-NEXT: s_cselect_b64 vcc, 1, 0 ; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: v_alignbit_b32 v1, s7, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, 
v1, v0, vcc +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s10 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s6, v0, v1 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX9-NEXT: s_cselect_b64 vcc, 1, 0 ; GFX9-NEXT: s_and_b32 s1, s1, 31 +; GFX9-NEXT: v_alignbit_b32 v1, s6, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX9-NEXT: s_cmp_eq_u32 s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_cselect_b64 vcc, 1, 0 ; GFX9-NEXT: s_and_b32 s0, s0, 31 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: v_alignbit_b32 v4, s4, v0, v4 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0 +; GFX9-NEXT: s_cselect_b64 vcc, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, s12 ; GFX9-NEXT: v_mov_b32_e32 v5, s13 diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll --- a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll @@ -4,51 +4,53 @@ define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) { ; SI-LABEL: i1_copy_from_loop: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s8, 0 ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: ; implicit-def: $sgpr8_sgpr9 +; SI-NEXT: ; implicit-def: $sgpr6_sgpr7 ; SI-NEXT: ; implicit-def: $sgpr10_sgpr11 ; SI-NEXT: s_branch BB0_3 -; SI-NEXT: BB0_1: ; %Flow1 -; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: BB0_1: ; in Loop: Header=BB0_3 Depth=1 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: BB0_2: ; %Flow ; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; SI-NEXT: s_and_b64 s[14:15], exec, s[10:11] ; SI-NEXT: s_or_b64 s[4:5], s[14:15], s[4:5] -; SI-NEXT: s_andn2_b64 s[8:9], 
s[8:9], exec +; SI-NEXT: s_andn2_b64 s[6:7], s[6:7], exec ; SI-NEXT: s_and_b64 s[12:13], s[12:13], exec -; SI-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; SI-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13] ; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; SI-NEXT: s_cbranch_execz BB0_6 +; SI-NEXT: s_cbranch_execz BB0_7 ; SI-NEXT: BB0_3: ; %for.body ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_cmp_lt_u32 s8, 4 +; SI-NEXT: s_cselect_b64 s[12:13], 1, 0 ; SI-NEXT: s_or_b64 s[10:11], s[10:11], exec -; SI-NEXT: s_cmp_gt_u32 s6, 3 -; SI-NEXT: v_cmp_lt_u32_e64 s[12:13], s6, 4 -; SI-NEXT: s_cbranch_scc1 BB0_2 +; SI-NEXT: s_cmp_gt_u32 s8, 3 +; SI-NEXT: s_cbranch_scc1 BB0_1 ; SI-NEXT: ; %bb.4: ; %mid.loop ; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v1, s8 ; SI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 idxen offen ; SI-NEXT: s_mov_b64 s[12:13], -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 ; SI-NEXT: s_mov_b64 s[10:11], -1 ; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc -; SI-NEXT: s_cbranch_execz BB0_1 ; SI-NEXT: ; %bb.5: ; %end.loop ; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; SI-NEXT: s_add_i32 s6, s6, 1 +; SI-NEXT: s_add_i32 s8, s8, 1 ; SI-NEXT: s_xor_b64 s[10:11], exec, -1 -; SI-NEXT: s_branch BB0_1 -; SI-NEXT: BB0_6: ; %for.end +; SI-NEXT: ; %bb.6: ; %Flow1 +; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: s_branch BB0_2 +; SI-NEXT: BB0_7: ; %for.end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_and_saveexec_b64 s[0:1], s[8:9] -; SI-NEXT: s_cbranch_execz BB0_8 -; SI-NEXT: ; %bb.7: ; %if +; SI-NEXT: s_and_saveexec_b64 s[0:1], s[6:7] +; SI-NEXT: s_cbranch_execz BB0_9 +; SI-NEXT: ; %bb.8: ; %if ; SI-NEXT: exp mrt0 v0, v0, v0, v0 done vm -; SI-NEXT: BB0_8: ; %end +; SI-NEXT: BB0_9: ; %end ; SI-NEXT: s_endpgm entry: br label %for.body diff --git a/llvm/test/CodeGen/AMDGPU/icmp64.ll b/llvm/test/CodeGen/AMDGPU/icmp64.ll --- 
a/llvm/test/CodeGen/AMDGPU/icmp64.ll +++ b/llvm/test/CodeGen/AMDGPU/icmp64.ll @@ -1,8 +1,9 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s -; SI-LABEL: {{^}}test_i64_eq: +; GCN-LABEL: {{^}}test_i64_eq: ; SI: v_cmp_eq_u64 +; VI: s_cmp_eq_u64 define amdgpu_kernel void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp eq i64 %a, %b %result = sext i1 %cmp to i32 @@ -12,6 +13,7 @@ ; SI-LABEL: {{^}}test_i64_ne: ; SI: v_cmp_ne_u64 +; VI: s_cmp_lg_u64 define amdgpu_kernel void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp ne i64 %a, %b %result = sext i1 %cmp to i32 @@ -19,8 +21,8 @@ ret void } -; SI-LABEL: {{^}}test_i64_slt: -; SI: v_cmp_lt_i64 +; GCN-LABEL: {{^}}test_i64_slt: +; GCN: v_cmp_lt_i64 define amdgpu_kernel void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp slt i64 %a, %b %result = sext i1 %cmp to i32 @@ -28,8 +30,8 @@ ret void } -; SI-LABEL: {{^}}test_i64_ult: -; SI: v_cmp_lt_u64 +; GCN-LABEL: {{^}}test_i64_ult: +; GCN: v_cmp_lt_u64 define amdgpu_kernel void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp ult i64 %a, %b %result = sext i1 %cmp to i32 @@ -37,8 +39,8 @@ ret void } -; SI-LABEL: {{^}}test_i64_sle: -; SI: v_cmp_le_i64 +; GCN-LABEL: {{^}}test_i64_sle: +; GCN: v_cmp_le_i64 define amdgpu_kernel void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp sle i64 %a, %b %result = sext i1 %cmp to i32 @@ -46,8 +48,8 @@ ret void } -; SI-LABEL: {{^}}test_i64_ule: -; SI: v_cmp_le_u64 +; GCN-LABEL: {{^}}test_i64_ule: +; GCN: v_cmp_le_u64 define 
amdgpu_kernel void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp ule i64 %a, %b %result = sext i1 %cmp to i32 @@ -55,8 +57,8 @@ ret void } -; SI-LABEL: {{^}}test_i64_sgt: -; SI: v_cmp_gt_i64 +; GCN-LABEL: {{^}}test_i64_sgt: +; GCN: v_cmp_gt_i64 define amdgpu_kernel void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp sgt i64 %a, %b %result = sext i1 %cmp to i32 @@ -64,8 +66,8 @@ ret void } -; SI-LABEL: {{^}}test_i64_ugt: -; SI: v_cmp_gt_u64 +; GCN-LABEL: {{^}}test_i64_ugt: +; GCN: v_cmp_gt_u64 define amdgpu_kernel void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp ugt i64 %a, %b %result = sext i1 %cmp to i32 @@ -73,8 +75,8 @@ ret void } -; SI-LABEL: {{^}}test_i64_sge: -; SI: v_cmp_ge_i64 +; GCN-LABEL: {{^}}test_i64_sge: +; GCN: v_cmp_ge_i64 define amdgpu_kernel void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp sge i64 %a, %b %result = sext i1 %cmp to i32 @@ -82,8 +84,8 @@ ret void } -; SI-LABEL: {{^}}test_i64_uge: -; SI: v_cmp_ge_u64 +; GCN-LABEL: {{^}}test_i64_uge: +; GCN: v_cmp_ge_u64 define amdgpu_kernel void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp uge i64 %a, %b %result = sext i1 %cmp to i32 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -3,13 +3,17 @@ ; GCN-LABEL: {{^}}float4_inselt: ; GCN-NOT: v_movrel ; GCN-NOT: buffer_ -; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3 +; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 3 +; GCN-DAG: s_cselect_b64 [[CC1:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]] -; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2 +; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 2 +; GCN-DAG: s_cselect_b64 [[CC2:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e32 
v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]] -; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1 +; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 1 +; GCN-DAG: s_cselect_b64 [[CC3:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]] -; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0 +; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 0 +; GCN-DAG: s_cselect_b64 [[CC4:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]] ; GCN: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]] define amdgpu_kernel void @float4_inselt(<4 x float> addrspace(1)* %out, <4 x float> %vec, i32 %sel) { @@ -38,13 +42,17 @@ ; GCN-LABEL: {{^}}int4_inselt: ; GCN-NOT: v_movrel ; GCN-NOT: buffer_ -; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3 +; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 3 +; GCN-DAG: s_cselect_b64 [[CC1:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1, v{{[0-9]+}}, [[CC1]] -; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2 +; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 2 +; GCN-DAG: s_cselect_b64 [[CC2:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC2]] -; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1 +; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 1 +; GCN-DAG: s_cselect_b64 [[CC3:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC3]] -; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0 +; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 0 +; GCN-DAG: s_cselect_b64 [[CC4:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1, v{{[0-9]+}}, [[CC4]] ; GCN: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]] define amdgpu_kernel void @int4_inselt(<4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %sel) { @@ -57,9 +65,11 @@ ; GCN-LABEL: {{^}}float2_inselt: ; GCN-NOT: v_movrel ; GCN-NOT: buffer_ -; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 1 +; GCN-DAG: 
s_cselect_b64 [[CC1:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]] -; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 0 +; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 0 +; GCN-DAG: s_cselect_b64 [[CC2:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC2]] ; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]] define amdgpu_kernel void @float2_inselt(<2 x float> addrspace(1)* %out, <2 x float> %vec, i32 %sel) { @@ -72,21 +82,29 @@ ; GCN-LABEL: {{^}}float8_inselt: ; GCN-NOT: v_movrel ; GCN-NOT: buffer_ -; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3 +; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 3 +; GCN-DAG: s_cselect_b64 [[CC1:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]] -; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2 +; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 2 +; GCN-DAG: s_cselect_b64 [[CC2:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]] -; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1 +; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 1 +; GCN-DAG: s_cselect_b64 [[CC3:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]] -; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0 +; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 0 +; GCN-DAG: s_cselect_b64 [[CC4:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]] -; GCN-DAG: v_cmp_ne_u32_e64 [[CC5:[^,]+]], [[IDX:s[0-9]+]], 7 +; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 7 +; GCN-DAG: s_cselect_b64 [[CC5:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC5]] -; GCN-DAG: v_cmp_ne_u32_e64 [[CC6:[^,]+]], [[IDX]], 6 +; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 6 +; GCN-DAG: s_cselect_b64 [[CC6:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC6]] -; GCN-DAG: v_cmp_ne_u32_e64 [[CC7:[^,]+]], [[IDX]], 5 +; GCN-DAG: 
s_cmp_lg_u32 [[IDX:s[0-9]+]], 5 +; GCN-DAG: s_cselect_b64 [[CC7:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC7]] -; GCN-DAG: v_cmp_ne_u32_e64 [[CC8:[^,]+]], [[IDX]], 4 +; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 4 +; GCN-DAG: s_cselect_b64 [[CC8:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC8]] ; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST0]]:[[ELT_LAST0]]] ; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST1]]:[[ELT_LAST1]]] @@ -149,14 +167,22 @@ ; GCN-LABEL: {{^}}half8_inselt: ; GCN-NOT: v_movrel ; GCN-NOT: buffer_ -; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 0 -; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 1 -; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 2 -; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 3 -; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 4 -; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 5 -; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 6 -; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 7 +; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 0 +; GCN-DAG: s_cselect_b64 {{[^,]+}}, 1, 0 +; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 1 +; GCN-DAG: s_cselect_b64 {{[^,]+}}, 1, 0 +; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 2 +; GCN-DAG: s_cselect_b64 {{[^,]+}}, 1, 0 +; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 3 +; GCN-DAG: s_cselect_b64 {{[^,]+}}, 1, 0 +; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 4 +; GCN-DAG: s_cselect_b64 {{[^,]+}}, 1, 0 +; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 5 +; GCN-DAG: s_cselect_b64 {{[^,]+}}, 1, 0 +; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 6 +; GCN-DAG: s_cselect_b64 {{[^,]+}}, 1, 0 +; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 7 +; GCN-DAG: s_cselect_b64 {{[^,]+}}, 1, 0 ; GCN-DAG: v_cndmask_b32_e32 ; GCN-DAG: v_cndmask_b32_e32 ; GCN-DAG: v_cndmask_b32_e32 @@ -228,8 +254,10 @@ ; GCN-LABEL: {{^}}byte16_inselt: ; GCN-NOT: v_movrel ; GCN-NOT: buffer_ -; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 0 -; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, 
{{s[0-9]+}}, 15 +; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 0 +; GCN-DAG: s_cselect_b64 {{[^,]+}}, 1, 0 +; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 15 +; GCN-DAG: s_cselect_b64 {{[^,]+}}, 1, 0 ; GCN-DAG: v_cndmask_b32_e32 ; GCN-DAG: v_cndmask_b32_e32 ; GCN-DAG: v_cndmask_b32_e32 @@ -264,10 +292,12 @@ ; GCN-LABEL: {{^}}double2_inselt: ; GCN-NOT: v_movrel ; GCN-NOT: buffer_ -; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 +; GCN-DAG: s_cselect_b64 [[CC1:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC1]] ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]] -; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 0 +; GCN-DAG: s_cmp_eq_u32 [[IDX]], 0 +; GCN-DAG: s_cselect_b64 [[CC2:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC2]] ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]] define amdgpu_kernel void @double2_inselt(<2 x double> addrspace(1)* %out, <2 x double> %vec, i32 %sel) { @@ -363,10 +393,12 @@ ; GCN-LABEL: {{^}}bit128_inselt: ; GCN-NOT: buffer_ -; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], s{{[0-9]+}}, 0 +; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0 +; GCN-DAG: s_cselect_b64 [[CC1:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC1]] -; GCN-DAG: v_mov_b32_e32 [[LASTIDX:v[0-9]+]], 0x7f -; GCN-DAG: v_cmp_ne_u32_e32 [[CCL:[^,]+]], s{{[0-9]+}}, [[LASTIDX]] + +; GCN-DAG: s_cmpk_lg_i32 s{{[0-9]+}}, 0x7f +; GCN-DAG: s_cselect_b64 [[CCL:[^,]+]], 1, 0 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CCL]] define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i1> %vec, i32 %sel) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -290,10 +290,12 @@ ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt 
lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 +; SI-NEXT: s_cmp_lg_u32 s4, 1 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 +; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -308,10 +310,12 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 +; VI-NEXT: s_cmp_lg_u32 s4, 1 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 +; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -331,13 +335,16 @@ ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s10 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 +; SI-NEXT: s_cmp_lg_u32 s4, 2 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 +; SI-NEXT: s_cmp_lg_u32 s4, 1 ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 +; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v3, s8 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -352,13 +359,16 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s10 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 +; VI-NEXT: s_cmp_lg_u32 s4, 2 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 +; VI-NEXT: s_cmp_lg_u32 s4, 1 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; VI-NEXT: 
s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 +; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s8 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -378,16 +388,20 @@ ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 +; SI-NEXT: s_cmp_lg_u32 s4, 3 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 +; SI-NEXT: s_cmp_lg_u32 s4, 2 ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v1, s10 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 +; SI-NEXT: s_cmp_lg_u32 s4, 1 ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 +; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -402,16 +416,20 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s11 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 +; VI-NEXT: s_cmp_lg_u32 s4, 3 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 +; VI-NEXT: s_cmp_lg_u32 s4, 2 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v1, s10 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 +; VI-NEXT: s_cmp_lg_u32 s4, 1 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 +; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v4, s8 -; 
VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -431,28 +449,36 @@ ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 +; SI-NEXT: s_cmp_lg_u32 s4, 3 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 +; SI-NEXT: s_cmp_lg_u32 s4, 2 ; SI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 +; SI-NEXT: s_cmp_lg_u32 s4, 1 ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 +; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 +; SI-NEXT: s_cmp_lg_u32 s4, 7 ; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7 +; SI-NEXT: s_cmp_lg_u32 s4, 6 ; SI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v5, s14 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6 +; SI-NEXT: s_cmp_lg_u32 s4, 5 ; SI-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5 +; SI-NEXT: s_cmp_lg_u32 s4, 4 ; SI-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc ; SI-NEXT: v_mov_b32_e32 v8, s12 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -468,28 +494,36 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: 
v_cmp_ne_u32_e64 vcc, s4, 3 +; VI-NEXT: s_cmp_lg_u32 s4, 3 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 +; VI-NEXT: s_cmp_lg_u32 s4, 2 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 +; VI-NEXT: s_cmp_lg_u32 s4, 1 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 +; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 +; VI-NEXT: s_cmp_lg_u32 s4, 7 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s15 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7 +; VI-NEXT: s_cmp_lg_u32 s4, 6 ; VI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s14 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6 +; VI-NEXT: s_cmp_lg_u32 s4, 5 ; VI-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s13 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5 +; VI-NEXT: s_cmp_lg_u32 s4, 4 ; VI-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc ; VI-NEXT: v_mov_b32_e32 v8, s12 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -580,10 +614,12 @@ ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 +; SI-NEXT: s_cmp_lg_u32 s4, 1 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 +; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc ; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, 
vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -597,10 +633,12 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 +; VI-NEXT: s_cmp_lg_u32 s4, 1 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 +; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc ; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -619,13 +657,16 @@ ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 +; SI-NEXT: s_cmp_lg_u32 s4, 2 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 +; SI-NEXT: s_cmp_lg_u32 s4, 1 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 +; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc ; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -639,13 +680,16 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 +; VI-NEXT: s_cmp_lg_u32 s4, 2 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 +; VI-NEXT: s_cmp_lg_u32 s4, 1 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 +; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; 
VI-NEXT: s_endpgm @@ -665,17 +709,21 @@ ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 3 +; SI-NEXT: s_cmp_eq_u32 s6, 3 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_cmp_eq_u32 s6, 2 ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 2 +; SI-NEXT: s_cmp_eq_u32 s6, 1 ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 1 +; SI-NEXT: s_cmp_eq_u32 s6, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -689,18 +737,22 @@ ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_eq_u32 s6, 3 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 3 +; VI-NEXT: s_cmp_eq_u32 s6, 2 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 2 +; VI-NEXT: s_cmp_eq_u32 s6, 1 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 1 +; VI-NEXT: s_cmp_eq_u32 s6, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -719,28 +771,36 @@ ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; 
SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 +; SI-NEXT: s_cmp_lg_u32 s4, 3 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 +; SI-NEXT: s_cmp_lg_u32 s4, 2 ; SI-NEXT: v_cndmask_b32_e32 v3, 5, v0, vcc +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 +; SI-NEXT: s_cmp_lg_u32 s4, 1 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 +; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 +; SI-NEXT: s_cmp_lg_u32 s4, 7 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v4, s15 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7 +; SI-NEXT: s_cmp_lg_u32 s4, 6 ; SI-NEXT: v_cndmask_b32_e32 v7, 5, v4, vcc +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v4, s14 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6 +; SI-NEXT: s_cmp_lg_u32 s4, 5 ; SI-NEXT: v_cndmask_b32_e32 v6, 5, v4, vcc +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5 +; SI-NEXT: s_cmp_lg_u32 s4, 4 ; SI-NEXT: v_cndmask_b32_e32 v5, 5, v4, vcc ; SI-NEXT: v_mov_b32_e32 v4, s12 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -755,28 +815,36 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 +; VI-NEXT: s_cmp_lg_u32 s4, 3 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 +; VI-NEXT: s_cmp_lg_u32 s4, 2 ; VI-NEXT: v_cndmask_b32_e32 v3, 5, v0, vcc +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 +; 
VI-NEXT: s_cmp_lg_u32 s4, 1 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 +; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 +; VI-NEXT: s_cmp_lg_u32 s4, 7 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v4, s15 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7 +; VI-NEXT: s_cmp_lg_u32 s4, 6 ; VI-NEXT: v_cndmask_b32_e32 v7, 5, v4, vcc +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v4, s14 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6 +; VI-NEXT: s_cmp_lg_u32 s4, 5 ; VI-NEXT: v_cndmask_b32_e32 v6, 5, v4, vcc +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v4, s13 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5 +; VI-NEXT: s_cmp_lg_u32 s4, 4 ; VI-NEXT: v_cndmask_b32_e32 v5, 5, v4, vcc ; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -1127,96 +1195,112 @@ ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s5, s11, 24 +; SI-NEXT: s_cmp_lg_u32 s4, 15 ; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 15 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: s_lshr_b32 s5, s11, 16 +; SI-NEXT: s_cmp_lg_u32 s4, 14 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 14 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 +; SI-NEXT: s_lshr_b32 s6, s11, 8 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc ; SI-NEXT: s_movk_i32 s5, 0xff +; SI-NEXT: s_cmp_lg_u32 s4, 13 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; SI-NEXT: v_and_b32_e32 v1, s5, v1 -; SI-NEXT: s_lshr_b32 s6, s11, 8 ; 
SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 13 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 +; SI-NEXT: s_cmp_lg_u32 s4, 12 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 12 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; SI-NEXT: v_and_b32_e32 v2, s5, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: s_mov_b32 s6, 0xffff +; SI-NEXT: s_lshr_b32 s7, s10, 24 +; SI-NEXT: s_cmp_lg_u32 s4, 11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v1, s6, v1 -; SI-NEXT: s_lshr_b32 s7, s10, 24 ; SI-NEXT: v_or_b32_e32 v3, v1, v0 ; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 11 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: s_lshr_b32 s7, s10, 16 +; SI-NEXT: s_cmp_lg_u32 s4, 10 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 10 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc +; SI-NEXT: s_lshr_b32 s7, s10, 8 +; SI-NEXT: s_cmp_lg_u32 s4, 9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; SI-NEXT: v_and_b32_e32 v1, s5, v1 -; SI-NEXT: s_lshr_b32 s7, s10, 8 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 9 +; SI-NEXT: s_cmp_lg_u32 s4, 8 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 8 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; SI-NEXT: v_and_b32_e32 v2, s5, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_lshr_b32 s7, s9, 24 +; SI-NEXT: s_cmp_lg_u32 s4, 7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v1, s6, v1 -; SI-NEXT: s_lshr_b32 s7, s9, 24 ; SI-NEXT: v_or_b32_e32 v2, v1, v0 ; SI-NEXT: v_mov_b32_e32 v0, s7 -; 
SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: s_lshr_b32 s7, s9, 16 +; SI-NEXT: s_cmp_lg_u32 s4, 6 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc +; SI-NEXT: s_lshr_b32 s7, s9, 8 +; SI-NEXT: s_cmp_lg_u32 s4, 5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; SI-NEXT: v_and_b32_e32 v1, s5, v1 -; SI-NEXT: s_lshr_b32 s7, s9, 8 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5 +; SI-NEXT: s_cmp_lg_u32 s4, 4 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v4, s9 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; SI-NEXT: v_and_b32_e32 v4, s5, v4 ; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: s_lshr_b32 s7, s8, 24 +; SI-NEXT: s_cmp_lg_u32 s4, 3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v1, s6, v1 -; SI-NEXT: s_lshr_b32 s7, s8, 24 ; SI-NEXT: v_or_b32_e32 v1, v1, v0 ; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: s_lshr_b32 s7, s8, 16 +; SI-NEXT: s_cmp_lg_u32 s4, 2 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc ; SI-NEXT: v_mov_b32_e32 v4, s7 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc +; SI-NEXT: s_lshr_b32 s7, s8, 8 +; SI-NEXT: s_cmp_lg_u32 s4, 1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; SI-NEXT: v_and_b32_e32 v4, s5, v4 -; SI-NEXT: s_lshr_b32 s7, s8, 8 ; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v4, s7 -; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 +; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc ; SI-NEXT: v_mov_b32_e32 v5, s8 -; SI-NEXT: 
v_cmp_ne_u32_e64 vcc, s4, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: v_and_b32_e32 v5, s5, v5 @@ -1236,81 +1320,97 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s11, 24 +; VI-NEXT: s_cmp_lg_u32 s4, 15 ; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 15 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: s_lshr_b32 s5, s11, 16 +; VI-NEXT: s_cmp_lg_u32 s4, 14 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 14 -; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: s_lshr_b32 s5, s11, 8 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc +; VI-NEXT: s_cmp_lg_u32 s4, 13 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 13 +; VI-NEXT: s_cmp_lg_u32 s4, 12 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s11 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 12 -; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc +; VI-NEXT: s_lshr_b32 s5, s10, 24 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s10, 24 +; VI-NEXT: s_cmp_lg_u32 s4, 11 ; VI-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 11 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: s_lshr_b32 s5, s10, 16 +; VI-NEXT: s_cmp_lg_u32 s4, 10 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: 
v_cmp_ne_u32_e64 vcc, s4, 10 -; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: s_lshr_b32 s5, s10, 8 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc +; VI-NEXT: s_cmp_lg_u32 s4, 9 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 9 +; VI-NEXT: s_cmp_lg_u32 s4, 8 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 8 -; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc +; VI-NEXT: s_lshr_b32 s5, s9, 24 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s9, 24 +; VI-NEXT: s_cmp_lg_u32 s4, 7 ; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: s_lshr_b32 s5, s9, 16 +; VI-NEXT: s_cmp_lg_u32 s4, 6 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6 -; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: s_lshr_b32 s5, s9, 8 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc +; VI-NEXT: s_cmp_lg_u32 s4, 5 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5 +; VI-NEXT: s_cmp_lg_u32 s4, 4 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc +; VI-NEXT: s_cselect_b64 
vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v4, s9 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4 -; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc +; VI-NEXT: s_lshr_b32 s5, s8, 24 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s8, 24 +; VI-NEXT: s_cmp_lg_u32 s4, 3 ; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: s_lshr_b32 s5, s8, 16 +; VI-NEXT: s_cmp_lg_u32 s4, 2 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc ; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 -; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: s_lshr_b32 s5, s8, 8 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc +; VI-NEXT: s_cmp_lg_u32 s4, 1 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 +; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc ; VI-NEXT: v_mov_b32_e32 v5, s8 -; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4 ; VI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc ; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -1416,12 +1516,14 @@ ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; SI-NEXT: s_cmp_eq_u32 s4, 1 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: s_cmp_eq_u32 s4, 0 ; SI-NEXT: 
v_cndmask_b32_e64 v2, v0, 0, vcc ; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc @@ -1438,12 +1540,14 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; VI-NEXT: s_cmp_eq_u32 s4, 1 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v0, s10 +; VI-NEXT: s_cmp_eq_u32 s4, 0 ; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc ; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc @@ -1457,19 +1561,21 @@ define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i64: ; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0x8 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 -; SI-NEXT: s_load_dword s6, s[4:5], 0x8 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_cmp_eq_u32 s6, 1 +; SI-NEXT: s_cselect_b64 s[4:5], 1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1 ; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] ; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: s_cmp_eq_u32 s6, 0 ; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5] ; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0 +; SI-NEXT: s_cselect_b64 s[4:5], 1, 0 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5] @@ -1478,19 +1584,21 @@ ; ; VI-LABEL: dynamic_insertelement_v2i64: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s6, s[4:5], 
0x20 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 -; VI-NEXT: s_load_dword s6, s[4:5], 0x20 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_eq_u32 s6, 1 +; VI-NEXT: s_cselect_b64 s[4:5], 1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1 ; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s10 +; VI-NEXT: s_cmp_eq_u32 s6, 0 ; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0 +; VI-NEXT: s_cselect_b64 s[4:5], 1, 0 ; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5] @@ -1504,24 +1612,27 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i64: ; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0x10 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 -; SI-NEXT: s_load_dword s6, s[4:5], 0x10 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_cmp_eq_u32 s6, 2 +; SI-NEXT: s_cselect_b64 s[4:5], 1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 2 ; SI-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5] ; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: s_cmp_eq_u32 s6, 1 ; SI-NEXT: v_cndmask_b32_e64 v4, v0, 5, s[4:5] +; SI-NEXT: s_cselect_b64 s[4:5], 1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1 ; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] ; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: s_cmp_eq_u32 s6, 0 ; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5] ; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0 +; SI-NEXT: s_cselect_b64 s[4:5], 1, 0 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] ; SI-NEXT: 
v_mov_b32_e32 v0, s8 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5] @@ -1531,24 +1642,27 @@ ; ; VI-LABEL: dynamic_insertelement_v3i64: ; VI: ; %bb.0: +; VI-NEXT: s_load_dword s6, s[4:5], 0x40 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; VI-NEXT: s_load_dword s6, s[4:5], 0x40 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_eq_u32 s6, 2 +; VI-NEXT: s_cselect_b64 s[4:5], 1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s13 -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 2 ; VI-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: s_cmp_eq_u32 s6, 1 ; VI-NEXT: v_cndmask_b32_e64 v4, v0, 5, s[4:5] +; VI-NEXT: s_cselect_b64 s[4:5], 1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1 ; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s10 +; VI-NEXT: s_cmp_eq_u32 s6, 0 ; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0 +; VI-NEXT: s_cselect_b64 s[4:5], 1, 0 ; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5] @@ -1571,22 +1685,26 @@ ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; SI-NEXT: s_cmp_eq_u32 s4, 1 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc ; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: s_cmp_eq_u32 s4, 0 ; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc ; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: s_cmp_eq_u32 s4, 3 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 3 ; SI-NEXT: 
v_cndmask_b32_e32 v7, v5, v4, vcc ; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: s_cmp_eq_u32 s4, 2 ; SI-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc ; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 2 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc ; SI-NEXT: v_mov_b32_e32 v4, s12 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc @@ -1604,22 +1722,26 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; VI-NEXT: s_cmp_eq_u32 s4, 1 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v0, s10 +; VI-NEXT: s_cmp_eq_u32 s4, 0 ; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_cmp_eq_u32 s4, 3 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s15 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 3 ; VI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc ; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: s_cmp_eq_u32 s4, 2 ; VI-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc ; VI-NEXT: v_mov_b32_e32 v5, s13 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 2 +; VI-NEXT: s_cselect_b64 vcc, 1, 0 ; VI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc ; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll @@ -86,7 +86,7 @@ } ; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc: -; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} +; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}} ; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void 
@test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind { %cmp = icmp eq i32 %i, 0 @@ -119,7 +119,8 @@ ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} ; SI-DAG: v_cmp_eq_u32_e32 [[CMP0:vcc]], 0, v{{[0-9]+}} -; SI-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}} +; SI-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0{{$}} +; SI-DAG: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1, 0 ; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]] ; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]] ; SI: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll @@ -298,8 +298,10 @@ } ; GCN-LABEL: {{^}}v_icmp_i1_ne0: -; GCN: v_cmp_gt_u32_e64 s[[C0:\[[0-9]+:[0-9]+\]]], -; GCN: v_cmp_gt_u32_e64 s[[C1:\[[0-9]+:[0-9]+\]]], +; GCN: s_cmp_gt_u32 s{{[0-9]+}}, 1 +; GCN: s_cselect_b64 s[[C0:\[[0-9]+:[0-9]+\]]], +; GCN: s_cmp_gt_u32 s{{[0-9]+}}, 2 +; GCN: s_cselect_b64 s[[C1:\[[0-9]+:[0-9]+\]]], ; GCN: s_and_b64 s[[SRC:\[[0-9]+:[0-9]+\]]], s[[C0]], s[[C1]] ; SI-NEXT: s_mov_b32 s{{[0-9]+}}, -1 ; GCN-NEXT: v_mov_b32_e32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll @@ -29,8 +29,10 @@ ; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x4{{$}} ; GFX9: s_lshl_b32 [[APERTURE]], [[APERTURE]], 16 -; GCN: v_mov_b32_e32 [[V_APERTURE:v[0-9]+]], [[APERTURE]] -; GCN: v_cmp_eq_u32_e32 vcc, [[PTR_HI]], [[V_APERTURE]] +; GCN: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]] +; GCN: s_cselect_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN: s_andn2_b64 vcc, exec, [[MASK]] + ; GCN: s_cbranch_vccnz define amdgpu_kernel void @is_private_sgpr(i8* %ptr) { %val = call i1 
@llvm.amdgcn.is.private(i8* %ptr) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll @@ -30,8 +30,9 @@ ; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x1{{$}} ; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x4{{$}} -; GCN: v_mov_b32_e32 [[V_APERTURE:v[0-9]+]], [[APERTURE]] -; GCN: v_cmp_eq_u32_e32 vcc, [[PTR_HI]], [[V_APERTURE]] +; GCN: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]] +; GCN: s_cselect_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN: s_andn2_b64 vcc, exec, [[MASK]] ; GCN: s_cbranch_vccnz define amdgpu_kernel void @is_local_sgpr(i8* %ptr) { %val = call i1 @llvm.amdgcn.is.shared(i8* %ptr) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll @@ -260,7 +260,7 @@ } ; GCN-LABEL: {{^}}test_scc_liveness: -; GCN: v_cmp +; GCN: s_cmp ; GCN: s_and_b64 exec ; GCN: s_cmp ; GCN: s_cbranch_scc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -185,9 +185,10 @@ ; GFX9-NEXT: s_addc_u32 s5, 0, s5 ; GFX9-NEXT: s_add_i32 s1, s8, s7 ; GFX9-NEXT: s_add_i32 s1, s1, s6 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9-NEXT: s_mul_i32 s2, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] @@ -218,10 +219,11 @@ ; SI-NEXT: v_mul_hi_u32 v1, s0, v1 ; SI-NEXT: v_mul_hi_i32 v3, s1, v3 ; SI-NEXT: s_mul_i32 s6, s1, s3 -; SI-NEXT: s_mul_i32 s8, s0, s2 +; SI-NEXT: s_cmp_lt_i32 s1, 0 +; SI-NEXT: s_mul_i32 s1, s0, s2 ; SI-NEXT: 
v_add_i32_e32 v5, vcc, s5, v1 ; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v6, s8 +; SI-NEXT: v_mov_b32_e32 v6, s1 ; SI-NEXT: v_add_i32_e32 v5, vcc, s4, v5 ; SI-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc ; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -231,14 +233,15 @@ ; SI-NEXT: v_add_i32_e32 v4, vcc, s4, v1 ; SI-NEXT: v_subrev_i32_e32 v1, vcc, s2, v2 ; SI-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v3, vcc +; SI-NEXT: s_cselect_b64 vcc, 1, 0 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: v_ashrrev_i32_e32 v0, 31, v4 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s1, 0 ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, v0 ; SI-NEXT: v_subrev_i32_e32 v5, vcc, s0, v2 ; SI-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v3, vcc -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s3, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; SI-NEXT: v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1] @@ -268,7 +271,8 @@ ; GFX9-NEXT: s_addc_u32 s6, 0, s6 ; GFX9-NEXT: s_sub_u32 s9, s4, s2 ; GFX9-NEXT: s_subb_u32 s10, s6, 0 -; GFX9-NEXT: v_cmp_lt_i32_e64 vcc, s1, 0 +; GFX9-NEXT: s_cmp_lt_i32 s1, 0 +; GFX9-NEXT: s_cselect_b64 vcc, 1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc @@ -276,10 +280,11 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s9 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc ; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v2 -; GFX9-NEXT: s_add_i32 s1, s8, s7 ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v0, vcc +; GFX9-NEXT: s_cmp_lt_i32 s3, 0 +; GFX9-NEXT: s_cselect_b64 vcc, 1, 0 +; GFX9-NEXT: s_add_i32 s1, s8, s7 ; GFX9-NEXT: s_add_i32 s1, s1, s5 -; GFX9-NEXT: v_cmp_lt_i32_e64 vcc, s3, 0 ; GFX9-NEXT: s_ashr_i32 s4, s1, 31 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -16,12 +16,14 @@ ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s5 ; SI-NEXT: s_andn2_b64 s[2:3], s[10:11], s[0:1] ; SI-NEXT: s_and_b32 s0, s11, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s5, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 +; SI-NEXT: s_cmp_gt_i32 s5, 51 ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s5, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s5, 51 ; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: s_cselect_b64 s[0:1], 1, 0 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] ; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc @@ -155,12 +157,14 @@ ; SI-NEXT: s_brev_b32 s15, 1 ; SI-NEXT: s_andn2_b64 s[12:13], s[10:11], s[0:1] ; SI-NEXT: s_and_b32 s0, s11, s15 -; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_cmp_lt_i32 s14, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s14, 0 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_cmp_gt_i32 s14, 51 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s14, 51 +; SI-NEXT: s_cselect_b64 s[0:1], 1, 0 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] ; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc @@ -169,23 +173,25 @@ ; SI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] ; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014 ; SI-NEXT: s_add_i32 s7, s0, s7 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s7 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 ; SI-NEXT: s_brev_b32 s10, -2 ; SI-NEXT: v_mov_b32_e32 v6, 0x3ff00000 ; SI-NEXT: v_mov_b32_e32 v4, s11 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s7 ; SI-NEXT: v_bfi_b32 v4, s10, v6, v4 -; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[0:1] ; SI-NEXT: s_and_b32 
s0, s9, s15 +; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v2, 0 +; SI-NEXT: s_cmp_lt_i32 s7, 0 ; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; SI-NEXT: s_cselect_b64 vcc, 1, 0 +; SI-NEXT: s_cmp_gt_i32 s7, 51 ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s7, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s7, 51 ; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: s_cselect_b64 s[0:1], 1, 0 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] ; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc @@ -250,12 +256,14 @@ ; SI-NEXT: s_brev_b32 s20, 1 ; SI-NEXT: s_andn2_b64 s[16:17], s[10:11], s[0:1] ; SI-NEXT: s_and_b32 s0, s11, s20 -; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_cmp_lt_i32 s19, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s17 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s19, 0 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_cmp_gt_i32 s19, 51 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s19, 51 +; SI-NEXT: s_cselect_b64 s[0:1], 1, 0 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc @@ -268,40 +276,44 @@ ; SI-NEXT: s_brev_b32 s16, -2 ; SI-NEXT: v_mov_b32_e32 v12, 0x3ff00000 ; SI-NEXT: v_mov_b32_e32 v4, s11 -; SI-NEXT: v_bfi_b32 v4, s16, v12, v4 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s17 -; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v2, 0 +; SI-NEXT: v_bfi_b32 v4, s16, v12, v4 ; SI-NEXT: s_andn2_b64 s[10:11], s[8:9], s[0:1] ; SI-NEXT: s_and_b32 s0, s9, s20 +; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v2, 0 +; SI-NEXT: s_cmp_lt_i32 s17, 0 ; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s17, 0 +; SI-NEXT: 
v_mov_b32_e32 v1, s0 +; SI-NEXT: s_cmp_gt_i32 s17, 51 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s17, 51 +; SI-NEXT: s_cselect_b64 s[0:1], 1, 0 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] ; SI-NEXT: v_mov_b32_e32 v0, s10 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[0:1] ; SI-NEXT: s_bfe_u32 s0, s15, 0xb0014 ; SI-NEXT: s_add_i32 s10, s0, s18 -; SI-NEXT: v_mov_b32_e32 v6, s9 +; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[0:1] ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s10 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 +; SI-NEXT: v_mov_b32_e32 v6, s9 ; SI-NEXT: s_andn2_b64 s[8:9], s[14:15], s[0:1] -; SI-NEXT: v_bfi_b32 v6, s16, v12, v6 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 ; SI-NEXT: s_and_b32 s0, s15, s20 +; SI-NEXT: v_bfi_b32 v6, s16, v12, v6 +; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: v_cndmask_b32_e32 v9, 0, v6, vcc -; SI-NEXT: v_mov_b32_e32 v5, s0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v4, s9 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s10, 0 +; SI-NEXT: v_mov_b32_e32 v5, s0 +; SI-NEXT: s_cmp_gt_i32 s10, 51 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s10, 51 +; SI-NEXT: s_cselect_b64 s[0:1], 1, 0 ; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[0:1] ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc @@ -309,22 +321,24 @@ ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1] ; SI-NEXT: v_add_f64 v[6:7], s[14:15], -v[4:5] ; SI-NEXT: s_bfe_u32 s0, s13, 0xb0014 -; SI-NEXT: v_mov_b32_e32 v10, s15 ; SI-NEXT: s_add_i32 s8, s0, s18 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 +; SI-NEXT: v_mov_b32_e32 v10, s15 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s8 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 ; SI-NEXT: v_bfi_b32 v10, s16, v12, v10 -; SI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc 
-; SI-NEXT: v_mov_b32_e32 v6, 0 ; SI-NEXT: s_andn2_b64 s[2:3], s[12:13], s[0:1] ; SI-NEXT: s_and_b32 s0, s13, s20 +; SI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc +; SI-NEXT: v_mov_b32_e32 v6, 0 +; SI-NEXT: s_cmp_lt_i32 s8, 0 ; SI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7] -; SI-NEXT: v_mov_b32_e32 v5, s0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 ; SI-NEXT: v_mov_b32_e32 v4, s3 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s8, 0 +; SI-NEXT: v_mov_b32_e32 v5, s0 +; SI-NEXT: s_cmp_gt_i32 s8, 51 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s8, 51 +; SI-NEXT: s_cselect_b64 s[0:1], 1, 0 ; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[0:1] ; SI-NEXT: v_mov_b32_e32 v4, s2 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc @@ -396,194 +410,210 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 { ; SI-LABEL: round_v8f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x19 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_movk_i32 s7, 0xfc01 -; SI-NEXT: s_mov_b32 s5, 0xfffff -; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_load_dwordx16 s[16:31], s[0:1], 0x19 +; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_movk_i32 s15, 0xfc01 +; SI-NEXT: s_mov_b32 s13, 0xfffff +; SI-NEXT: s_mov_b32 s12, s14 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s2, s11, 0xb0014 -; SI-NEXT: s_add_i32 s26, s2, s7 -; SI-NEXT: s_lshr_b64 s[2:3], s[4:5], s26 -; SI-NEXT: s_brev_b32 s27, 1 -; SI-NEXT: s_andn2_b64 s[24:25], s[10:11], s[2:3] -; SI-NEXT: s_and_b32 s2, s11, s27 +; SI-NEXT: s_bfe_u32 s2, s19, 0xb0014 +; SI-NEXT: s_add_i32 s6, s2, s15 +; SI-NEXT: s_lshr_b64 s[2:3], s[12:13], s6 +; SI-NEXT: s_brev_b32 s33, 1 +; SI-NEXT: s_andn2_b64 s[4:5], s[18:19], s[2:3] +; SI-NEXT: s_and_b32 s2, s19, s33 +; SI-NEXT: s_cmp_lt_i32 s6, 0 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_mov_b32_e32 v0, s25 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s26, 0 +; SI-NEXT: 
s_cmp_gt_i32 s6, 51 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s26, 51 +; SI-NEXT: v_mov_b32_e32 v1, s19 +; SI-NEXT: s_cselect_b64 s[2:3], 1, 0 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] -; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] -; SI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] -; SI-NEXT: s_bfe_u32 s2, s9, 0xb0014 -; SI-NEXT: s_add_i32 s25, s2, s7 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], -v[0:1] +; SI-NEXT: s_bfe_u32 s2, s17, 0xb0014 +; SI-NEXT: s_add_i32 s6, s2, s15 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 -; SI-NEXT: s_brev_b32 s24, -2 -; SI-NEXT: v_mov_b32_e32 v18, 0x3ff00000 -; SI-NEXT: v_mov_b32_e32 v4, s11 -; SI-NEXT: v_bfi_b32 v4, s24, v18, v4 -; SI-NEXT: s_lshr_b64 s[2:3], s[4:5], s25 +; SI-NEXT: s_lshr_b64 s[2:3], s[12:13], s6 +; SI-NEXT: s_brev_b32 s34, -2 +; SI-NEXT: v_mov_b32_e32 v14, 0x3ff00000 +; SI-NEXT: v_mov_b32_e32 v4, s19 +; SI-NEXT: v_bfi_b32 v4, s34, v14, v4 +; SI-NEXT: s_andn2_b64 s[4:5], s[16:17], s[2:3] +; SI-NEXT: s_and_b32 s2, s17, s33 ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; SI-NEXT: v_mov_b32_e32 v2, 0 -; SI-NEXT: s_andn2_b64 s[10:11], s[8:9], s[2:3] -; SI-NEXT: s_and_b32 s2, s9, s27 +; SI-NEXT: s_cmp_lt_i32 s6, 0 ; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; SI-NEXT: s_cselect_b64 vcc, 1, 0 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s25, 0 +; SI-NEXT: s_cmp_gt_i32 s6, 51 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s25, 51 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_cselect_b64 s[2:3], 1, 0 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] -; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; 
SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v4, s16 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[2:3] -; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[0:1] -; SI-NEXT: s_bfe_u32 s2, s15, 0xb0014 -; SI-NEXT: v_mov_b32_e32 v6, s9 -; SI-NEXT: s_add_i32 s10, s2, s7 +; SI-NEXT: v_add_f64 v[4:5], s[16:17], -v[0:1] +; SI-NEXT: s_bfe_u32 s2, s23, 0xb0014 +; SI-NEXT: s_add_i32 s6, s2, s15 +; SI-NEXT: v_mov_b32_e32 v6, s17 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; SI-NEXT: v_bfi_b32 v6, s24, v18, v6 -; SI-NEXT: s_lshr_b64 s[2:3], s[4:5], s10 +; SI-NEXT: s_lshr_b64 s[2:3], s[12:13], s6 +; SI-NEXT: v_bfi_b32 v6, s34, v14, v6 +; SI-NEXT: s_andn2_b64 s[4:5], s[22:23], s[2:3] +; SI-NEXT: s_and_b32 s2, s23, s33 ; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc ; SI-NEXT: v_mov_b32_e32 v4, 0 -; SI-NEXT: s_andn2_b64 s[8:9], s[14:15], s[2:3] -; SI-NEXT: s_and_b32 s2, s15, s27 +; SI-NEXT: s_cmp_lt_i32 s6, 0 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] +; SI-NEXT: s_cselect_b64 vcc, 1, 0 +; SI-NEXT: v_mov_b32_e32 v4, s5 ; SI-NEXT: v_mov_b32_e32 v5, s2 -; SI-NEXT: v_mov_b32_e32 v4, s9 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s10, 0 +; SI-NEXT: s_cmp_gt_i32 s6, 51 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s10, 51 +; SI-NEXT: v_mov_b32_e32 v5, s23 +; SI-NEXT: s_cselect_b64 s[2:3], 1, 0 ; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[2:3] -; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v6, s22 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[2:3] -; SI-NEXT: v_add_f64 v[6:7], s[14:15], -v[4:5] -; SI-NEXT: s_bfe_u32 s2, s13, 0xb0014 -; SI-NEXT: v_mov_b32_e32 v8, s15 -; SI-NEXT: s_add_i32 s10, s2, s7 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], -v[4:5] +; SI-NEXT: s_bfe_u32 s2, s21, 0xb0014 +; SI-NEXT: s_add_i32 s6, s2, s15 +; SI-NEXT: v_mov_b32_e32 v8, s23 ; SI-NEXT: 
v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 -; SI-NEXT: v_bfi_b32 v8, s24, v18, v8 -; SI-NEXT: s_lshr_b64 s[2:3], s[4:5], s10 +; SI-NEXT: s_lshr_b64 s[2:3], s[12:13], s6 +; SI-NEXT: v_bfi_b32 v8, s34, v14, v8 +; SI-NEXT: s_andn2_b64 s[4:5], s[20:21], s[2:3] +; SI-NEXT: s_and_b32 s2, s21, s33 ; SI-NEXT: v_cndmask_b32_e32 v7, 0, v8, vcc ; SI-NEXT: v_mov_b32_e32 v6, 0 -; SI-NEXT: s_andn2_b64 s[8:9], s[12:13], s[2:3] -; SI-NEXT: s_and_b32 s2, s13, s27 +; SI-NEXT: s_cmp_lt_i32 s6, 0 ; SI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7] +; SI-NEXT: s_cselect_b64 vcc, 1, 0 +; SI-NEXT: v_mov_b32_e32 v4, s5 ; SI-NEXT: v_mov_b32_e32 v5, s2 -; SI-NEXT: v_mov_b32_e32 v4, s9 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s10, 0 +; SI-NEXT: s_cmp_gt_i32 s6, 51 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s10, 51 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_cselect_b64 s[2:3], 1, 0 ; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[2:3] -; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v8, s20 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[2:3] -; SI-NEXT: v_add_f64 v[8:9], s[12:13], -v[4:5] -; SI-NEXT: s_bfe_u32 s2, s19, 0xb0014 -; SI-NEXT: v_mov_b32_e32 v10, s13 -; SI-NEXT: s_add_i32 s10, s2, s7 +; SI-NEXT: s_bfe_u32 s2, s27, 0xb0014 +; SI-NEXT: s_add_i32 s4, s2, s15 +; SI-NEXT: v_add_f64 v[8:9], s[20:21], -v[4:5] +; SI-NEXT: s_lshr_b64 s[2:3], s[12:13], s4 +; SI-NEXT: v_mov_b32_e32 v10, s21 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[8:9]|, 0.5 -; SI-NEXT: v_bfi_b32 v10, s24, v18, v10 -; SI-NEXT: s_lshr_b64 s[2:3], s[4:5], s10 +; SI-NEXT: s_andn2_b64 s[16:17], s[26:27], s[2:3] +; SI-NEXT: s_and_b32 s2, s27, s33 +; SI-NEXT: v_bfi_b32 v10, s34, v14, v10 +; SI-NEXT: s_cmp_lt_i32 s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v9, 0, v10, vcc ; SI-NEXT: v_mov_b32_e32 v8, 0 -; SI-NEXT: s_andn2_b64 s[8:9], s[18:19], s[2:3] -; SI-NEXT: s_and_b32 s2, s19, 
s27 +; SI-NEXT: s_cselect_b64 vcc, 1, 0 +; SI-NEXT: s_cmp_gt_i32 s4, 51 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], v[8:9] ; SI-NEXT: v_mov_b32_e32 v9, s2 -; SI-NEXT: v_mov_b32_e32 v8, s9 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s10, 0 -; SI-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; SI-NEXT: v_mov_b32_e32 v9, s19 -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s10, 51 -; SI-NEXT: v_cndmask_b32_e64 v13, v8, v9, s[2:3] -; SI-NEXT: v_mov_b32_e32 v8, s8 -; SI-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc -; SI-NEXT: v_mov_b32_e32 v9, s18 -; SI-NEXT: v_cndmask_b32_e64 v12, v8, v9, s[2:3] -; SI-NEXT: s_bfe_u32 s2, s17, 0xb0014 -; SI-NEXT: s_add_i32 s12, s2, s7 -; SI-NEXT: s_lshr_b64 s[2:3], s[4:5], s12 -; SI-NEXT: s_andn2_b64 s[8:9], s[16:17], s[2:3] -; SI-NEXT: s_bfe_u32 s2, s23, 0xb0014 -; SI-NEXT: s_add_i32 s14, s2, s7 -; SI-NEXT: s_lshr_b64 s[2:3], s[4:5], s14 +; SI-NEXT: s_cselect_b64 s[2:3], 1, 0 +; SI-NEXT: s_bfe_u32 s4, s25, 0xb0014 +; SI-NEXT: s_add_i32 s6, s4, s15 +; SI-NEXT: s_lshr_b64 s[4:5], s[12:13], s6 +; SI-NEXT: s_andn2_b64 s[18:19], s[24:25], s[4:5] +; SI-NEXT: s_and_b32 s4, s25, s33 +; SI-NEXT: v_mov_b32_e32 v8, s17 +; SI-NEXT: s_cmp_lt_i32 s6, 0 +; SI-NEXT: v_cndmask_b32_e32 v15, v8, v9, vcc +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_cselect_b64 s[4:5], 1, 0 +; SI-NEXT: s_cmp_gt_i32 s6, 51 +; SI-NEXT: s_cselect_b64 s[6:7], 1, 0 +; SI-NEXT: s_bfe_u32 s8, s31, 0xb0014 +; SI-NEXT: s_add_i32 s17, s8, s15 +; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], s17 +; SI-NEXT: s_andn2_b64 s[10:11], s[30:31], s[8:9] +; SI-NEXT: s_and_b32 s8, s31, s33 ; SI-NEXT: v_mov_b32_e32 v8, s19 -; SI-NEXT: s_andn2_b64 s[10:11], s[22:23], s[2:3] -; SI-NEXT: s_and_b32 s2, s23, s27 -; SI-NEXT: v_bfi_b32 v19, s24, v18, v8 -; SI-NEXT: v_mov_b32_e32 v9, s2 +; SI-NEXT: s_cmp_lt_i32 s17, 0 +; SI-NEXT: v_cndmask_b32_e64 v17, v8, v9, s[4:5] +; SI-NEXT: v_mov_b32_e32 v9, s8 +; SI-NEXT: s_cselect_b64 s[8:9], 1, 0 ; SI-NEXT: v_mov_b32_e32 v8, s11 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s14, 0 -; SI-NEXT: v_cndmask_b32_e32 v8, v8, 
v9, vcc -; SI-NEXT: v_mov_b32_e32 v9, s23 -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s14, 51 -; SI-NEXT: v_cndmask_b32_e64 v9, v8, v9, s[2:3] -; SI-NEXT: v_mov_b32_e32 v8, s10 -; SI-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc -; SI-NEXT: v_mov_b32_e32 v10, s22 -; SI-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[2:3] -; SI-NEXT: s_bfe_u32 s2, s21, 0xb0014 -; SI-NEXT: s_add_i32 s7, s2, s7 -; SI-NEXT: s_lshr_b64 s[2:3], s[4:5], s7 -; SI-NEXT: s_andn2_b64 s[4:5], s[20:21], s[2:3] -; SI-NEXT: s_and_b32 s2, s21, s27 -; SI-NEXT: v_mov_b32_e32 v11, s2 -; SI-NEXT: v_mov_b32_e32 v10, s5 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s7, 0 -; SI-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s7, 51 -; SI-NEXT: v_mov_b32_e32 v11, s21 -; SI-NEXT: v_cndmask_b32_e64 v15, v10, v11, s[2:3] -; SI-NEXT: v_mov_b32_e32 v10, s4 -; SI-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc -; SI-NEXT: v_mov_b32_e32 v11, s20 -; SI-NEXT: v_cndmask_b32_e64 v14, v10, v11, s[2:3] -; SI-NEXT: v_add_f64 v[10:11], s[20:21], -v[14:15] -; SI-NEXT: v_mov_b32_e32 v17, s23 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[10:11]|, 0.5 -; SI-NEXT: v_add_f64 v[10:11], s[22:23], -v[8:9] -; SI-NEXT: v_mov_b32_e32 v16, s21 -; SI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[10:11]|, 0.5 -; SI-NEXT: v_bfi_b32 v17, s24, v18, v17 -; SI-NEXT: v_cndmask_b32_e64 v11, 0, v17, s[2:3] +; SI-NEXT: s_cmp_gt_i32 s17, 51 +; SI-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[8:9] +; SI-NEXT: v_mov_b32_e32 v10, s10 +; SI-NEXT: v_mov_b32_e32 v9, s31 +; SI-NEXT: s_cselect_b64 s[10:11], 1, 0 +; SI-NEXT: v_cndmask_b32_e64 v9, v8, v9, s[10:11] +; SI-NEXT: v_cndmask_b32_e64 v8, v10, 0, s[8:9] +; SI-NEXT: v_mov_b32_e32 v10, s30 +; SI-NEXT: s_bfe_u32 s8, s29, 0xb0014 +; SI-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[10:11] +; SI-NEXT: s_add_i32 s10, s8, s15 +; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], s10 +; SI-NEXT: s_andn2_b64 s[12:13], s[28:29], s[8:9] +; SI-NEXT: s_and_b32 s8, s29, s33 +; SI-NEXT: s_cmp_lt_i32 s10, 0 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: s_cselect_b64 
s[8:9], 1, 0 +; SI-NEXT: v_mov_b32_e32 v10, s13 +; SI-NEXT: s_cmp_gt_i32 s10, 51 +; SI-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[8:9] +; SI-NEXT: v_mov_b32_e32 v11, s29 +; SI-NEXT: s_cselect_b64 s[10:11], 1, 0 +; SI-NEXT: v_cndmask_b32_e64 v13, v10, v11, s[10:11] +; SI-NEXT: v_mov_b32_e32 v10, s12 +; SI-NEXT: v_cndmask_b32_e64 v10, v10, 0, s[8:9] +; SI-NEXT: v_mov_b32_e32 v11, s28 +; SI-NEXT: v_cndmask_b32_e64 v12, v10, v11, s[10:11] +; SI-NEXT: v_add_f64 v[10:11], s[28:29], -v[12:13] +; SI-NEXT: v_mov_b32_e32 v19, s29 +; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[10:11]|, 0.5 +; SI-NEXT: v_mov_b32_e32 v10, s31 +; SI-NEXT: v_bfi_b32 v20, s34, v14, v10 +; SI-NEXT: v_add_f64 v[10:11], s[30:31], -v[8:9] +; SI-NEXT: v_bfi_b32 v19, s34, v14, v19 +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[10:11]|, 0.5 ; SI-NEXT: v_mov_b32_e32 v10, 0 -; SI-NEXT: v_bfi_b32 v16, s24, v18, v16 +; SI-NEXT: v_cndmask_b32_e64 v11, 0, v20, s[10:11] ; SI-NEXT: v_add_f64 v[10:11], v[8:9], v[10:11] -; SI-NEXT: v_cndmask_b32_e32 v9, 0, v16, vcc +; SI-NEXT: v_cndmask_b32_e64 v9, 0, v19, s[8:9] ; SI-NEXT: v_mov_b32_e32 v8, 0 -; SI-NEXT: s_and_b32 s13, s17, s27 -; SI-NEXT: v_add_f64 v[8:9], v[14:15], v[8:9] -; SI-NEXT: v_mov_b32_e32 v14, s9 -; SI-NEXT: v_mov_b32_e32 v15, s13 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s12, 0 -; SI-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; SI-NEXT: v_mov_b32_e32 v15, s17 -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s12, 51 -; SI-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[2:3] -; SI-NEXT: v_mov_b32_e32 v14, s8 -; SI-NEXT: v_cndmask_b32_e64 v14, v14, 0, vcc -; SI-NEXT: v_mov_b32_e32 v15, s16 -; SI-NEXT: v_cndmask_b32_e64 v16, v14, v15, s[2:3] -; SI-NEXT: v_mov_b32_e32 v14, s17 -; SI-NEXT: v_bfi_b32 v18, s24, v18, v14 -; SI-NEXT: v_add_f64 v[14:15], s[16:17], -v[16:17] -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: v_add_f64 v[8:9], v[12:13], v[8:9] +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_mov_b32_e32 v16, s27 +; SI-NEXT: v_cndmask_b32_e64 v13, v15, v16, s[2:3] +; SI-NEXT: 
v_cndmask_b32_e64 v12, v12, 0, vcc +; SI-NEXT: v_mov_b32_e32 v15, s26 +; SI-NEXT: v_cndmask_b32_e64 v12, v12, v15, s[2:3] +; SI-NEXT: v_mov_b32_e32 v15, s27 +; SI-NEXT: v_bfi_b32 v19, s34, v14, v15 +; SI-NEXT: v_mov_b32_e32 v15, s18 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_cndmask_b32_e64 v15, v15, 0, s[4:5] +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_cndmask_b32_e64 v16, v15, v16, s[6:7] +; SI-NEXT: v_cndmask_b32_e64 v17, v17, v18, s[6:7] +; SI-NEXT: v_mov_b32_e32 v15, s25 +; SI-NEXT: v_bfi_b32 v18, s34, v14, v15 +; SI-NEXT: v_add_f64 v[14:15], s[24:25], -v[16:17] +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5 -; SI-NEXT: v_add_f64 v[14:15], s[18:19], -v[12:13] -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: v_add_f64 v[14:15], s[26:27], -v[12:13] +; SI-NEXT: s_mov_b32 s15, 0xf000 ; SI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5 ; SI-NEXT: v_mov_b32_e32 v14, 0 ; SI-NEXT: v_cndmask_b32_e64 v15, 0, v19, s[0:1] @@ -592,10 +622,10 @@ ; SI-NEXT: v_mov_b32_e32 v12, 0 ; SI-NEXT: v_add_f64 v[12:13], v[16:17], v[12:13] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48 -; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:32 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[12:15], 0 offset:48 +; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[12:15], 0 offset:32 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: round_v8f64: diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll --- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll @@ -8,7 +8,9 @@ ; GCN: s_load_dwordx2 ; GCN: s_load_dwordx2 -; GCN: v_cmp_eq_u32 +; GCN: s_cmp_eq_u32 +; 
GCN: s_cselect_b64 + ; GCN: v_cndmask_b32 ; GCN: v_cndmask_b32 diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll --- a/llvm/test/CodeGen/AMDGPU/loop_break.ll +++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll @@ -206,33 +206,33 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 -; GCN-NEXT: s_mov_b32 s2, lds@abs32@lo -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-NEXT: ; implicit-def: $sgpr3 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7 +; GCN-NEXT: ; implicit-def: $sgpr4 ; GCN-NEXT: BB2_1: ; %bb1 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_cmp_ne_u32_e64 s[8:9], s2, 4 -; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_cmp_lg_u32 lds@abs32@lo, 4 +; GCN-NEXT: s_cselect_b64 s[8:9], 1, 0 +; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec ; GCN-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GCN-NEXT: s_cmp_gt_i32 s3, -1 +; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GCN-NEXT: s_cmp_gt_i32 s4, -1 ; GCN-NEXT: s_cbranch_scc1 BB2_3 ; GCN-NEXT: ; %bb.2: ; %bb4 ; GCN-NEXT: ; in Loop: Header=BB2_1 Depth=1 -; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 -; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec ; GCN-NEXT: s_and_b64 s[8:9], vcc, exec -; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GCN-NEXT: BB2_3: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB2_1 Depth=1 -; GCN-NEXT: s_add_i32 s3, s3, 1 -; GCN-NEXT: s_and_b64 s[8:9], exec, s[4:5] +; GCN-NEXT: s_add_i32 s4, s4, 1 +; GCN-NEXT: s_and_b64 s[8:9], exec, s[6:7] ; GCN-NEXT: 
s_or_b64 s[0:1], s[8:9], s[0:1] ; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN-NEXT: s_cbranch_execnz BB2_1 diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -384,7 +384,7 @@ ; FUNC-LABEL: @v_test_umin_ult_i32_multi_use ; SI-NOT: v_min -; GCN: v_cmp_lt_u32 +; GCN: s_cmp_lt_u32 ; SI-NOT: v_min ; SI: v_cndmask_b32 ; SI-NOT: v_min diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll --- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll +++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll @@ -35,7 +35,10 @@ } ; GCN-LABEL: {{^}}negated_cond_dominated_blocks: -; GCN: v_cmp_ne_u32_e64 [[CC1:[^,]+]], + + +; GCN: s_cmp_lg_u32 +; GCN: s_cselect_b64 [[CC1:[^,]+]], 1, 0 ; GCN: s_branch [[BB1:BB[0-9]+_[0-9]+]] ; GCN: [[BB0:BB[0-9]+_[0-9]+]] ; GCN-NOT: v_cndmask_b32 diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -262,7 +262,7 @@ } ; FUNC-LABEL: {{^}}s_or_i1: -; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] +; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] define amdgpu_kernel void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { %cmp0 = icmp eq i32 %a, %b %cmp1 = icmp eq i32 %c, %d diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll --- a/llvm/test/CodeGen/AMDGPU/sad.ll +++ b/llvm/test/CodeGen/AMDGPU/sad.ll @@ -133,8 +133,9 @@ } ; GCN-LABEL: {{^}}v_sad_u32_multi_use_select_pat2: +; GCN-DAG: s_cmp_gt_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cselect_b64 vcc, 1, 0 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GCN-DAG: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GCN-DAG: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* 
%out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b @@ -254,7 +255,7 @@ ; GCN-LABEL: {{^}}s_sad_u32_i8_pat2: ; GCN: s_load_dword -; GCN: s_bfe_u32 +; GCN-DAG: s_bfe_u32 ; GCN-DAG: s_sub_i32 ; GCN-DAG: s_and_b32 ; GCN-DAG: s_sub_i32 @@ -273,8 +274,9 @@ } ; GCN-LABEL: {{^}}v_sad_u32_mismatched_operands_pat1: -; GCN: v_cmp_le_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN: s_cmp_le_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN: s_cselect_b64 vcc, 1, 0 ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -93,14 +93,15 @@ ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: v_cmp_lt_i32_e64 s[10:11], s9, 0 -; SI-NEXT: s_add_i32 s9, s8, s9 -; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: s_add_i32 s12, s8, s9 +; SI-NEXT: s_cmp_lt_i32 s9, 0 +; SI-NEXT: s_cselect_b64 s[10:11], 1, 0 +; SI-NEXT: s_cmp_lt_i32 s12, s8 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, s9, v0 -; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: s_cselect_b64 s[8:9], 1, 0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_xor_b64 s[0:1], s[10:11], vcc +; SI-NEXT: s_xor_b64 s[0:1], s[10:11], s[8:9] ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s2 @@ -116,13 +117,14 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s1, 0 -; VI-NEXT: s_add_i32 s1, s0, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_cmp_lt_i32_e32 vcc, s1, v4 -; VI-NEXT: v_mov_b32_e32 v4, s1 +; 
VI-NEXT: s_add_i32 s4, s0, s1 +; VI-NEXT: s_cmp_lt_i32 s1, 0 +; VI-NEXT: s_cselect_b64 s[2:3], 1, 0 +; VI-NEXT: s_cmp_lt_i32 s4, s0 +; VI-NEXT: s_cselect_b64 s[0:1], 1, 0 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] ; VI-NEXT: flat_store_dword v[0:1], v4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -136,13 +138,14 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], s1, 0 -; GFX9-NEXT: s_add_i32 s1, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, s1, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: s_add_i32 s4, s0, s1 +; GFX9-NEXT: s_cmp_lt_i32 s1, 0 +; GFX9-NEXT: s_cselect_b64 s[2:3], 1, 0 +; GFX9-NEXT: s_cmp_lt_i32 s4, s0 +; GFX9-NEXT: s_cselect_b64 s[0:1], 1, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] ; GFX9-NEXT: global_store_dword v[0:1], v4, off ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -155,27 +155,29 @@ ; GCN-IR-NEXT: s_subb_u32 s11, s7, s2 ; GCN-IR-NEXT: s_xor_b64 s[0:1], s[8:9], s[0:1] ; GCN-IR-NEXT: s_sub_u32 s6, s0, s8 -; GCN-IR-NEXT: s_flbit_i32_b32 s14, s6 ; GCN-IR-NEXT: s_subb_u32 s7, s1, s8 -; GCN-IR-NEXT: s_add_i32 s14, s14, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s15, s7 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s14 -; GCN-IR-NEXT: s_flbit_i32_b32 s14, s10 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s15 -; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 -; GCN-IR-NEXT: s_add_i32 s14, s14, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s15, s11 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], 0 +; GCN-IR-NEXT: 
v_cmp_eq_u64_e64 s[12:13], s[6:7], 0 +; GCN-IR-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s6 +; GCN-IR-NEXT: s_add_i32 s12, s12, 32 +; GCN-IR-NEXT: s_cmp_eq_u32 s7, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s12 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s10 +; GCN-IR-NEXT: s_flbit_i32_b32 s13, s7 +; GCN-IR-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-IR-NEXT: s_add_i32 s12, s12, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s13 +; GCN-IR-NEXT: s_flbit_i32_b32 s13, s11 +; GCN-IR-NEXT: s_cmp_eq_u32 s11, 0 ; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v0, s15 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s14 -; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s11, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s13 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s12 +; GCN-IR-NEXT: s_cselect_b64 vcc, 1, 0 ; GCN-IR-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v2, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v1, s[14:15], 0, 0, vcc -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[10:11], 0 +; GCN-IR-NEXT: v_subb_u32_e64 v1, s[12:13], 0, 0, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[0:1] -; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] ; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[0:1] ; GCN-IR-NEXT: s_xor_b64 s[12:13], s[0:1], -1 @@ -1007,51 +1009,55 @@ ; ; GCN-IR-LABEL: s_test_sdiv24_48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xc -; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xd -; GCN-IR-NEXT: s_load_dword s0, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xd +; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sext_i32_i16 s3, s3 -; GCN-IR-NEXT: s_sext_i32_i16 s7, s0 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 24 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 24 ; GCN-IR-NEXT: s_ashr_i32 s2, 
s3, 31 +; GCN-IR-NEXT: s_sext_i32_i16 s5, s5 ; GCN-IR-NEXT: s_mov_b32 s3, s2 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[6:7], 24 -; GCN-IR-NEXT: s_ashr_i32 s6, s7, 31 -; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] -; GCN-IR-NEXT: s_sub_u32 s10, s0, s2 -; GCN-IR-NEXT: s_mov_b32 s7, s6 -; GCN-IR-NEXT: s_subb_u32 s11, s1, s2 -; GCN-IR-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s8, s0, s6 -; GCN-IR-NEXT: s_flbit_i32_b32 s14, s8 -; GCN-IR-NEXT: s_subb_u32 s9, s1, s6 -; GCN-IR-NEXT: s_add_i32 s14, s14, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s15, s9 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s14 -; GCN-IR-NEXT: s_flbit_i32_b32 s14, s10 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s15 -; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0 -; GCN-IR-NEXT: s_add_i32 s14, s14, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s15, s11 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 24 +; GCN-IR-NEXT: s_ashr_i32 s4, s5, 31 +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[2:3], s[6:7] +; GCN-IR-NEXT: s_sub_u32 s10, s6, s2 +; GCN-IR-NEXT: s_mov_b32 s5, s4 +; GCN-IR-NEXT: s_subb_u32 s11, s7, s2 +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], s[8:9] +; GCN-IR-NEXT: s_sub_u32 s6, s6, s4 +; GCN-IR-NEXT: s_subb_u32 s7, s7, s4 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[10:11], 0 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[8:9], s[12:13] +; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 +; GCN-IR-NEXT: s_add_i32 s8, s8, 32 +; GCN-IR-NEXT: s_cmp_eq_u32 s7, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s8 +; GCN-IR-NEXT: s_flbit_i32_b32 s8, s10 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7 +; GCN-IR-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-IR-NEXT: s_add_i32 s8, s8, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s9 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s11 +; GCN-IR-NEXT: s_cmp_eq_u32 s11, 0 ; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v0, s15 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s14 -; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s11, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s9 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s8 +; 
GCN-IR-NEXT: s_cselect_b64 vcc, 1, 0 ; GCN-IR-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v2, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v1, s[14:15], 0, 0, vcc -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[8:9], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[10:11], 0 +; GCN-IR-NEXT: v_subb_u32_e64 v1, s[8:9], 0, 0, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[0:1] -; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] -; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; GCN-IR-NEXT: s_or_b64 s[0:1], s[12:13], vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[0:1] ; GCN-IR-NEXT: s_xor_b64 s[12:13], s[0:1], -1 ; GCN-IR-NEXT: s_and_b64 s[12:13], s[12:13], vcc ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[12:13] +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b64 vcc, vcc ; GCN-IR-NEXT: s_cbranch_vccz BB9_4 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v0 @@ -1064,10 +1070,10 @@ ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_not_b32_e32 v2, v2 ; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[10:11], v4 -; GCN-IR-NEXT: s_add_u32 s10, s8, -1 +; GCN-IR-NEXT: s_add_u32 s10, s6, -1 ; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v2, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 -; GCN-IR-NEXT: s_addc_u32 s11, s9, -1 +; GCN-IR-NEXT: s_addc_u32 s11, s7, -1 ; GCN-IR-NEXT: v_addc_u32_e64 v5, s[0:1], -1, 0, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 @@ -1082,9 +1088,9 @@ ; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s10, v6 ; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc ; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s8, v8 +; GCN-IR-NEXT: v_and_b32_e32 v10, s6, v8 ; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s9, v8 +; GCN-IR-NEXT: v_and_b32_e32 v11, s7, v8 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc @@ -1112,16 +1118,16 @@ ; 
GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 ; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 ; GCN-IR-NEXT: BB9_7: ; %Flow4 -; GCN-IR-NEXT: s_xor_b64 s[0:1], s[6:7], s[2:3] +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3] ; GCN-IR-NEXT: v_xor_b32_e32 v0, s0, v0 ; GCN-IR-NEXT: v_xor_b32_e32 v1, s1, v1 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s1 ; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 -; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IR-NEXT: s_mov_b32 s11, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s10, -1 +; GCN-IR-NEXT: buffer_store_short v1, off, s[8:11], 0 offset:4 +; GCN-IR-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i48 %x, 24 %2 = ashr i48 %y, 24 @@ -1263,13 +1269,14 @@ ; GCN-IR-NEXT: s_mov_b32 s3, s2 ; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[6:7] ; GCN-IR-NEXT: s_sub_u32 s6, s0, s2 -; GCN-IR-NEXT: s_subb_u32 s7, s1, s2 ; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 +; GCN-IR-NEXT: s_subb_u32 s7, s1, s2 ; GCN-IR-NEXT: s_add_i32 s8, s8, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7 +; GCN-IR-NEXT: s_cmp_eq_u32 s7, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s9 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s8 -; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: s_cselect_b64 vcc, 1, 0 ; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc5, v2 ; GCN-IR-NEXT: v_addc_u32_e64 v1, s[8:9], 0, -1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll @@ -166,7 +166,8 @@ ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] -; GCN-DAG: v_cmp_ne_u32_e64 [[VCC:.*]], s{{[0-9]+}}, 0 +; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0 +; GCN: s_cselect_b64 
[[VCC:.*]], 1, 0 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, |[[X]]|, [[VCC]] ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @add_select_negk_fabs_f32(i32 %c) #0 { @@ -185,7 +186,8 @@ ; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]] ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xc4800000 -; GCN-DAG: v_cmp_ne_u32_e64 [[VCC:.*]], s{{[0-9]+}}, 0 +; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0 +; GCN: s_cselect_b64 [[VCC:.*]], 1, 0 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], |[[X]]|, [[VCC]] ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @add_select_negliteralk_fabs_f32(i32 %c) #0 { @@ -220,8 +222,8 @@ ; GCN-LABEL: {{^}}add_select_posk_fabs_f32: ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] - -; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 +; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0 +; GCN: s_cselect_b64 vcc, 1, 0 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]] define amdgpu_kernel void @add_select_posk_fabs_f32(i32 %c) #0 { @@ -407,7 +409,8 @@ ; GCN-LABEL: {{^}}add_select_negk_negk_f32: ; GCN: buffer_load_dword [[X:v[0-9]+]] -; GCN: v_cmp_eq_u32_e64 +; GCN: s_cmp_eq_u32 +; GCN: s_cselect_b64 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]] define amdgpu_kernel void @add_select_negk_negk_f32(i32 %c) #0 { @@ -424,7 +427,8 @@ ; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc5800000 ; GCN-DAG: buffer_load_dword [[X:v[0-9]+]] -; GCN: v_cmp_eq_u32_e64 +; GCN: s_cmp_eq_u32 +; GCN: s_cselect_b64 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K1]], [[K0]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]] define amdgpu_kernel void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 { @@ -455,7 +459,8 @@ ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] -; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 +; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0 +; GCN: 
s_cselect_b64 vcc, 1, 0 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] define amdgpu_kernel void @add_select_negk_fneg_f32(i32 %c) #0 { @@ -490,7 +495,8 @@ ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] -; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 +; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0 +; GCN: s_cselect_b64 vcc, 1, 0 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] define amdgpu_kernel void @add_select_posk_fneg_f32(i32 %c) #0 { @@ -632,7 +638,8 @@ ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] -; GCN-DAG: v_cmp_eq_u32_e64 [[VCC:.*]], s{{[0-9]+}}, 0 +; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0 +; GCN: s_cselect_b64 [[VCC:.*]], 1, 0 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -4.0, |[[X]]|, [[VCC]] ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]] define amdgpu_kernel void @mul_select_negfabs_posk_f32(i32 %c) #0 { @@ -651,7 +658,8 @@ ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] -; GCN-DAG: v_cmp_ne_u32_e64 [[VCC:.*]], s{{[0-9]+}}, 0 +; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0 +; GCN: s_cselect_b64 [[VCC:.*]], 1, 0 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -4.0, |[[X]]|, [[VCC]] ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]] define amdgpu_kernel void @mul_select_posk_negfabs_f32(i32 %c) #0 { @@ -688,7 +696,8 @@ ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] -; GCN: v_cmp_ne_u32_e64 vcc +; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0 +; GCN: s_cselect_b64 vcc, 1, 0 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]] define amdgpu_kernel void @mul_select_negk_negfabs_f32(i32 %c) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/select-opt.ll b/llvm/test/CodeGen/AMDGPU/select-opt.ll --- a/llvm/test/CodeGen/AMDGPU/select-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/select-opt.ll @@ -5,9 
+5,11 @@ ; scalar compares, we don't want to use multiple condition registers. ; GCN-LABEL: {{^}}opt_select_i32_and_cmp_i32: -; GCN-DAG: v_cmp_ne_u32_e32 vcc, -; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]] -; GCN: s_and_b64 vcc, vcc, [[CMP1]] +; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cselect_b64 [[CMP2:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN: s_and_b64 vcc, [[CMP1]], [[CMP2]] ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] @@ -37,9 +39,11 @@ } ; GCN-LABEL: {{^}}opt_select_i64_and_cmp_i32: -; GCN-DAG: v_cmp_ne_u32_e32 vcc, -; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]] -; GCN: s_and_b64 vcc, vcc, [[CMP1]] +; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cselect_b64 [[CMP2:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN: s_and_b64 vcc, [[CMP1]], [[CMP2]] ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}} @@ -69,9 +73,11 @@ } ; GCN-LABEL: {{^}}opt_select_i32_or_cmp_i32: -; GCN-DAG: v_cmp_ne_u32_e32 vcc, -; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]] -; GCN: s_or_b64 vcc, vcc, [[CMP1]] +; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cselect_b64 [[CMP2:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN: s_or_b64 vcc, [[CMP1]], [[CMP2]] ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] @@ -102,9 +108,11 @@ } ; GCN-LABEL: {{^}}opt_select_i64_or_cmp_i32: -; GCN-DAG: 
v_cmp_ne_u32_e32 vcc, -; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]] -; GCN: s_or_b64 vcc, vcc, [[CMP1]] +; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cselect_b64 [[CMP2:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN: s_or_b64 vcc, [[CMP1]], [[CMP2]] ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}} diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll --- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll +++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll @@ -180,7 +180,8 @@ ; GCN-LABEL: {{^}}v_select_v4i32: ; GCN: buffer_load_dwordx4 -; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32 +; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 32 +; GCN: s_cselect_b64 vcc, 1, 0 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} @@ -218,8 +219,8 @@ ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]] ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]] ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]] -; GCN-DAG: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} - +; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}} +; GCN-DAG: s_cselect_b64 vcc, 1, 0 ; GCN-DAG: v_cndmask_b32_e32 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]] ; GCN-DAG: v_cndmask_b32_e32 @@ -232,7 +233,8 @@ } ; GCN-LABEL: {{^}}s_select_v3f32: -; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} +; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}} +; GCN-DAG: s_cselect_b64 vcc, 1, 0 ; GCN: v_cndmask_b32_e32 ; GCN: v_cndmask_b32_e32 @@ -249,7 +251,8 @@ ; GCN-LABEL: {{^}}s_select_v4f32: ; GCN: s_load_dwordx4 ; GCN: s_load_dwordx4 -; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} +; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}} +; GCN-DAG: 
s_cselect_b64 vcc, 1, 0 ; GCN: v_cndmask_b32_e32 ; GCN: v_cndmask_b32_e32 @@ -266,7 +269,9 @@ ; GCN-LABEL: {{^}}v_select_v4f32: ; GCN: buffer_load_dwordx4 -; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32 + +; GCN-DAG: s_cmp_lt_u32 s{{[0-9]+}}, 32 +; GCN-DAG: s_cselect_b64 vcc, 1, 0 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} @@ -282,7 +287,8 @@ } ; GCN-LABEL: {{^}}s_select_v5f32: -; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} +; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}} +; GCN-DAG: s_cselect_b64 vcc, 1, 0 ; GCN: v_cndmask_b32_e32 ; GCN: v_cndmask_b32_e32 diff --git a/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll b/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll --- a/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll @@ -68,7 +68,7 @@ } ; FUNC-LABEL: {{^}}selectcc_bool: -; SI: v_cmp_ne_u32 +; SI: s_cmp_lg_u32 ; SI: v_cndmask_b32_e64 ; SI-NOT: cmp ; SI-NOT: cndmask diff --git a/llvm/test/CodeGen/AMDGPU/selectcc.ll b/llvm/test/CodeGen/AMDGPU/selectcc.ll --- a/llvm/test/CodeGen/AMDGPU/selectcc.ll +++ b/llvm/test/CodeGen/AMDGPU/selectcc.ll @@ -1,6 +1,6 @@ ; RUN: llc -verify-machineinstrs -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI -check-prefix=FUNC %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}selectcc_i64: ; EG: XOR_INT @@ -8,9 +8,11 @@ ; EG: OR_INT ; EG: CNDE_INT ; EG: CNDE_INT -; SI: v_cmp_eq_u64 -; SI: v_cndmask -; SI: v_cndmask +; SI: v_cmp_eq_u64_e32 +; VI: 
s_cmp_eq_u64 +; VI: s_cselect_b64 vcc, 1, 0 +; GCN: v_cndmask +; GCN: v_cndmask define amdgpu_kernel void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) { entry: %0 = icmp eq i64 %lhs, %rhs diff --git a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll --- a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll @@ -4,8 +4,9 @@ ; FUNC-LABEL: {{^}}sext_bool_icmp_eq_0: ; GCN-NOT: v_cmp -; GCN: v_cmp_ne_u32_e32 vcc, -; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN: s_cmp_lg_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN: s_cselect_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[MASK]] ; GCN-NEXT:buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm @@ -21,8 +22,9 @@ ; FUNC-LABEL: {{^}}sext_bool_icmp_ne_0: ; GCN-NOT: v_cmp -; GCN: v_cmp_ne_u32_e32 vcc, -; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN: s_cmp_lg_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN: s_cselect_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[MASK]] ; GCN-NEXT: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm @@ -38,8 +40,9 @@ ; FUNC-LABEL: {{^}}sext_bool_icmp_eq_neg1: ; GCN-NOT: v_cmp -; GCN: v_cmp_eq_u32_e32 vcc, -; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN: s_cmp_eq_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN: s_cselect_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[MASK]] ; GCN-NEXT: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm define amdgpu_kernel void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { @@ -52,8 +55,9 @@ ; FUNC-LABEL: {{^}}sext_bool_icmp_ne_neg1: ; GCN-NOT: v_cmp -; GCN: v_cmp_eq_u32_e32 vcc, -; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN: s_cmp_eq_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN: s_cselect_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[MASK]] ; 
GCN-NEXT: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm define amdgpu_kernel void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { @@ -66,8 +70,9 @@ ; FUNC-LABEL: {{^}}zext_bool_icmp_eq_0: ; GCN-NOT: v_cmp -; GCN: v_cmp_ne_u32_e32 vcc, -; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN: s_cmp_lg_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN: s_cselect_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[MASK]] ; GCN-NEXT: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm define amdgpu_kernel void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { @@ -80,8 +85,9 @@ ; FUNC-LABEL: {{^}}zext_bool_icmp_ne_0: ; GCN-NOT: v_cmp -; GCN: v_cmp_ne_u32_e32 vcc, -; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN: s_cmp_lg_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN: s_cselect_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[MASK]] ; GCN-NEXT: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm define amdgpu_kernel void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { @@ -94,8 +100,9 @@ ; FUNC-LABEL: {{^}}zext_bool_icmp_eq_1: ; GCN-NOT: v_cmp -; GCN: v_cmp_eq_u32_e32 vcc, -; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN: s_cmp_eq_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN: s_cselect_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[MASK]] ; GCN-NEXT: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm define amdgpu_kernel void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { @@ -108,8 +115,9 @@ ; FUNC-LABEL: {{^}}zext_bool_icmp_ne_1: ; GCN-NOT: v_cmp -; GCN: v_cmp_eq_u32_e32 vcc, -; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN: s_cmp_eq_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN: s_cselect_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[MASK]] ; GCN-NEXT: buffer_store_byte [[RESULT]] define amdgpu_kernel 
void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b @@ -149,14 +157,16 @@ ; SI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb ; VI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: s_movk_i32 [[K255:s[0-9]+]], 0xff -; GCN-DAG: v_mov_b32_e32 [[VK255:v[0-9]+]], [[K255]] ; SI-DAG: s_and_b32 [[B:s[0-9]+]], [[VALUE]], [[K255]] -; SI: v_cmp_ne_u32_e32 vcc, [[B]], [[VK255]] +; SI: s_cmp_lg_u32 [[B]], [[K255]] +; SI: s_cselect_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 1, 0 +; VI: v_mov_b32_e32 [[VK255:v[0-9]+]], [[K255]] ; VI-DAG: v_and_b32_e32 [[B:v[0-9]+]], [[VALUE]], [[VK255]] ; VI: v_cmp_ne_u16_e32 vcc, [[K255]], [[B]] -; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[MASK]] +; VI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN: buffer_store_byte [[RESULT]] ; GCN: s_endpgm define amdgpu_kernel void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind { @@ -200,9 +210,9 @@ ; VI: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: s_movk_i32 [[K:s[0-9]+]], 0xff ; GCN-DAG: s_and_b32 [[B:s[0-9]+]], [[VAL]], [[K]] -; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]] -; GCN: v_cmp_ne_u32_e32 vcc, [[B]], [[VK]]{{$}} -; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN: s_cmp_lg_u32 [[B]], [[K]]{{$}} +; SI: s_cselect_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[MASK]] ; GCN: buffer_store_byte [[RESULT]] ; GCN: s_endpgm define amdgpu_kernel void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind { diff --git a/llvm/test/CodeGen/AMDGPU/setcc.ll b/llvm/test/CodeGen/AMDGPU/setcc.ll --- a/llvm/test/CodeGen/AMDGPU/setcc.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc.ll @@ -7,8 +7,12 @@ ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y -; GCN: v_cmp_eq_u32_e32 -; GCN: v_cmp_eq_u32_e32 +; GCN: 
s_cmp_eq_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN: s_cselect_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN: s_cmp_eq_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN: s_cselect_b64 [[MASK2:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, [[MASK1]] +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, [[MASK2]] define amdgpu_kernel void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { %result = icmp eq <2 x i32> %a, %b %sext = sext <2 x i1> %result to <2 x i32> @@ -22,10 +26,19 @@ ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; GCN: v_cmp_eq_u32_e32 -; GCN: v_cmp_eq_u32_e32 -; GCN: v_cmp_eq_u32_e32 -; GCN: v_cmp_eq_u32_e32 +; GCN: s_cmp_eq_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN: s_cselect_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN: s_cmp_eq_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN: s_cselect_b64 [[MASK2:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN: s_cmp_eq_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN: s_cselect_b64 [[MASK3:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN: s_cmp_eq_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN: s_cselect_b64 [[MASK4:s\[[0-9]+:[0-9]+\]]], 1, 0 + +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, [[MASK1]] +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, [[MASK2]] +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, [[MASK3]] +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, [[MASK4]] define amdgpu_kernel void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in @@ -231,7 +244,7 @@ ; FUNC-LABEL: {{^}}i32_eq: ; R600: SETE_INT -; GCN: v_cmp_eq_u32 +; GCN: s_cmp_eq_u32 define amdgpu_kernel void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp eq i32 %a, %b @@ -242,7 +255,7 @@ ; FUNC-LABEL: {{^}}i32_ne: ; R600: SETNE_INT -; GCN: v_cmp_ne_u32 +; GCN: s_cmp_lg_u32 define amdgpu_kernel void 
@i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp ne i32 %a, %b @@ -253,7 +266,7 @@ ; FUNC-LABEL: {{^}}i32_ugt: ; R600: SETGT_UINT -; GCN: v_cmp_gt_u32 +; GCN: s_cmp_gt_u32 define amdgpu_kernel void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp ugt i32 %a, %b @@ -264,7 +277,7 @@ ; FUNC-LABEL: {{^}}i32_uge: ; R600: SETGE_UINT -; GCN: v_cmp_ge_u32 +; GCN: s_cmp_ge_u32 define amdgpu_kernel void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp uge i32 %a, %b @@ -275,7 +288,7 @@ ; FUNC-LABEL: {{^}}i32_ult: ; R600: SETGT_UINT -; GCN: v_cmp_lt_u32 +; GCN: s_cmp_lt_u32 define amdgpu_kernel void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp ult i32 %a, %b @@ -286,7 +299,7 @@ ; FUNC-LABEL: {{^}}i32_ule: ; R600: SETGE_UINT -; GCN: v_cmp_le_u32 +; GCN: s_cmp_le_u32 define amdgpu_kernel void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp ule i32 %a, %b @@ -297,7 +310,7 @@ ; FUNC-LABEL: {{^}}i32_sgt: ; R600: SETGT_INT -; GCN: v_cmp_gt_i32 +; GCN: s_cmp_gt_i32 define amdgpu_kernel void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp sgt i32 %a, %b @@ -308,7 +321,7 @@ ; FUNC-LABEL: {{^}}i32_sge: ; R600: SETGE_INT -; GCN: v_cmp_ge_i32 +; GCN: s_cmp_ge_i32 define amdgpu_kernel void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp sge i32 %a, %b @@ -319,7 +332,7 @@ ; FUNC-LABEL: {{^}}i32_slt: ; R600: SETGT_INT -; GCN: v_cmp_lt_i32 +; GCN: s_cmp_lt_i32 define amdgpu_kernel void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp slt i32 %a, %b @@ -330,7 +343,7 @@ ; FUNC-LABEL: {{^}}i32_sle: ; R600: SETGE_INT -; GCN: v_cmp_le_i32 +; GCN: s_cmp_le_i32 define amdgpu_kernel void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp sle i32 %a, %b @@ -417,8 +430,8 @@ } ; FUNC-LABEL: setcc_v2i32_expand -; GCN: v_cmp_gt_i32 -; GCN: v_cmp_gt_i32 +; GCN: s_cmp_gt_i32 +; GCN: s_cmp_gt_i32 define 
amdgpu_kernel void @setcc_v2i32_expand( <2 x i32> addrspace(1)* %a, <2 x i32> addrspace(1)* %b, @@ -442,10 +455,10 @@ } ; FUNC-LABEL: setcc_v4i32_expand -; GCN: v_cmp_gt_i32 -; GCN: v_cmp_gt_i32 -; GCN: v_cmp_gt_i32 -; GCN: v_cmp_gt_i32 +; GCN: s_cmp_gt_i32 +; GCN: s_cmp_gt_i32 +; GCN: s_cmp_gt_i32 +; GCN: s_cmp_gt_i32 define amdgpu_kernel void @setcc_v4i32_expand( <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b, diff --git a/llvm/test/CodeGen/AMDGPU/setcc64.ll b/llvm/test/CodeGen/AMDGPU/setcc64.ll --- a/llvm/test/CodeGen/AMDGPU/setcc64.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc64.ll @@ -159,7 +159,10 @@ ;;;==========================================================================;;; ; GCN-LABEL: {{^}}i64_eq: -; GCN: v_cmp_eq_u64 +; SI: v_cmp_eq_u64 +; VI: s_cmp_eq_u64 +; VI: s_cselect_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 1, 0 +; VI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, [[MASK]] define amdgpu_kernel void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp eq i64 %a, %b @@ -169,7 +172,8 @@ } ; GCN-LABEL: {{^}}i64_ne: -; GCN: v_cmp_ne_u64 +; SI: v_cmp_ne_u64 +; VI: s_cmp_lg_u64 define amdgpu_kernel void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp ne i64 %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -192,14 +192,16 @@ ; GCN-NEXT: s_sub_i32 s2, s8, 64 ; GCN-NEXT: s_lshl_b64 s[0:1], s[6:7], s8 ; GCN-NEXT: s_lshr_b64 s[10:11], s[4:5], s9 -; GCN-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] ; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], s2 +; GCN-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] +; GCN-NEXT: s_cmp_lt_u32 s8, 64 +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: v_mov_b32_e32 v1, s11 -; GCN-NEXT: v_cmp_lt_u32_e64 vcc, s8, 64 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: 
v_cmp_eq_u32_e64 s[0:1], s8, 0 +; GCN-NEXT: s_cselect_b64 s[0:1], 1, 0 ; GCN-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s10 @@ -231,12 +233,14 @@ ; GCN-NEXT: s_lshl_b64 s[10:11], s[6:7], s9 ; GCN-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] ; GCN-NEXT: s_lshr_b64 s[2:3], s[6:7], s2 +; GCN-NEXT: s_cmp_lt_u32 s8, 64 +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: v_mov_b32_e32 v1, s11 -; GCN-NEXT: v_cmp_lt_u32_e64 vcc, s8, 64 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s8, 0 +; GCN-NEXT: s_cselect_b64 s[0:1], 1, 0 ; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s10 @@ -260,25 +264,27 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[0:1], s[6:7], s8 ; GCN-NEXT: s_ashr_i32 s2, s7, 31 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: s_sub_i32 s0, s8, 64 +; GCN-NEXT: s_ashr_i64 s[0:1], s[6:7], s8 +; GCN-NEXT: s_cmp_lt_u32 s8, 64 ; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: s_sub_i32 s0, s8, 64 ; GCN-NEXT: s_ashr_i64 s[2:3], s[6:7], s0 ; GCN-NEXT: s_sub_i32 s0, 64, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_lshl_b64 s[0:1], s[6:7], s0 ; GCN-NEXT: s_lshr_b64 s[6:7], s[4:5], s8 -; GCN-NEXT: v_cmp_lt_u32_e64 vcc, s8, 64 ; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GCN-NEXT: s_cmp_eq_u32 s8, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; 
GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s8, 0 +; GCN-NEXT: s_cselect_b64 s[0:1], 1, 0 ; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v4, s6 diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -76,7 +76,8 @@ ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_load_dword s0, s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 +; SI-NEXT: s_cmp_eq_u32 s0, 0 +; SI-NEXT: s_cselect_b64 s[0:1], 1, 0 ; SI-NEXT: s_and_b64 s[4:5], s[0:1], exec ; SI-NEXT: BB1_2: ; %endif ; SI-NEXT: s_or_b64 exec, exec, s[6:7] @@ -100,7 +101,8 @@ ; FLAT-NEXT: ; %bb.1: ; %else ; FLAT-NEXT: s_load_dword s0, s[0:1], 0x24 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 +; FLAT-NEXT: s_cmp_eq_u32 s0, 0 +; FLAT-NEXT: s_cselect_b64 s[0:1], 1, 0 ; FLAT-NEXT: s_and_b64 s[4:5], s[0:1], exec ; FLAT-NEXT: BB1_2: ; %endif ; FLAT-NEXT: s_or_b64 exec, exec, s[6:7] @@ -169,11 +171,14 @@ ; SI-NEXT: s_load_dword s8, s[0:1], 0xc ; SI-NEXT: s_brev_b32 s9, 44 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], s2, 1 -; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], s3, 4 -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s3, 3 -; SI-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3] -; SI-NEXT: s_and_b64 s[0:1], exec, s[4:5] +; SI-NEXT: s_cmp_lt_i32 s2, 1 +; SI-NEXT: s_cselect_b64 s[4:5], 1, 0 +; SI-NEXT: s_cmp_lt_i32 s3, 4 +; SI-NEXT: s_cselect_b64 s[0:1], 1, 0 +; SI-NEXT: s_cmp_gt_i32 s3, 3 +; SI-NEXT: s_cselect_b64 s[2:3], 1, 0 +; SI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] +; SI-NEXT: s_and_b64 s[0:1], exec, s[0:1] ; SI-NEXT: s_and_b64 s[2:3], exec, s[2:3] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s9 @@ -182,49 +187,48 @@ ; SI-NEXT: s_branch BB3_4 ; SI-NEXT: BB3_1: ; %Flow6 ; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; SI-NEXT: s_mov_b64 s[10:11], 
0 +; SI-NEXT: s_mov_b64 s[12:13], 0 ; SI-NEXT: BB3_2: ; %Flow5 ; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1 ; SI-NEXT: s_mov_b64 s[14:15], 0 ; SI-NEXT: BB3_3: ; %Flow ; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; SI-NEXT: s_and_b64 vcc, exec, s[12:13] +; SI-NEXT: s_and_b64 vcc, exec, s[10:11] ; SI-NEXT: s_cbranch_vccnz BB3_8 ; SI-NEXT: BB3_4: ; %while.cond ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_mov_b64 s[14:15], -1 -; SI-NEXT: s_mov_b64 s[10:11], -1 ; SI-NEXT: s_mov_b64 s[12:13], -1 +; SI-NEXT: s_mov_b64 s[10:11], -1 ; SI-NEXT: s_mov_b64 vcc, s[0:1] ; SI-NEXT: s_cbranch_vccz BB3_3 ; SI-NEXT: ; %bb.5: ; %convex.exit ; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; SI-NEXT: s_mov_b64 s[10:11], -1 ; SI-NEXT: s_mov_b64 s[12:13], -1 +; SI-NEXT: s_mov_b64 s[10:11], -1 ; SI-NEXT: s_mov_b64 vcc, s[2:3] ; SI-NEXT: s_cbranch_vccz BB3_2 ; SI-NEXT: ; %bb.6: ; %if.end ; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; SI-NEXT: s_mov_b64 s[12:13], -1 +; SI-NEXT: s_mov_b64 s[10:11], -1 ; SI-NEXT: s_mov_b64 vcc, s[4:5] ; SI-NEXT: s_cbranch_vccz BB3_1 ; SI-NEXT: ; %bb.7: ; %if.else ; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; SI-NEXT: s_mov_b64 s[12:13], 0 +; SI-NEXT: s_mov_b64 s[10:11], 0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_branch BB3_1 ; SI-NEXT: BB3_8: ; %loop.exit.guard4 ; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; SI-NEXT: s_and_b64 vcc, exec, s[10:11] +; SI-NEXT: s_and_b64 vcc, exec, s[12:13] ; SI-NEXT: s_cbranch_vccz BB3_4 ; SI-NEXT: ; %bb.9: ; %loop.exit.guard ; SI-NEXT: s_and_b64 vcc, exec, s[14:15] ; SI-NEXT: s_cbranch_vccz BB3_13 ; SI-NEXT: ; %bb.10: ; %for.cond.preheader -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, 0x3e8 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, s8, v0 -; SI-NEXT: s_and_b64 vcc, exec, vcc +; SI-NEXT: s_cmpk_lt_i32 s8, 0x3e8 +; SI-NEXT: s_cselect_b64 s[0:1], 1, 0 +; SI-NEXT: s_and_b64 vcc, exec, s[0:1] ; SI-NEXT: s_cbranch_vccz BB3_13 ; SI-NEXT: ; %bb.11: ; %for.body ; SI-NEXT: s_and_b64 vcc, exec, 0 @@ 
-243,11 +247,14 @@ ; FLAT-NEXT: s_load_dword s8, s[0:1], 0x30 ; FLAT-NEXT: s_brev_b32 s9, 44 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_cmp_lt_i32_e64 s[0:1], s2, 1 -; FLAT-NEXT: v_cmp_lt_i32_e64 s[4:5], s3, 4 -; FLAT-NEXT: v_cmp_gt_i32_e64 s[2:3], s3, 3 -; FLAT-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3] -; FLAT-NEXT: s_and_b64 s[0:1], exec, s[4:5] +; FLAT-NEXT: s_cmp_lt_i32 s2, 1 +; FLAT-NEXT: s_cselect_b64 s[4:5], 1, 0 +; FLAT-NEXT: s_cmp_lt_i32 s3, 4 +; FLAT-NEXT: s_cselect_b64 s[0:1], 1, 0 +; FLAT-NEXT: s_cmp_gt_i32 s3, 3 +; FLAT-NEXT: s_cselect_b64 s[2:3], 1, 0 +; FLAT-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] +; FLAT-NEXT: s_and_b64 s[0:1], exec, s[0:1] ; FLAT-NEXT: s_and_b64 s[2:3], exec, s[2:3] ; FLAT-NEXT: s_waitcnt vmcnt(0) ; FLAT-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s9 @@ -256,48 +263,48 @@ ; FLAT-NEXT: s_branch BB3_4 ; FLAT-NEXT: BB3_1: ; %Flow6 ; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; FLAT-NEXT: s_mov_b64 s[10:11], 0 +; FLAT-NEXT: s_mov_b64 s[12:13], 0 ; FLAT-NEXT: BB3_2: ; %Flow5 ; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 ; FLAT-NEXT: s_mov_b64 s[14:15], 0 ; FLAT-NEXT: BB3_3: ; %Flow ; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; FLAT-NEXT: s_and_b64 vcc, exec, s[12:13] +; FLAT-NEXT: s_and_b64 vcc, exec, s[10:11] ; FLAT-NEXT: s_cbranch_vccnz BB3_8 ; FLAT-NEXT: BB3_4: ; %while.cond ; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 ; FLAT-NEXT: s_mov_b64 s[14:15], -1 -; FLAT-NEXT: s_mov_b64 s[10:11], -1 ; FLAT-NEXT: s_mov_b64 s[12:13], -1 +; FLAT-NEXT: s_mov_b64 s[10:11], -1 ; FLAT-NEXT: s_mov_b64 vcc, s[0:1] ; FLAT-NEXT: s_cbranch_vccz BB3_3 ; FLAT-NEXT: ; %bb.5: ; %convex.exit ; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; FLAT-NEXT: s_mov_b64 s[10:11], -1 ; FLAT-NEXT: s_mov_b64 s[12:13], -1 +; FLAT-NEXT: s_mov_b64 s[10:11], -1 ; FLAT-NEXT: s_mov_b64 vcc, s[2:3] ; FLAT-NEXT: s_cbranch_vccz BB3_2 ; FLAT-NEXT: ; %bb.6: ; %if.end ; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; FLAT-NEXT: s_mov_b64 s[12:13], -1 +; FLAT-NEXT: s_mov_b64 s[10:11], 
-1 ; FLAT-NEXT: s_mov_b64 vcc, s[4:5] ; FLAT-NEXT: s_cbranch_vccz BB3_1 ; FLAT-NEXT: ; %bb.7: ; %if.else ; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; FLAT-NEXT: s_mov_b64 s[12:13], 0 +; FLAT-NEXT: s_mov_b64 s[10:11], 0 ; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; FLAT-NEXT: s_branch BB3_1 ; FLAT-NEXT: BB3_8: ; %loop.exit.guard4 ; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; FLAT-NEXT: s_and_b64 vcc, exec, s[10:11] +; FLAT-NEXT: s_and_b64 vcc, exec, s[12:13] ; FLAT-NEXT: s_cbranch_vccz BB3_4 ; FLAT-NEXT: ; %bb.9: ; %loop.exit.guard ; FLAT-NEXT: s_and_b64 vcc, exec, s[14:15] ; FLAT-NEXT: s_cbranch_vccz BB3_13 ; FLAT-NEXT: ; %bb.10: ; %for.cond.preheader -; FLAT-NEXT: v_mov_b32_e32 v0, 0x3e8 -; FLAT-NEXT: v_cmp_lt_i32_e32 vcc, s8, v0 -; FLAT-NEXT: s_and_b64 vcc, exec, vcc +; FLAT-NEXT: s_cmpk_lt_i32 s8, 0x3e8 +; FLAT-NEXT: s_cselect_b64 s[0:1], 1, 0 +; FLAT-NEXT: s_and_b64 vcc, exec, s[0:1] ; FLAT-NEXT: s_cbranch_vccz BB3_13 ; FLAT-NEXT: ; %bb.11: ; %for.body ; FLAT-NEXT: s_and_b64 vcc, exec, 0 diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll @@ -6,8 +6,10 @@ ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cmp_eq_u32_e64 s[2:3], s0, 0 -; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[2:3], 1, 0 +; CHECK-NEXT: s_cmp_eq_u32 s1, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], 1, 0 ; CHECK-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] ; CHECK-NEXT: s_cbranch_vccnz BB0_3 diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -10,9 +10,9 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 
; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: s_cmp_eq_u32 s0, s1 +; SI-NEXT: s_cselect_b64 s[0:1], 1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -23,9 +23,9 @@ ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: s_cmp_eq_u32 s0, s1 +; VI-NEXT: s_cselect_b64 s[0:1], 1, 0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %cmp = icmp eq i32 %a, %b @@ -80,9 +80,9 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: s_cmp_eq_u32 s0, s1 +; SI-NEXT: s_cselect_b64 s[0:1], 1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -94,9 +94,9 @@ ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: s_cmp_eq_u32 s0, s1 +; VI-NEXT: s_cselect_b64 s[0:1], 1, 0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -218,9 +218,9 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: s_cmp_eq_u32 s0, s1 +; SI-NEXT: 
s_cselect_b64 s[0:1], 1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -231,9 +231,9 @@ ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: s_cmp_eq_u32 s0, s1 +; VI-NEXT: s_cselect_b64 s[0:1], 1, 0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %cmp = icmp eq i32 %a, %b @@ -254,11 +254,11 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v1 -; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_cmp_eq_u32 s0, s1 +; SI-NEXT: s_cselect_b64 s[0:1], 1, 0 +; SI-NEXT: s_cmp_eq_u32 s2, s3 +; SI-NEXT: s_cselect_b64 s[2:3], 1, 0 +; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -270,11 +270,11 @@ ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v1 -; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_cmp_eq_u32 s0, s1 +; VI-NEXT: s_cselect_b64 s[0:1], 1, 0 +; VI-NEXT: s_cmp_eq_u32 s2, s3 +; VI-NEXT: s_cselect_b64 s[2:3], 1, 0 +; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -294,9 +294,9 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_cmp_eq_u32 s1, s2 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; 
SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v0 +; SI-NEXT: s_cselect_b64 s[0:1], 1, 0 ; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -309,9 +309,9 @@ ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_eq_u32 s1, s2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v0 +; VI-NEXT: s_cselect_b64 s[0:1], 1, 0 ; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -15,7 +15,8 @@ ; uses an SGPR (implicit vcc). ; GCN-LABEL: {{^}}sint_to_fp_i1_f64: -; GCN-DAG: v_cmp_eq_u32_e64 vcc, +; GCN-DAG: s_cmp_eq +; GCN-DAG: s_cselect_b64 ; GCN-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, v{{[0-9]+}} ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[ZERO]]:[[SEL]]{{\]}} diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.ll --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.ll @@ -77,7 +77,8 @@ } ; FUNC-LABEL: {{^}}s_sint_to_fp_i1_f32: -; SI: v_cmp_eq_u32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], +; SI: s_cmp_eq_u32 +; SI: s_cselect_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -130,25 +130,27 @@ ; GCN-IR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ;
GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0 ; GCN-IR-NEXT: s_flbit_i32_b32 s10, s2 +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GCN-IR-NEXT: s_add_i32 s10, s10, 32 +; GCN-IR-NEXT: s_cmp_eq_u32 s3, 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 ; GCN-IR-NEXT: s_flbit_i32_b32 s11, s3 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s10 -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s11 -; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 -; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s7 -; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GCN-IR-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-IR-NEXT: s_add_i32 s8, s8, 32 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s11 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s10 -; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7 +; GCN-IR-NEXT: s_cmp_eq_u32 s7, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v0, s9 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s8 +; GCN-IR-NEXT: s_cselect_b64 vcc, 1, 0 ; GCN-IR-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v2, v3 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0 -; GCN-IR-NEXT: v_subb_u32_e64 v1, s[10:11], 0, 0, vcc +; GCN-IR-NEXT: v_subb_u32_e64 v1, s[8:9], 0, 0, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[0:1] -; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[0:1] ; GCN-IR-NEXT: s_xor_b64 s[8:9], s[0:1], -1 @@ -1034,27 +1036,29 @@ ; GCN-IR-NEXT: s_sub_u32 s6, s6, s2 ; GCN-IR-NEXT: s_subb_u32 s7, s7, s2 ; GCN-IR-NEXT: s_sub_u32 s8, s8, s0 -; GCN-IR-NEXT: s_flbit_i32_b32 s12, s8 ; GCN-IR-NEXT: s_subb_u32 s9, s9, s0 -; GCN-IR-NEXT: s_add_i32 s12, s12, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s13, s9 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s12 -; GCN-IR-NEXT: s_flbit_i32_b32 s12, s6 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s13 -; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0 -; 
GCN-IR-NEXT: s_add_i32 s12, s12, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s13, s7 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[8:9], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s8 +; GCN-IR-NEXT: s_add_i32 s10, s10, 32 +; GCN-IR-NEXT: s_cmp_eq_u32 s9, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s10 +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s11, s9 +; GCN-IR-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-IR-NEXT: s_add_i32 s10, s10, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s11 +; GCN-IR-NEXT: s_flbit_i32_b32 s11, s7 +; GCN-IR-NEXT: s_cmp_eq_u32 s7, 0 ; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v0, s13 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s12 -; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s11 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s10 +; GCN-IR-NEXT: s_cselect_b64 vcc, 1, 0 ; GCN-IR-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v2, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v1, s[12:13], 0, 0, vcc -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[8:9], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 +; GCN-IR-NEXT: v_subb_u32_e64 v1, s[10:11], 0, 0, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[0:1] -; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] ; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[0:1] ; GCN-IR-NEXT: s_xor_b64 s[10:11], s[0:1], -1 @@ -1185,67 +1189,71 @@ ; ; GCN-IR-LABEL: s_test_srem24_48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xc -; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xd -; GCN-IR-NEXT: s_load_dword s0, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xd +; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sext_i32_i16 s3, s3 -; GCN-IR-NEXT: 
s_sext_i32_i16 s7, s0 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 24 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 24 +; GCN-IR-NEXT: s_sext_i32_i16 s5, s5 ; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31 -; GCN-IR-NEXT: s_ashr_i32 s10, s7, 31 +; GCN-IR-NEXT: s_ashr_i32 s10, s5, 31 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 24 ; GCN-IR-NEXT: s_mov_b32 s3, s2 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[6:7], 24 ; GCN-IR-NEXT: s_mov_b32 s11, s10 -; GCN-IR-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s6, s0, s2 -; GCN-IR-NEXT: s_subb_u32 s7, s1, s2 -; GCN-IR-NEXT: s_sub_u32 s8, s8, s10 -; GCN-IR-NEXT: s_flbit_i32_b32 s12, s8 -; GCN-IR-NEXT: s_subb_u32 s9, s9, s10 -; GCN-IR-NEXT: s_add_i32 s12, s12, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s13, s9 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s12 -; GCN-IR-NEXT: s_flbit_i32_b32 s12, s6 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s13 -; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0 -; GCN-IR-NEXT: s_add_i32 s12, s12, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s13, s7 +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[6:7], s[2:3] +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[10:11] +; GCN-IR-NEXT: s_sub_u32 s4, s4, s2 +; GCN-IR-NEXT: s_subb_u32 s5, s5, s2 +; GCN-IR-NEXT: s_sub_u32 s6, s6, s10 +; GCN-IR-NEXT: s_subb_u32 s7, s7, s10 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[4:5], 0 +; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] +; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 +; GCN-IR-NEXT: s_add_i32 s8, s8, 32 +; GCN-IR-NEXT: s_cmp_eq_u32 s7, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s8 +; GCN-IR-NEXT: s_flbit_i32_b32 s8, s4 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7 +; GCN-IR-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-IR-NEXT: s_add_i32 s8, s8, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s9 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s5 +; GCN-IR-NEXT: s_cmp_eq_u32 s5, 0 ; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v0, s13 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s12 -; GCN-IR-NEXT: 
v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s9 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s8 +; GCN-IR-NEXT: s_cselect_b64 vcc, 1, 0 ; GCN-IR-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v2, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v1, s[12:13], 0, 0, vcc -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[8:9], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 +; GCN-IR-NEXT: v_subb_u32_e64 v1, s[8:9], 0, 0, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[0:1] -; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] -; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; GCN-IR-NEXT: s_or_b64 s[0:1], s[10:11], vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[0:1] ; GCN-IR-NEXT: s_xor_b64 s[10:11], s[0:1], -1 ; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], vcc ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[10:11] +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b64 vcc, vcc ; GCN-IR-NEXT: s_cbranch_vccz BB9_4 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[4:5], v[0:1] ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 63, v0 -; GCN-IR-NEXT: v_lshl_b64 v[0:1], s[6:7], v0 +; GCN-IR-NEXT: v_lshl_b64 v[0:1], s[4:5], v0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN-IR-NEXT: s_cbranch_vccz BB9_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_not_b32_e32 v2, v2 -; GCN-IR-NEXT: s_add_u32 s10, s8, -1 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[6:7], v4 +; GCN-IR-NEXT: s_add_u32 s10, s6, -1 +; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[4:5], v4 ; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v2, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 -; GCN-IR-NEXT: s_addc_u32 s11, s9, -1 +; GCN-IR-NEXT: s_addc_u32 s11, s7, -1 ; GCN-IR-NEXT: v_addc_u32_e64 v5, s[0:1], -1, 0, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 @@ -1260,9 +1268,9 @@ ; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s10, v6 ; GCN-IR-NEXT: 
v_subb_u32_e32 v2, vcc, v2, v7, vcc ; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s8, v8 +; GCN-IR-NEXT: v_and_b32_e32 v10, s6, v8 ; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s9, v8 +; GCN-IR-NEXT: v_and_b32_e32 v11, s7, v8 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc @@ -1277,9 +1285,9 @@ ; GCN-IR-NEXT: s_cbranch_vccz BB9_3 ; GCN-IR-NEXT: s_branch BB9_6 ; GCN-IR-NEXT: BB9_4: -; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s5 ; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 ; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] ; GCN-IR-NEXT: s_branch BB9_7 ; GCN-IR-NEXT: BB9_5: @@ -1290,24 +1298,24 @@ ; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 ; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 ; GCN-IR-NEXT: BB9_7: ; %udiv-end -; GCN-IR-NEXT: v_mul_lo_u32 v1, s8, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v2, s8, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v3, s9, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v0, s8, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s6, v1 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s6, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v3, s7, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s6, v0 +; GCN-IR-NEXT: s_mov_b32 s11, 0xf000 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s7 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s5 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GCN-IR-NEXT: v_xor_b32_e32 v0, s2, v0 ; GCN-IR-NEXT: v_xor_b32_e32 v1, s3, v1 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 ; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 -; GCN-IR-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 +; GCN-IR-NEXT: s_mov_b32 s10, -1 +; GCN-IR-NEXT: buffer_store_short v1, off, s[8:11], 0 offset:4 +; GCN-IR-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i48 %x, 24 %2 = ashr i48 %y, 24 @@ -1443,13 +1451,14 @@ ; GCN-IR-NEXT: s_mov_b32 s1, s0 ; GCN-IR-NEXT: s_xor_b64 s[2:3], s[6:7], s[0:1] ; GCN-IR-NEXT: s_sub_u32 s2, s2, s0 -; GCN-IR-NEXT: s_subb_u32 s3, s3, s0 ; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 +; GCN-IR-NEXT: s_subb_u32 s3, s3, s0 ; GCN-IR-NEXT: s_add_i32 s6, s6, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 +; GCN-IR-NEXT: s_cmp_eq_u32 s3, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s6 -; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; GCN-IR-NEXT: s_cselect_b64 vcc, 1, 0 ; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc5, v2 ; GCN-IR-NEXT: v_addc_u32_e64 v1, s[6:7], 0, -1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -131,25 +131,27 @@ ; GCN-IR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0 ; GCN-IR-NEXT: s_flbit_i32_b32 s10, s2 +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GCN-IR-NEXT: s_add_i32 s10, s10, 32 +; GCN-IR-NEXT: s_cmp_eq_u32 s3, 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 ; GCN-IR-NEXT: s_flbit_i32_b32 s11, s3 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s10 -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s11 -; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 -; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s7 -; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GCN-IR-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-IR-NEXT: s_add_i32 s8, s8, 32 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s11 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s10 -; 
GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7 +; GCN-IR-NEXT: s_cmp_eq_u32 s7, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v0, s9 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s8 +; GCN-IR-NEXT: s_cselect_b64 vcc, 1, 0 ; GCN-IR-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v2, v3 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0 -; GCN-IR-NEXT: v_subb_u32_e64 v1, s[10:11], 0, 0, vcc +; GCN-IR-NEXT: v_subb_u32_e64 v1, s[8:9], 0, 0, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[0:1] -; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[0:1] ; GCN-IR-NEXT: s_xor_b64 s[8:9], s[0:1], -1 @@ -703,39 +705,40 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN-NEXT: s_load_dword s3, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s5, 0xff000000 -; GCN-NEXT: s_mov_b32 s4, 0xffff -; GCN-NEXT: v_cvt_f32_ubyte3_e32 v2, s4 +; GCN-NEXT: s_mov_b32 s7, 0xff000000 +; GCN-NEXT: s_mov_b32 s6, 0xffff +; GCN-NEXT: v_cvt_f32_ubyte3_e32 v2, s6 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s2, s2, s5 -; GCN-NEXT: s_and_b32 s3, s3, s4 +; GCN-NEXT: s_and_b32 s2, s2, s7 +; GCN-NEXT: s_and_b32 s3, s3, s6 ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 24 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 -; GCN-NEXT: s_load_dword s6, s[0:1], 0xb -; GCN-NEXT: s_load_dword s7, s[0:1], 0xc -; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s8, s[0:1], 0xb +; GCN-NEXT: s_load_dword s0, s[0:1], 0xc ; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mac_f32_e32 v1, 0x4f800000, v2 ; GCN-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s7, s7, s4 -; GCN-NEXT: s_and_b32 s6, s6, s5 -; GCN-NEXT: s_sub_u32 s8, 0, s2 +; GCN-NEXT: s_and_b32 s6, s0, s6 +; GCN-NEXT: s_and_b32 s8, s8, s7 +; 
GCN-NEXT: s_lshr_b64 s[0:1], s[2:3], 24 ; GCN-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: s_subb_u32 s9, 0, s3 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mul_lo_u32 v3, s8, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s8, v1 -; GCN-NEXT: v_mul_lo_u32 v5, s9, v1 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_mul_lo_u32 v4, s8, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: s_sub_u32 s2, 0, s0 +; GCN-NEXT: s_subb_u32 s3, 0, s1 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v1 +; GCN-NEXT: v_mul_lo_u32 v4, s2, v2 +; GCN-NEXT: v_mul_lo_u32 v5, s3, v1 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_mul_lo_u32 v4, s2, v1 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 @@ -750,14 +753,14 @@ ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v5, v4, vcc ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v8, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e64 v1, s[2:3], v1, v3 +; GCN-NEXT: v_add_i32_e64 v1, s[0:1], v1, v3 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v5, vcc -; GCN-NEXT: v_addc_u32_e64 v3, vcc, v2, v4, s[2:3] -; GCN-NEXT: v_mul_lo_u32 v5, s8, v3 -; GCN-NEXT: v_mul_hi_u32 v6, s8, v1 -; GCN-NEXT: v_mul_lo_u32 v7, s9, v1 +; GCN-NEXT: v_addc_u32_e64 v3, vcc, v2, v4, s[0:1] +; GCN-NEXT: v_mul_lo_u32 v5, s2, v3 +; GCN-NEXT: v_mul_hi_u32 v6, s2, v1 +; GCN-NEXT: v_mul_lo_u32 v7, s3, v1 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_mul_lo_u32 v6, s8, v1 +; GCN-NEXT: v_mul_lo_u32 v6, s2, v1 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GCN-NEXT: v_mul_lo_u32 v11, v1, v5 ; GCN-NEXT: v_mul_hi_u32 v13, v1, v5 @@ -774,10 +777,10 @@ ; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, 
v5, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_addc_u32_e64 v2, vcc, v2, v5, s[2:3] +; GCN-NEXT: v_addc_u32_e64 v2, vcc, v2, v5, s[0:1] ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_mov_b32_e32 v3, s6 -; GCN-NEXT: v_alignbit_b32 v3, s7, v3, 24 +; GCN-NEXT: v_mov_b32_e32 v3, s8 +; GCN-NEXT: v_alignbit_b32 v3, s6, v3, 24 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_hi_u32 v5, v3, v1 ; GCN-NEXT: v_mul_lo_u32 v4, v3, v2 @@ -791,80 +794,82 @@ ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v8, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, 0, v1 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v4, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v5, v0, v1 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v1 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_subb_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_sub_i32_e32 v5, vcc, v3, v0 -; GCN-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v4, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v0 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 2, v1 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v2, vcc +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v1 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 2, v1 +; GCN-NEXT: v_mul_lo_u32 v10, v0, v1 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v2, vcc ; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v1 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v0 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v2, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v10 +; GCN-NEXT: v_subb_u32_e32 v6, vcc, 0, v6, vcc +; GCN-NEXT: v_sub_i32_e32 v7, vcc, v3, v0 +; GCN-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v6, vcc +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v0 +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v0 +; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; 
GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 +; GCN-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 ; GCN-NEXT: v_cndmask_b32_e64 v0, -1, v0, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v5, v8, v6, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v5, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v7, vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v4, s[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[0:1] -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_udiv24_i48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xc ; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xd -; GCN-IR-NEXT: s_load_dword s7, s[0:1], 0xe -; GCN-IR-NEXT: s_mov_b32 s8, 0xffff -; GCN-IR-NEXT: s_mov_b32 s9, 0xff000000 +; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s4, 0xffff +; GCN-IR-NEXT: s_mov_b32 s7, 0xff000000 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_and_b32 s1, s3, s8 -; GCN-IR-NEXT: s_and_b32 s0, s2, s9 -; GCN-IR-NEXT: s_and_b32 s3, s7, s8 -; GCN-IR-NEXT: s_and_b32 s2, s6, s9 -; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s2 -; GCN-IR-NEXT: s_lshr_b64 s[6:7], s[0:1], 24 -; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s3 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s10 -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s11 -; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 -; 
GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s7 +; GCN-IR-NEXT: s_and_b32 s3, s3, s4 +; GCN-IR-NEXT: s_and_b32 s2, s2, s7 +; GCN-IR-NEXT: s_and_b32 s5, s5, s4 +; GCN-IR-NEXT: s_and_b32 s4, s6, s7 +; GCN-IR-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[4:5], 24 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 +; GCN-IR-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GCN-IR-NEXT: s_flbit_i32_b32 s4, s2 +; GCN-IR-NEXT: s_add_i32 s4, s4, 32 +; GCN-IR-NEXT: s_cmp_eq_u32 s3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s4 +; GCN-IR-NEXT: s_flbit_i32_b32 s4, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s5, s3 +; GCN-IR-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-IR-NEXT: s_add_i32 s4, s4, 32 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s5 +; GCN-IR-NEXT: s_flbit_i32_b32 s5, s7 +; GCN-IR-NEXT: s_cmp_eq_u32 s7, 0 ; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v0, s11 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s10 -; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s5 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s4 +; GCN-IR-NEXT: s_cselect_b64 vcc, 1, 0 ; GCN-IR-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v2, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v1, s[10:11], 0, 0, vcc -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 +; GCN-IR-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[0:1] -; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] -; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-IR-NEXT: s_or_b64 s[0:1], s[8:9], vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[0:1] ; GCN-IR-NEXT: s_xor_b64 s[8:9], s[0:1], -1 ; GCN-IR-NEXT: s_and_b64 s[8:9], s[8:9], vcc ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[8:9] +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_mov_b64 vcc, vcc ; GCN-IR-NEXT: s_cbranch_vccz BB7_4 ; 
GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v0 @@ -1055,11 +1060,12 @@ ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_flbit_i32_b32 s2, s6 -; GCN-IR-NEXT: s_flbit_i32_b32 s3, s7 ; GCN-IR-NEXT: s_add_i32 s2, s2, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s3, s7 +; GCN-IR-NEXT: s_cmp_eq_u32 s7, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s3 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s2 -; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: s_cselect_b64 vcc, 1, 0 ; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc5, v2 ; GCN-IR-NEXT: v_addc_u32_e64 v1, s[2:3], 0, -1, vcc @@ -1500,31 +1506,31 @@ ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, 24 -; GCN-NEXT: v_mul_hi_u32 v3, v0, 24 -; GCN-NEXT: v_mul_lo_u32 v4, v0, 24 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s10, v4 -; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, 24, v4 -; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 23, v3 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], 23, v4 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc -; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc +; GCN-NEXT: v_mul_lo_u32 v4, v1, 24 +; GCN-NEXT: 
v_mul_hi_u32 v5, v0, 24 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 2, v0 +; GCN-NEXT: v_mul_lo_u32 v8, v0, 24 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v0 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_sub_i32_e32 v8, vcc, s10, v8 +; GCN-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc +; GCN-NEXT: v_subrev_i32_e32 v5, vcc, 24, v8 +; GCN-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc +; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 23, v5 +; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; GCN-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc +; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], 23, v8 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -1535,11 +1541,12 @@ ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_flbit_i32_b32 s2, s6 -; GCN-IR-NEXT: s_flbit_i32_b32 s3, s7 ; GCN-IR-NEXT: s_add_i32 s2, s2, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s3, s7 +; GCN-IR-NEXT: s_cmp_eq_u32 s7, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s3 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s2 -; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: s_cselect_b64 vcc, 1, 0 ; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 59, v2 ; GCN-IR-NEXT: v_subb_u32_e64 v1, s[2:3], 0, 0, vcc diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll +++ 
b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -75,7 +75,8 @@ ; uses an SGPR (implicit vcc). ; GCN-LABEL: {{^}}uint_to_fp_i1_to_f64: -; GCN-DAG: v_cmp_eq_u32_e64 vcc +; GCN-DAG: s_cmp_eq_u32 +; GCN-DAG: s_cselect_b64 vcc ; GCN-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, v{{[0-9]+}} ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[ZERO]]:[[SEL]]{{\]}} diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.ll --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.ll @@ -77,7 +77,8 @@ } ; FUNC-LABEL: {{^}}s_uint_to_fp_i1_to_f32: -; SI: v_cmp_eq_u32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], +; SI: s_cmp_eq +; SI: s_cselect_b64 [[CMP:s\[[0-9]+:[0-9]\]]], ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll --- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll @@ -251,7 +251,9 @@ ; GCN: s_load_dwordx2 s{{\[}}[[COND0:[0-9]+]]:[[COND1:[0-9]+]]{{\]}} ; GCN: s_cmp_lt_i32 s[[COND0]], 1 ; GCN: s_cbranch_scc1 [[EXIT:[A-Za-z0-9_]+]] -; GCN: v_cmp_gt_i32_e64 {{[^,]*}}, s[[COND1]], 0{{$}} +; GCN: s_cmp_gt_i32 s[[COND1]], 0{{$}} +; GCN: s_cselect_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN: s_and_b64 vcc, exec, [[MASK]] ; GCN: s_cbranch_vccz [[BODY:[A-Za-z0-9_]+]] ; GCN: {{^}}[[EXIT]]: ; GCN: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -130,25 +130,27 @@ ; GCN-IR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0 ; GCN-IR-NEXT: s_flbit_i32_b32 s10, s2 +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GCN-IR-NEXT: 
s_add_i32 s10, s10, 32 +; GCN-IR-NEXT: s_cmp_eq_u32 s3, 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 ; GCN-IR-NEXT: s_flbit_i32_b32 s11, s3 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s10 -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s11 -; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 -; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s7 -; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GCN-IR-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-IR-NEXT: s_add_i32 s8, s8, 32 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s11 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s10 -; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7 +; GCN-IR-NEXT: s_cmp_eq_u32 s7, 0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v0, s9 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s8 +; GCN-IR-NEXT: s_cselect_b64 vcc, 1, 0 ; GCN-IR-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v2, v3 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0 -; GCN-IR-NEXT: v_subb_u32_e64 v1, s[10:11], 0, 0, vcc +; GCN-IR-NEXT: v_subb_u32_e64 v1, s[8:9], 0, 0, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[0:1] -; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[0:1] ; GCN-IR-NEXT: s_xor_b64 s[8:9], s[0:1], -1 @@ -862,11 +864,12 @@ ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_flbit_i32_b32 s2, s6 -; GCN-IR-NEXT: s_flbit_i32_b32 s3, s7 ; GCN-IR-NEXT: s_add_i32 s2, s2, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s3, s7 +; GCN-IR-NEXT: s_cmp_eq_u32 s7, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s3 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s2 -; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: s_cselect_b64 vcc, 1, 0 ; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc5, v2 ; GCN-IR-NEXT: v_addc_u32_e64 v1, s[2:3], 0, -1, vcc @@ -1063,11 +1066,12 @@ ; GCN-IR-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_flbit_i32_b32 s2, s6 -; GCN-IR-NEXT: s_flbit_i32_b32 s3, s7 ; GCN-IR-NEXT: s_add_i32 s2, s2, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s3, s7 +; GCN-IR-NEXT: s_cmp_eq_u32 s7, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s3 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s2 -; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: s_cselect_b64 vcc, 1, 0 ; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 59, v2 ; GCN-IR-NEXT: v_subb_u32_e64 v1, s[2:3], 0, 0, vcc diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -8,7 +8,8 @@ declare double @llvm.fabs.f64(double) ; GCN-LABEL: {{^}}v_cnd_nan_nosgpr: -; GCN: v_cmp_eq_u32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0 +; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0 +; GCN: s_cselect_b64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 1, 0 ; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]}}, -1, v{{[0-9]+}}, [[COND]] ; GCN-DAG: v{{[0-9]}} ; All nan values are converted to 0xffffffff @@ -30,9 +31,11 @@ ; However on GFX10 constant bus is limited to 2 scalar operands, not one. 
; GCN-LABEL: {{^}}v_cnd_nan: -; SIVI: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0 +; SIVI: s_cmp_eq_u32 s{{[0-9]+}}, 0 +; SIVI: s_cselect_b64 vcc, 1, 0 ; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, -1, v{{[0-9]+}}, vcc -; GFX10: v_cmp_eq_u32_e64 [[CC:s\[[0-9:]+\]]], s{{[0-9]+}}, 0 +; GFX10: s_cmp_eq_u32 s{{[0-9]+}}, 0 +; GFX10: s_cselect_b64 [[CC:s\[[0-9]+:[0-9]+\]]], 1, 0 ; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, -1, s{{[0-9]+}}, [[CC]] ; GCN-DAG: v{{[0-9]}} ; All nan values are converted to 0xffffffff diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll --- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll @@ -13,12 +13,15 @@ ; GCN-ALLOCA-COUNT-4: buffer_store_dword ; GCN-ALLOCA: buffer_load_dword -; GCN-PROMOTE: v_cmp_eq_u32_e64 [[CC1:[^,]+]], s{{[0-9]+}}, 1 -; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]] -; GCN-PROMOTE: v_cmp_ne_u32_e64 [[CC2:[^,]+]], s{{[0-9]+}}, 2 -; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND2:v[0-9]+]], 2, [[IND1]], [[CC2]] -; GCN-PROMOTE: v_cmp_ne_u32_e64 [[CC3:[^,]+]], s{{[0-9]+}}, 3 -; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND3:v[0-9]+]], 3, [[IND2]], [[CC3]] +; GCN-PROMOTE-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 1 +; GCN-PROMOTE-DAG: s_cselect_b64 [[CC1:[^,]+]], 1, 0 +; GCN-PROMOTE-DAG: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]] +; GCN-PROMOTE-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 2 +; GCN-PROMOTE-DAG: s_cselect_b64 [[CC2:[^,]+]], 1, 0 +; GCN-PROMOTE-DAG: v_cndmask_b32_e{{32|64}} [[IND2:v[0-9]+]], 2, [[IND1]], [[CC2]] +; GCN-PROMOTE-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 3 +; GCN-PROMOTE-DAG: s_cselect_b64 [[CC3:[^,]+]], 1, 0 +; GCN-PROMOTE-DAG: v_cndmask_b32_e{{32|64}} [[IND3:v[0-9]+]], 3, [[IND2]], [[CC3]] ; GCN-PROMOTE: ScratchSize: 0 @@ -320,12 +323,15 @@ ; GCN-ALLOCA-COUNT-4: buffer_store_dword ; GCN-ALLOCA: buffer_load_dword -; GCN-PROMOTE: v_cmp_eq_u32_e64 [[CC1:[^,]+]], s{{[0-9]+}}, 1 -; GCN-PROMOTE: 
v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]] -; GCN-PROMOTE: v_cmp_ne_u32_e64 [[CC2:[^,]+]], s{{[0-9]+}}, 2 -; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND2:v[0-9]+]], 2, [[IND1]], [[CC2]] -; GCN-PROMOTE: v_cmp_ne_u32_e64 [[CC3:[^,]+]], s{{[0-9]+}}, 3 -; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND3:v[0-9]+]], 3, [[IND2]], [[CC3]] +; GCN-PROMOTE-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 1 +; GCN-PROMOTE-DAG: s_cselect_b64 [[CC1:[^,]+]], 1, 0 +; GCN-PROMOTE-DAG: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]] +; GCN-PROMOTE-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 2 +; GCN-PROMOTE-DAG: s_cselect_b64 [[CC2:[^,]+]], 1, 0 +; GCN-PROMOTE-DAG: v_cndmask_b32_e{{32|64}} [[IND2:v[0-9]+]], 2, [[IND1]], [[CC2]] +; GCN-PROMOTE-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 3 +; GCN-PROMOTE-DAG: s_cselect_b64 [[CC3:[^,]+]], 1, 0 +; GCN-PROMOTE-DAG: v_cndmask_b32_e{{32|64}} [[IND3:v[0-9]+]], 3, [[IND2]], [[CC3]] ; GCN-PROMOTE: ScratchSize: 0 diff --git a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll --- a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll @@ -34,35 +34,42 @@ define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx0, i32 %idx1) #1 { ; GCN-LABEL: extract_insert_different_dynelt_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[0:1], s[6:7] +; GCN-NEXT: s_mov_b64 s[0:1], s[10:11] ; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v5, v2 ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 
v6, s8 -; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s9, 3 -; GCN-NEXT: s_mov_b64 s[6:7], s[2:3] +; GCN-NEXT: s_cmp_eq_u32 s5, 3 +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: s_cmp_eq_u32 s5, 2 +; GCN-NEXT: s_cselect_b64 s[0:1], 1, 0 +; GCN-NEXT: s_cmp_eq_u32 s5, 1 +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: s_cselect_b64 s[2:3], 1, 0 +; GCN-NEXT: s_cmp_eq_u32 s5, 0 +; GCN-NEXT: v_mov_b32_e32 v6, s4 +; GCN-NEXT: s_cselect_b64 s[4:5], 1, 0 +; GCN-NEXT: s_cmp_eq_u32 s6, 1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s9, 2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s9, 1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s10, 1 +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] +; GCN-NEXT: s_cmp_eq_u32 s6, 2 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s10, 2 +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] +; GCN-NEXT: s_cmp_eq_u32 s6, 3 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s10, 3 +; GCN-NEXT: s_cselect_b64 vcc, 1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GCN-NEXT: buffer_store_dword v0, v[4:5], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[4:5], s[8:11], 0 addr64 ; GCN-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 diff --git a/llvm/test/CodeGen/AMDGPU/vselect.ll b/llvm/test/CodeGen/AMDGPU/vselect.ll --- a/llvm/test/CodeGen/AMDGPU/vselect.ll +++ b/llvm/test/CodeGen/AMDGPU/vselect.ll @@ -7,10 +7,12 @@ ; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z ; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y -; SI: 
v_cmp_gt_i32_e32 vcc -; SI: v_cndmask_b32_e32 -; SI: v_cmp_gt_i32_e32 vcc -; SI: v_cndmask_b32_e32 +; SI: s_cmp_gt_i32 +; SI: s_cselect_b64 vcc, 1, 0 +; SI: s_cmp_gt_i32 +; SI: s_cselect_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 1, 0 +; SI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc +; SI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] define amdgpu_kernel void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) { entry: @@ -50,9 +52,9 @@ ; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z ; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 ; SI: v_cndmask_b32_e32 define amdgpu_kernel void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) { diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll --- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll @@ -37,8 +37,9 @@ ; GCN-LABEL: {{^}}s_cmp_zext_i1_to_i64: ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, 0 -; GCN-DAG: v_cmp_eq_u32 -; GCN: v_cndmask_b32 +; GCN-DAG: s_cmp_eq_u32 +; GCN: s_cselect_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[MASK]] define amdgpu_kernel void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { %cmp = icmp eq i32 %a, %b %ext = zext i1 %cmp to i64 @@ -54,10 +55,11 @@ ; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}} ; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], [[MASK]] ; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], [[MASK]] -; GCN: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]] -; GCN: v_cmp_eq_u32_e32 vcc, [[MASK_A]], [[V_B]] -; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; 
GCN: s_cmp_eq_u32 s{{[0-9]+}}, [[B]] +; GCN: s_cselect_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 1, 0 +; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[MASK]] + ; GCN: buffer_store_short [[RESULT]] define amdgpu_kernel void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 { %tmp0 = icmp eq i16 %a, %b