Index: lib/Target/AMDGPU/SIFoldOperands.cpp =================================================================== --- lib/Target/AMDGPU/SIFoldOperands.cpp +++ lib/Target/AMDGPU/SIFoldOperands.cpp @@ -25,25 +25,6 @@ namespace { -class SIFoldOperands : public MachineFunctionPass { -public: - static char ID; - -public: - SIFoldOperands() : MachineFunctionPass(ID) { - initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - StringRef getPassName() const override { return "SI Fold Operands"; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - struct FoldCandidate { MachineInstr *UseMI; union { @@ -79,6 +60,36 @@ } }; +class SIFoldOperands : public MachineFunctionPass { +public: + static char ID; + MachineRegisterInfo *MRI; + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + + void foldOperand(MachineOperand &OpToFold, + MachineInstr *UseMI, + unsigned UseOpIdx, + SmallVectorImpl<FoldCandidate> &FoldList, + SmallVectorImpl<MachineInstr *> &CopiesToReplace) const; + + void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const; + +public: + SIFoldOperands() : MachineFunctionPass(ID) { + initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "SI Fold Operands"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + } // End anonymous namespace. INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE, @@ -141,7 +152,7 @@ return false; } -static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList, +static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList, const MachineInstr *MI) { for (auto Candidate : FoldList) { if (Candidate.UseMI == MI) @@ -150,7 +161,7 @@ return false; } -static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, +static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo, MachineOperand *OpToFold, const SIInstrInfo *TII) { @@ -227,12 +238,12 @@ //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg()); } -static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, - unsigned UseOpIdx, - std::vector<FoldCandidate> &FoldList, - SmallVectorImpl<MachineInstr *> &CopiesToReplace, - const SIInstrInfo *TII, const SIRegisterInfo &TRI, - MachineRegisterInfo &MRI) { +void SIFoldOperands::foldOperand( + MachineOperand &OpToFold, + MachineInstr *UseMI, + unsigned UseOpIdx, + SmallVectorImpl<FoldCandidate> &FoldList, + SmallVectorImpl<MachineInstr *> &CopiesToReplace) const { const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); if (!isUseSafeToFold(*UseMI, UseOp)) @@ -264,7 +275,7 @@ unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm(); for (MachineRegisterInfo::use_iterator - RSUse = MRI.use_begin(RegSeqDstReg), RSE = MRI.use_end(); + RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end(); RSUse != RSE; ++RSUse) { MachineInstr *RSUseMI = RSUse->getParent(); @@ -272,7 +283,7 @@ continue; foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList, - CopiesToReplace, TII, TRI, MRI); + CopiesToReplace); } return; @@ -287,8 +298,8 @@ unsigned DestReg = UseMI->getOperand(0).getReg(); const TargetRegisterClass *DestRC = TargetRegisterInfo::isVirtualRegister(DestReg) ?
- MRI.getRegClass(DestReg) : - TRI.getPhysRegClass(DestReg); + MRI->getRegClass(DestReg) : + TRI->getPhysRegClass(DestReg); unsigned MovOp = TII->getMovOpcode(DestRC); if (MovOp == AMDGPU::COPY) @@ -318,7 +329,7 @@ const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc(); const TargetRegisterClass *FoldRC = - TRI.getRegClass(FoldDesc.OpInfo[0].RegClass); + TRI->getRegClass(FoldDesc.OpInfo[0].RegClass); APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType), OpToFold.getImm()); @@ -328,8 +339,8 @@ unsigned UseReg = UseOp.getReg(); const TargetRegisterClass *UseRC = TargetRegisterInfo::isVirtualRegister(UseReg) ? - MRI.getRegClass(UseReg) : - TRI.getPhysRegClass(UseReg); + MRI->getRegClass(UseReg) : + TRI->getPhysRegClass(UseReg); assert(Imm.getBitWidth() == 64); @@ -349,20 +360,51 @@ } static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result, - int32_t LHS, int32_t RHS) { + uint32_t LHS, uint32_t RHS) { switch (Opcode) { case AMDGPU::V_AND_B32_e64: + case AMDGPU::V_AND_B32_e32: case AMDGPU::S_AND_B32: Result = LHS & RHS; return true; case AMDGPU::V_OR_B32_e64: + case AMDGPU::V_OR_B32_e32: case AMDGPU::S_OR_B32: Result = LHS | RHS; return true; case AMDGPU::V_XOR_B32_e64: + case AMDGPU::V_XOR_B32_e32: case AMDGPU::S_XOR_B32: Result = LHS ^ RHS; return true; + case AMDGPU::V_LSHL_B32_e64: + case AMDGPU::V_LSHL_B32_e32: + case AMDGPU::S_LSHL_B32: + // The instruction ignores the high bits for out of bounds shifts. + Result = LHS << (RHS & 31); + return true; + case AMDGPU::V_LSHLREV_B32_e64: + case AMDGPU::V_LSHLREV_B32_e32: + Result = RHS << (LHS & 31); + return true; + case AMDGPU::V_LSHR_B32_e64: + case AMDGPU::V_LSHR_B32_e32: + case AMDGPU::S_LSHR_B32: + Result = LHS >> (RHS & 31); + return true; + case AMDGPU::V_LSHRREV_B32_e64: + case AMDGPU::V_LSHRREV_B32_e32: + Result = RHS >> (LHS & 31); + return true; + case AMDGPU::V_ASHR_I32_e64: + case AMDGPU::V_ASHR_I32_e32: + case AMDGPU::S_ASHR_I32: + Result = static_cast<int32_t>(LHS) >> (RHS & 31); + return true; + case AMDGPU::V_ASHRREV_I32_e64: + case AMDGPU::V_ASHRREV_I32_e32: + Result = static_cast<int32_t>(RHS) >> (LHS & 31); + return true; default: return false; } @@ -390,33 +432,47 @@ stripExtraCopyOperands(MI); } +static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI, + MachineOperand &Op) { + if (Op.isReg()) { + // If this has a subregister, it obviously is a register source. + if (Op.getSubReg() != AMDGPU::NoSubRegister) + return &Op; + + MachineInstr *Def = MRI.getVRegDef(Op.getReg()); + if (Def->isMoveImmediate()) { + MachineOperand &ImmSrc = Def->getOperand(1); + if (ImmSrc.isImm()) + return &ImmSrc; + } + } + + return &Op; +} + // Try to simplify operations with a constant that may appear after instruction // selection. +// TODO: See if a frame index with a fixed offset can fold.
static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIInstrInfo *TII, - MachineInstr *MI) { + MachineInstr *MI, + MachineOperand *ImmOp) { unsigned Opc = MI->getOpcode(); - if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 || Opc == AMDGPU::S_NOT_B32) { - MachineOperand &Src0 = MI->getOperand(1); - if (Src0.isImm()) { - Src0.setImm(~Src0.getImm()); - mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32))); - return true; - } - - return false; + MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm()); + mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32))); + return true; } - if (!MI->isCommutable()) + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + if (Src1Idx == -1) return false; int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx)); + MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx)); - MachineOperand *Src0 = &MI->getOperand(Src0Idx); - MachineOperand *Src1 = &MI->getOperand(Src1Idx); if (!Src0->isImm() && !Src1->isImm()) return false; @@ -431,19 +487,26 @@ const SIRegisterInfo &TRI = TII->getRegisterInfo(); bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg()); - Src0->setImm(NewImm); + // Be careful to change the right operand, src0 may belong to a different + // instruction. + MI->getOperand(Src0Idx).ChangeToImmediate(NewImm); MI->RemoveOperand(Src1Idx); mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR))); return true; } + if (!MI->isCommutable()) + return false; + if (Src0->isImm() && !Src1->isImm()) { std::swap(Src0, Src1); std::swap(Src0Idx, Src1Idx); } int32_t Src1Val = static_cast<int32_t>(Src1->getImm()); - if (Opc == AMDGPU::V_OR_B32_e64 || Opc == AMDGPU::S_OR_B32) { + if (Opc == AMDGPU::V_OR_B32_e64 || + Opc == AMDGPU::V_OR_B32_e32 || + Opc == AMDGPU::S_OR_B32) { if (Src1Val == 0) { // y = or x, 0 => y = copy x MI->RemoveOperand(Src1Idx); @@ -459,6 +522,7 @@ } if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 || + MI->getOpcode() == AMDGPU::V_AND_B32_e32 || MI->getOpcode() == AMDGPU::S_AND_B32) { if (Src1Val == 0) { // y = and x, 0 => y = v_mov_b32 0 @@ -476,29 +540,136 @@ } if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 || + MI->getOpcode() == AMDGPU::V_XOR_B32_e32 || MI->getOpcode() == AMDGPU::S_XOR_B32) { if (Src1Val == 0) { // y = xor x, 0 => y = copy x MI->RemoveOperand(Src1Idx); mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); + return true; } } return false; } +void SIFoldOperands::foldInstOperand(MachineInstr &MI, + MachineOperand &OpToFold) const { + // We need to mutate the operands of new mov instructions to add implicit + // uses of EXEC, but adding them invalidates the use_iterator, so defer + // this. + SmallVector<MachineInstr *, 4> CopiesToReplace; + SmallVector<FoldCandidate, 4> FoldList; + MachineOperand &Dst = MI.getOperand(0); + + bool FoldingImm = OpToFold.isImm() || OpToFold.isFI(); + if (FoldingImm) { + unsigned NumLiteralUses = 0; + MachineOperand *NonInlineUse = nullptr; + int NonInlineUseOpNo = -1; + + MachineRegisterInfo::use_iterator NextUse, NextInstUse; + for (MachineRegisterInfo::use_iterator + Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end(); + Use != E; Use = NextUse) { + NextUse = std::next(Use); + MachineInstr *UseMI = Use->getParent(); + unsigned OpNo = Use.getOperandNo(); + + // Folding the immediate may reveal operations that can be constant + // folded or replaced with a copy.
This can happen for example after + // frame indices are lowered to constants or from splitting 64-bit + // constants. + // + // We may also encounter cases where one or both operands are + // immediates materialized into a register, which would ordinarily not + // be folded due to multiple uses or operand constraints. + + if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) { + DEBUG(dbgs() << "Constant folded " << *UseMI <<'\n'); + + // Some constant folding cases change the same immediate's use to a new + // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user + // again. The same constant folded instruction could also have a second + // use operand. + NextUse = MRI->use_begin(Dst.getReg()); + continue; + } + + // Try to fold any inline immediate uses, and then only fold other + // constants if they have one use. + // + // The legality of the inline immediate must be checked based on the use + // operand, not the defining instruction, because 32-bit instructions + // with 32-bit inline immediate sources may be used to materialize + // constants used in 16-bit operands. + // + // e.g. it is unsafe to fold: + // s_mov_b32 s0, 1.0 // materializes 0x3f800000 + // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00 + + // Folding immediates with more than one use will increase program size. + // FIXME: This will also reduce register usage, which may be better + // in some cases. A better heuristic is needed. + if (TII->isInlineConstant(*UseMI, OpNo, OpToFold)) { + foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); + } else { + if (++NumLiteralUses == 1) { + NonInlineUse = &*Use; + NonInlineUseOpNo = OpNo; + } + } + } + + if (NumLiteralUses == 1) { + MachineInstr *UseMI = NonInlineUse->getParent(); + foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace); + } + } else { + // Folding register. + for (MachineRegisterInfo::use_iterator + Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end(); + Use != E; ++Use) { + MachineInstr *UseMI = Use->getParent(); + + foldOperand(OpToFold, UseMI, Use.getOperandNo(), + FoldList, CopiesToReplace); + } + } + + MachineFunction *MF = MI.getParent()->getParent(); + // Make sure we add EXEC uses to any new v_mov instructions created. + for (MachineInstr *Copy : CopiesToReplace) + Copy->addImplicitDefUseOperands(*MF); + + for (FoldCandidate &Fold : FoldList) { + if (updateOperand(Fold, *TRI)) { + // Clear kill flags. + if (Fold.isReg()) { + assert(Fold.OpToFold && Fold.OpToFold->isReg()); + // FIXME: Probably shouldn't bother trying to fold if not an + // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR + // copies. 
+ MRI->clearKillFlags(Fold.OpToFold->getReg()); + } + DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << + static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n'); + } + } +} + bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction())) return false; const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIInstrInfo *TII = ST.getInstrInfo(); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); + MRI = &MF.getRegInfo(); + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { + BI != BE; ++BI) { MachineBasicBlock &MBB = *BI; MachineBasicBlock::iterator I, Next; @@ -512,8 +683,7 @@ MachineOperand &OpToFold = MI.getOperand(1); bool FoldingImm = OpToFold.isImm() || OpToFold.isFI(); - // FIXME: We could also be folding things like FrameIndexes and - // TargetIndexes. + // FIXME: We could also be folding things like TargetIndexes. if (!FoldingImm && !OpToFold.isReg()) continue; @@ -532,90 +702,7 @@ !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) continue; - // We need mutate the operands of new mov instructions to add implicit - // uses of EXEC, but adding them invalidates the use_iterator, so defer - // this. - SmallVector<MachineInstr *, 4> CopiesToReplace; - - std::vector<FoldCandidate> FoldList; - if (FoldingImm) { - unsigned NumLiteralUses = 0; - MachineOperand *NonInlineUse = nullptr; - int NonInlineUseOpNo = -1; - - // Try to fold any inline immediate uses, and then only fold other - // constants if they have one use. - // - // The legality of the inline immediate must be checked based on the use - // operand, not the defining instruction, because 32-bit instructions - // with 32-bit inline immediate sources may be used to materialize - // constants used in 16-bit operands. - // - // e.g. it is unsafe to fold: - // s_mov_b32 s0, 1.0 // materializes 0x3f800000 - // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00 - - // Folding immediates with more than one use will increase program size. - // FIXME: This will also reduce register usage, which may be better - // in some cases. A better heuristic is needed. - for (MachineRegisterInfo::use_iterator - Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end(); - Use != E; ++Use) { - MachineInstr *UseMI = Use->getParent(); - unsigned OpNo = Use.getOperandNo(); - - if (TII->isInlineConstant(*UseMI, OpNo, OpToFold)) { - foldOperand(OpToFold, UseMI, OpNo, FoldList, - CopiesToReplace, TII, TRI, MRI); - } else { - if (++NumLiteralUses == 1) { - NonInlineUse = &*Use; - NonInlineUseOpNo = OpNo; - } - } - } - - if (NumLiteralUses == 1) { - MachineInstr *UseMI = NonInlineUse->getParent(); - foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, - CopiesToReplace, TII, TRI, MRI); - } - } else { - // Folding register. - for (MachineRegisterInfo::use_iterator - Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end(); - Use != E; ++Use) { - MachineInstr *UseMI = Use->getParent(); - - foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList, - CopiesToReplace, TII, TRI, MRI); - } - } - - // Make sure we add EXEC uses to any new v_mov instructions created. - for (MachineInstr *Copy : CopiesToReplace) - Copy->addImplicitDefUseOperands(MF); - - for (FoldCandidate &Fold : FoldList) { - if (updateOperand(Fold, TRI)) { - // Clear kill flags. - if (Fold.isReg()) { - assert(Fold.OpToFold && Fold.OpToFold->isReg()); - // FIXME: Probably shouldn't bother trying to fold if not an - // SGPR.
PeepholeOptimizer can eliminate redundant VGPR->VGPR - // copies. - MRI.clearKillFlags(Fold.OpToFold->getReg()); - } - DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << - static_cast(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n'); - - // Folding the immediate may reveal operations that can be constant - // folded or replaced with a copy. This can happen for example after - // frame indices are lowered to constants or from splitting 64-bit - // constants. - tryConstantFoldOp(MRI, TII, Fold.UseMI); - } - } + foldInstOperand(MI, OpToFold); } } return false; Index: test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir @@ -0,0 +1,858 @@ +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination -o - %s | FileCheck -check-prefix=GCN %s +--- | + define void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + %and = and i32 %a, 1234567 + store volatile i32 %and, i32 addrspace(1)* %out + ret void + } + + define void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = sext i32 %tid to i64 + %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom + %a = load i32, i32 addrspace(1)* %gep.a + %and = and i32 %a, 1234567 + store i32 %and, i32 addrspace(1)* %gep.out + ret void + } + + define void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + %shl = shl i32 %a, 12 + store volatile i32 %shl, i32 addrspace(1)* %out + ret void + } + + define void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = sext i32 %tid to i64 + %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom + %a = load i32, i32 addrspace(1)* %gep.a + %shl = shl i32 %a, 12 + store i32 %shl, i32 addrspace(1)* %gep.out + ret void + } + + define void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + %ashr = ashr i32 %a, 12 + store volatile i32 %ashr, i32 addrspace(1)* %out + ret void + } + + define void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = sext i32 %tid to i64 + %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom + %a = load i32, i32 addrspace(1)* %gep.a + %ashr = ashr i32 %a, 12 + store i32 %ashr, i32 addrspace(1)* %gep.out + ret void + } + + define void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + %lshr = lshr i32 %a, 12 + store volatile i32 %lshr, i32 addrspace(1)* %out + ret void + } + + define void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = sext i32 %tid to i64 + %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom + %a = load i32, i32 addrspace(1)* %gep.a + %lshr = lshr i32 %a, 12 + store i32 %lshr, i32 addrspace(1)* %gep.out + ret void + } + + declare i32 @llvm.amdgcn.workitem.id.x() #1 + + attributes #0 = { nounwind } + attributes #1 = { nounwind 
readnone } + +... +--- + +# GCN-LABEL: name: s_fold_and_imm_regimm_32{{$}} +# GCN: %10 = V_MOV_B32_e32 1543, implicit %exec +# GCN: BUFFER_STORE_DWORD_OFFSET killed %10, +name: s_fold_and_imm_regimm_32 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_64_xexec } + - { id: 2, class: sreg_32_xm0 } + - { id: 3, class: sreg_32_xm0 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: sreg_128 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_32_xm0 } + - { id: 10, class: vgpr_32 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1 + + %0 = COPY %sgpr0_sgpr1 + %1 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %2 = COPY %1.sub1 + %3 = COPY %1.sub0 + %4 = S_MOV_B32 61440 + %5 = S_MOV_B32 -1 + %6 = REG_SEQUENCE killed %2, 1, killed %3, 2, killed %4, 3, killed %5, 4 + %7 = S_MOV_B32 1234567 + %8 = S_MOV_B32 9999 + %9 = S_AND_B32 killed %7, killed %8, implicit-def dead %scc + %10 = COPY %9 + BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out) + S_ENDPGM + +... +--- + +# GCN-LABEL: name: v_fold_and_imm_regimm_32{{$}} + +# GCN: %9 = V_MOV_B32_e32 646, implicit %exec +# GCN: FLAT_STORE_DWORD %19, %9, + +# GCN: %10 = V_MOV_B32_e32 646, implicit %exec +# GCN: FLAT_STORE_DWORD %19, %10 + +# GCN: %11 = V_MOV_B32_e32 646, implicit %exec +# GCN: FLAT_STORE_DWORD %19, %11, + +# GCN: %12 = V_MOV_B32_e32 1234567, implicit %exec +# GCN: FLAT_STORE_DWORD %19, %12, + +# GCN: %13 = V_MOV_B32_e32 63, implicit %exec +# GCN: FLAT_STORE_DWORD %19, %13, + +name: v_fold_and_imm_regimm_32 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64_xexec } + - { id: 20, class: sreg_32_xm0 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vreg_64 } + - { id: 26, class: sreg_32_xm0 } + - { id: 27, class: vgpr_32 } + - { id: 28, class: vgpr_32 } + - { id: 29, class: vgpr_32 } + - { id: 30, class: vgpr_32 } + - { id: 31, class: vgpr_32 } + - { id: 32, class: vreg_64 } + - { id: 33, class: vreg_64 } + - { id: 34, class: vgpr_32 } + - { id: 35, class: vgpr_32 } + - { id: 36, class: vgpr_32 } + - { id: 37, class: vreg_64 } + - { id: 44, class: vgpr_32 } + +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%3' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %3 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %4 = 
S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %31 = V_ASHRREV_I32_e64 31, %3, implicit %exec + %32 = REG_SEQUENCE %3, 1, %31, 2 + %33 = V_LSHLREV_B64 2, killed %32, implicit %exec + %20 = COPY %4.sub1 + %44 = V_ADD_I32_e32 %4.sub0, %33.sub0, implicit-def %vcc, implicit %exec + %36 = COPY killed %20 + %35 = V_ADDC_U32_e32 %33.sub1, %36, implicit-def %vcc, implicit %vcc, implicit %exec + %37 = REG_SEQUENCE %44, 1, killed %35, 2 + %24 = V_MOV_B32_e32 982, implicit %exec + %26 = S_MOV_B32 1234567 + %34 = V_MOV_B32_e32 63, implicit %exec + + %27 = V_AND_B32_e64 %26, %24, implicit %exec + FLAT_STORE_DWORD %37, %27, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %28 = V_AND_B32_e64 %24, %26, implicit %exec + FLAT_STORE_DWORD %37, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %29 = V_AND_B32_e32 %26, %24, implicit %exec + FLAT_STORE_DWORD %37, %29, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %30 = V_AND_B32_e64 %26, %26, implicit %exec + FLAT_STORE_DWORD %37, %30, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %31 = V_AND_B32_e64 %34, %34, implicit %exec + FLAT_STORE_DWORD %37, %31, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + S_ENDPGM + +... +--- + +# GCN-LABEL: name: s_fold_shl_imm_regimm_32{{$}} +# GC1: %13 = V_MOV_B32_e32 4096, implicit %exec +# GCN: BUFFER_STORE_DWORD_OFFSET killed %13, + +name: s_fold_shl_imm_regimm_32 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_32_xm0_xexec } + - { id: 6, class: sreg_32_xm0 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_32_xm0 } + - { id: 10, class: sreg_128 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sreg_32_xm0 } + - { id: 13, class: vgpr_32 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1 + + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_MOV_B32 1 + %6 = COPY %4.sub1 + %7 = COPY %4.sub0 + %8 = S_MOV_B32 61440 + %9 = S_MOV_B32 -1 + %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4 + %12 = S_LSHL_B32 killed %5, 12, implicit-def dead %scc + %13 = COPY %12 + BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out) + S_ENDPGM + +... 
+--- +# GCN-LABEL: name: v_fold_shl_imm_regimm_32{{$}} + +# GCN: %11 = V_MOV_B32_e32 40955904, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %11, + +# GCN: %12 = V_MOV_B32_e32 24, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %12, + +# GCN: %13 = V_MOV_B32_e32 4096, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %13, + +# GCN: %14 = V_MOV_B32_e32 24, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %14, + +# GCN: %15 = V_MOV_B32_e32 0, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %15, + +# GCN: %22 = V_MOV_B32_e32 4096, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %22, + +# GCN: %23 = V_MOV_B32_e32 1, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %23, + +# GCN: %25 = V_MOV_B32_e32 2, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %25, + +# GCN: %26 = V_MOV_B32_e32 7927808, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %26, + +# GCN: %28 = V_MOV_B32_e32 -8, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %28, + +name: v_fold_shl_imm_regimm_32 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64_xexec } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_64 } + - { id: 9, class: sreg_32_xm0 } + - { id: 10, class: vgpr_32 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: vgpr_32 } + - { id: 13, class: vgpr_32 } + - { id: 14, class: vgpr_32 } + - { id: 15, class: vgpr_32 } + - { id: 16, class: vreg_64 } + - { id: 17, class: vreg_64 } + - { id: 18, class: vgpr_32 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vreg_64 } + - { id: 21, class: vgpr_32 } + - { id: 22, class: vgpr_32 } + - { id: 23, class: vgpr_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vgpr_32 } + - { id: 26, class: vgpr_32 } + - { id: 27, class: sreg_32_xm0 } + - { id: 28, class: vgpr_32 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%2' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %2 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %3 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %15 = V_ASHRREV_I32_e64 31, %2, implicit %exec + %16 = REG_SEQUENCE %2, 1, %15, 2 + %17 = V_LSHLREV_B64 2, killed %16, implicit %exec + %9 = COPY %3.sub1 + %21 = V_ADD_I32_e32 %3.sub0, %17.sub0, implicit-def %vcc, implicit %exec + %19 = COPY killed %9 + %18 = V_ADDC_U32_e32 %17.sub1, %19, implicit-def %vcc, implicit %vcc, implicit %exec + %20 = REG_SEQUENCE %21, 1, killed %18, 2 + %10 = V_MOV_B32_e32 9999, implicit %exec + %24 = V_MOV_B32_e32 3871, implicit %exec + %6 = V_MOV_B32_e32 1, implicit %exec + %7 = S_MOV_B32 1 + %27 = S_MOV_B32 -4 + + %11 = V_LSHLREV_B32_e64 12, %10, implicit %exec + FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %12 = V_LSHLREV_B32_e64 %7, 12, implicit %exec + FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %13 = V_LSHL_B32_e64 %7, 12, 
implicit %exec + FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %14 = V_LSHL_B32_e64 12, %7, implicit %exec + FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %15 = V_LSHL_B32_e64 12, %24, implicit %exec + FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %22 = V_LSHL_B32_e64 %6, 12, implicit %exec + FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %23 = V_LSHL_B32_e64 %6, 32, implicit %exec + FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %25 = V_LSHL_B32_e32 %6, %6, implicit %exec + FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %26 = V_LSHLREV_B32_e32 11, %24, implicit %exec + FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %28 = V_LSHL_B32_e32 %27, %6, implicit %exec + FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + S_ENDPGM + +... +--- + +# GCN-LABEL: name: s_fold_ashr_imm_regimm_32{{$}} +# GCN: %11 = V_MOV_B32_e32 243, implicit %exec +# GCN: BUFFER_STORE_DWORD_OFFSET killed %11, killed %8, +name: s_fold_ashr_imm_regimm_32 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_32_xm0_xexec } + - { id: 6, class: sreg_32_xm0 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_32_xm0 } + - { id: 10, class: sreg_128 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sreg_32_xm0 } + - { id: 13, class: vgpr_32 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1 + + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_MOV_B32 999123 + %6 = COPY %4.sub1 + %7 = COPY %4.sub0 + %8 = S_MOV_B32 61440 + %9 = S_MOV_B32 -1 + %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4 + %12 = S_ASHR_I32 killed %5, 12, implicit-def dead %scc + %13 = COPY %12 + BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out) + S_ENDPGM + +... 
+ +# GCN-LABEL: name: v_fold_ashr_imm_regimm_32{{$}} +# GCN: %11 = V_MOV_B32_e32 3903258, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %11, + +# GCN: %12 = V_MOV_B32_e32 62452139, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %12, + +# GCN: %13 = V_MOV_B32_e32 1678031, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %13, + +# GCN: %14 = V_MOV_B32_e32 3, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %14, + +# GCN: %15 = V_MOV_B32_e32 -1, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %15, + +# GCN: %22 = V_MOV_B32_e32 62500, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %22, + +# GCN: %23 = V_MOV_B32_e32 500000, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %23, + +# GCN: %25 = V_MOV_B32_e32 1920, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %25, + +# GCN: %26 = V_MOV_B32_e32 487907, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %26, + +# GCN: %28 = V_MOV_B32_e32 -1, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %28, + +name: v_fold_ashr_imm_regimm_32 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64_xexec } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_32_xm0 } + - { id: 10, class: vgpr_32 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: vgpr_32 } + - { id: 13, class: vgpr_32 } + - { id: 14, class: vgpr_32 } + - { id: 15, class: vgpr_32 } + - { id: 16, class: vreg_64 } + - { id: 17, class: vreg_64 } + - { id: 18, class: vgpr_32 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vreg_64 } + - { id: 21, class: vgpr_32 } + - { id: 22, class: vgpr_32 } + - { id: 23, class: vgpr_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vgpr_32 } + - { id: 26, class: vgpr_32 } + - { id: 27, class: sreg_32_xm0 } + - { id: 28, class: vgpr_32 } + - { id: 32, class: sreg_32_xm0 } + - { id: 33, class: sreg_32_xm0 } + - { id: 34, class: vgpr_32 } + - { id: 35, class: vgpr_32 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%2' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %2 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %3 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %15 = V_ASHRREV_I32_e64 31, %2, implicit %exec + %16 = REG_SEQUENCE %2, 1, %15, 2 + %17 = V_LSHLREV_B64 2, killed %16, implicit %exec + %9 = COPY %3.sub1 + %21 = V_ADD_I32_e32 %3.sub0, %17.sub0, implicit-def %vcc, implicit %exec + %19 = COPY killed %9 + %18 = V_ADDC_U32_e32 %17.sub1, %19, implicit-def %vcc, implicit %vcc, implicit %exec + %20 = REG_SEQUENCE %21, 1, killed %18, 2 + %10 = V_MOV_B32_e32 999234234, implicit %exec + %24 = V_MOV_B32_e32 3871, implicit %exec + %6 = V_MOV_B32_e32 1000000, implicit %exec + %7 = S_MOV_B32 13424252 + %8 = S_MOV_B32 4 + %27 = S_MOV_B32 -4 + %32 = S_MOV_B32 1 + %33 = S_MOV_B32 3841 + %34 = V_MOV_B32_e32 3841, implicit %exec + %35 = V_MOV_B32_e32 2, implicit %exec + + %11 = V_ASHRREV_I32_e64 8, %10, implicit %exec + 
FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %12 = V_ASHRREV_I32_e64 %8, %10, implicit %exec + FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %13 = V_ASHR_I32_e64 %7, 3, implicit %exec + FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %14 = V_ASHR_I32_e64 7, %32, implicit %exec + FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %15 = V_ASHR_I32_e64 %27, %24, implicit %exec + FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %22 = V_ASHR_I32_e64 %6, 4, implicit %exec + FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %23 = V_ASHR_I32_e64 %6, %33, implicit %exec + FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %25 = V_ASHR_I32_e32 %34, %34, implicit %exec + FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %26 = V_ASHRREV_I32_e32 11, %10, implicit %exec + FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %28 = V_ASHR_I32_e32 %27, %35, implicit %exec + FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + S_ENDPGM + +... +--- + +# GCN-LABEL: name: s_fold_lshr_imm_regimm_32{{$}} +# GCN: %11 = V_MOV_B32_e32 1048332, implicit %exec +# GCN: BUFFER_STORE_DWORD_OFFSET killed %11, killed %8, +name: s_fold_lshr_imm_regimm_32 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_32_xm0_xexec } + - { id: 6, class: sreg_32_xm0 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_32_xm0 } + - { id: 10, class: sreg_128 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sreg_32_xm0 } + - { id: 13, class: vgpr_32 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1 + + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_MOV_B32 -999123 + %6 = COPY %4.sub1 + %7 = COPY %4.sub0 + %8 = S_MOV_B32 61440 + %9 = S_MOV_B32 -1 + %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4 + %12 = S_LSHR_B32 killed %5, 12, implicit-def dead %scc + %13 = COPY %12 + BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out) + S_ENDPGM + +... 
+--- + +# GCN-LABEL: name: v_fold_lshr_imm_regimm_32{{$}} +# GCN: %11 = V_MOV_B32_e32 3903258, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %11, + +# GCN: %12 = V_MOV_B32_e32 62452139, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %12, + +# GCN: %13 = V_MOV_B32_e32 1678031, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %13, + +# GCN: %14 = V_MOV_B32_e32 3, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %14, + +# GCN: %15 = V_MOV_B32_e32 1, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %15, + +# GCN: %22 = V_MOV_B32_e32 62500, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %22, + +# GCN: %23 = V_MOV_B32_e32 500000, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %23, + +# GCN: %25 = V_MOV_B32_e32 1920, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %25, + +# GCN: %26 = V_MOV_B32_e32 487907, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %26, + +# GCN: %28 = V_MOV_B32_e32 1073741823, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %28, + +name: v_fold_lshr_imm_regimm_32 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64_xexec } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_32_xm0 } + - { id: 10, class: vgpr_32 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: vgpr_32 } + - { id: 13, class: vgpr_32 } + - { id: 14, class: vgpr_32 } + - { id: 15, class: vgpr_32 } + - { id: 16, class: vreg_64 } + - { id: 17, class: vreg_64 } + - { id: 18, class: vgpr_32 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vreg_64 } + - { id: 21, class: vgpr_32 } + - { id: 22, class: vgpr_32 } + - { id: 23, class: vgpr_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vgpr_32 } + - { id: 26, class: vgpr_32 } + - { id: 27, class: sreg_32_xm0 } + - { id: 28, class: vgpr_32 } + - { id: 32, class: sreg_32_xm0 } + - { id: 33, class: sreg_32_xm0 } + - { id: 34, class: vgpr_32 } + - { id: 35, class: vgpr_32 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%2' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %2 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %3 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %15 = V_ASHRREV_I32_e64 31, %2, implicit %exec + %16 = REG_SEQUENCE %2, 1, %15, 2 + %17 = V_LSHLREV_B64 2, killed %16, implicit %exec + %9 = COPY %3.sub1 + %21 = V_ADD_I32_e32 %3.sub0, %17.sub0, implicit-def %vcc, implicit %exec + %19 = COPY killed %9 + %18 = V_ADDC_U32_e32 %17.sub1, %19, implicit-def %vcc, implicit %vcc, implicit %exec + %20 = REG_SEQUENCE %21, 1, killed %18, 2 + %10 = V_MOV_B32_e32 999234234, implicit %exec + %24 = V_MOV_B32_e32 3871, implicit %exec + %6 = V_MOV_B32_e32 1000000, implicit %exec + %7 = S_MOV_B32 13424252 + %8 = S_MOV_B32 4 + %27 = S_MOV_B32 -4 + %32 = S_MOV_B32 1 + %33 = S_MOV_B32 3841 + %34 = V_MOV_B32_e32 3841, implicit %exec + %35 = V_MOV_B32_e32 2, implicit %exec + + %11 = V_LSHRREV_B32_e64 8, %10, 
implicit %exec + FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %12 = V_LSHRREV_B32_e64 %8, %10, implicit %exec + FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %13 = V_LSHR_B32_e64 %7, 3, implicit %exec + FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %14 = V_LSHR_B32_e64 7, %32, implicit %exec + FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %15 = V_LSHR_B32_e64 %27, %24, implicit %exec + FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %22 = V_LSHR_B32_e64 %6, 4, implicit %exec + FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %23 = V_LSHR_B32_e64 %6, %33, implicit %exec + FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %25 = V_LSHR_B32_e32 %34, %34, implicit %exec + FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %26 = V_LSHRREV_B32_e32 11, %10, implicit %exec + FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %28 = V_LSHR_B32_e32 %27, %35, implicit %exec + FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + S_ENDPGM + +...
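Editorial note (not part of the patch): the new shift cases in evalBinaryInstruction() rely on the hardware behaviour that a 32-bit shift only consumes the low 5 bits of the shift amount, which is why the fold masks the amount with "& 31" instead of rejecting out-of-range shifts. The small standalone C++ sketch below reproduces that evaluation and checks it against a few of the constants the MIR tests above expect to see materialized; the ShiftKind enum, evalShift helper, and main driver are illustrative names only and do not exist in the pass.

// Illustrative sketch only: mirrors the shift evaluation the patch assumes
// when both sources of a V_LSHL/V_LSHR/V_ASHR are known 32-bit immediates.
#include <cassert>
#include <cstdint>

enum class ShiftKind { Lshl, Lshr, Ashr };

// Out-of-bounds shift amounts are reduced modulo 32, i.e. only the low
// 5 bits of the amount are used, matching the "& 31" in the pass.
static int32_t evalShift(ShiftKind Kind, uint32_t LHS, uint32_t RHS) {
  unsigned Amt = RHS & 31;
  switch (Kind) {
  case ShiftKind::Lshl:
    return static_cast<int32_t>(LHS << Amt);
  case ShiftKind::Lshr:
    return static_cast<int32_t>(LHS >> Amt);
  case ShiftKind::Ashr:
    return static_cast<int32_t>(LHS) >> Amt; // arithmetic shift, keeps the sign
  }
  return 0;
}

int main() {
  // shl 1, 32 -> 1: the amount wraps to 0 (v_fold_shl test, %23).
  assert(evalShift(ShiftKind::Lshl, 1u, 32u) == 1);
  // shl -4, 1 -> -8 (v_fold_shl test, %28).
  assert(evalShift(ShiftKind::Lshl, static_cast<uint32_t>(-4), 1u) == -8);
  // lshr 1000000, 3841 -> 500000, since 3841 & 31 == 1 (v_fold_lshr test, %23).
  assert(evalShift(ShiftKind::Lshr, 1000000u, 3841u) == 500000);
  // lshr -4, 2 -> 1073741823: logical shift zero-fills (v_fold_lshr test, %28).
  assert(evalShift(ShiftKind::Lshr, static_cast<uint32_t>(-4), 2u) == 1073741823);
  // ashr -4, 2 -> -1: arithmetic shift sign-extends (v_fold_ashr test, %28).
  assert(evalShift(ShiftKind::Ashr, static_cast<uint32_t>(-4), 2u) == -1);
  return 0;
}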