Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -875,8 +875,12 @@ Zero, Addr.getOperand(1)); if (isDSOffsetLegal(Sub, ByteOffset, 16)) { + // FIXME: Select to VOP3 version for with-carry. + unsigned SubOp = Subtarget->hasAddNoCarry() ? + AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32; + MachineSDNode *MachineSub - = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32, + = CurDAG->getMachineNode(SubOp, DL, MVT::i32, Zero, Addr.getOperand(1)); Base = SDValue(MachineSub, 0); @@ -945,8 +949,11 @@ Zero, Addr.getOperand(1)); if (isDSOffsetLegal(Sub, DWordOffset1, 8)) { + unsigned SubOp = Subtarget->hasAddNoCarry() ? + AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32; + MachineSDNode *MachineSub - = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32, + = CurDAG->getMachineNode(SubOp, DL, MVT::i32, Zero, Addr.getOperand(1)); Base = SDValue(MachineSub, 0); Index: lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.h +++ lib/Target/AMDGPU/SIInstrInfo.h @@ -76,6 +76,9 @@ private: void swapOperands(MachineInstr &Inst) const; + bool moveScalarAddSub(SetVectorType &Worklist, + MachineInstr &Inst) const; + void lowerScalarAbs(SetVectorType &Worklist, MachineInstr &Inst) const; @@ -691,9 +694,7 @@ bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override; - static unsigned getVALUOp(const MachineInstr &MI); - - bool isSALUOpSupportedOnVALU(const MachineInstr &MI) const; + unsigned getVALUOp(const MachineInstr &MI) const; /// \brief Return the correct register class for \p OpNo. For target-specific /// instructions, this will return the register class that has been defined Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1057,9 +1057,9 @@ .addReg(TIDIGYReg) .addReg(TIDReg); // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)) + TIDIG.Z - BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg) - .addReg(TIDReg) - .addReg(TIDIGZReg); + getAddNoCarry(Entry, Insert, DL, TIDReg) + .addReg(TIDReg) + .addReg(TIDIGZReg); } else { // Get the wave id BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64), @@ -1082,9 +1082,9 @@ // Add FrameIndex to LDS offset unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize); - BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg) - .addImm(LDSOffset) - .addReg(TIDReg); + getAddNoCarry(MBB, MI, DL, TmpReg) + .addImm(LDSOffset) + .addReg(TIDReg); return TmpReg; } @@ -2832,7 +2832,7 @@ return true; } -unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { +unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { switch (MI.getOpcode()) { default: return AMDGPU::INSTRUCTION_LIST_END; case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; @@ -2845,10 +2845,15 @@ return MI.getOperand(1).isReg() ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; case AMDGPU::S_ADD_I32: - case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32; - case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32; + return AMDGPU::V_ADD_I32_e32; + case AMDGPU::S_ADD_U32: + return ST.hasAddNoCarry() ? 
AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32; + case AMDGPU::S_ADDC_U32: + return AMDGPU::V_ADDC_U32_e32; case AMDGPU::S_SUB_I32: - case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; + return AMDGPU::V_SUB_I32_e32; + case AMDGPU::S_SUB_U32: + return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32; case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; @@ -2895,10 +2900,6 @@ } } -bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const { - return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END; -} - const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, unsigned OpNo) const { const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); @@ -3613,6 +3614,15 @@ splitScalar64BitAddSub(Worklist, Inst); Inst.eraseFromParent(); continue; + case AMDGPU::S_ADD_I32: + case AMDGPU::S_SUB_I32: + case AMDGPU::S_ADD_U32: + case AMDGPU::S_SUB_U32: + if (moveScalarAddSub(Worklist, Inst)) + continue; + + // Default handling + break; case AMDGPU::S_AND_B64: splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64); Inst.eraseFromParent(); @@ -3721,6 +3731,14 @@ auto Add = MRI.getUniqueVRegDef(VAddr->getReg()); unsigned Offset = 0; + // FIXME: This isn't safe because the addressing mode doesn't work + // correctly if vaddr is negative. + // + // FIXME: Handle v_add_u32 and VOP3 form. Also don't rely on immediate + // being in src0. + // + // FIXME: Should probably be done somewhere else, maybe SIFoldOperands. + // // See if we can extract an immediate offset by recognizing one of these: // V_ADD_I32_e32 dst, imm, src1 // V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1 @@ -3729,7 +3747,7 @@ const MachineOperand *Src = getNamedOperand(*Add, AMDGPU::OpName::src0); - if (Src && Src->isReg()) { + if (Src->isReg()) { auto Mov = MRI.getUniqueVRegDef(Src->getReg()); if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32) Src = &Mov->getOperand(1); @@ -3859,6 +3877,41 @@ } } +// Add/sub require special handling to deal with carry outs. +bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, + MachineInstr &Inst) const { + if (ST.hasAddNoCarry()) { + // Assume there is no user of scc since we don't select this in that case. + // Since scc isn't used, it doesn't really matter if the i32 or u32 variant + // is used. + + MachineBasicBlock &MBB = *Inst.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + unsigned OldDstReg = Inst.getOperand(0).getReg(); + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + unsigned Opc = Inst.getOpcode(); + assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32); + + unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ? 
+ AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64; + + assert(Inst.getOperand(3).getReg() == AMDGPU::SCC); + Inst.RemoveOperand(3); + + Inst.setDesc(get(NewOpc)); + Inst.addImplicitDefUseOperands(*MBB.getParent()); + MRI.replaceRegWith(OldDstReg, ResultReg); + legalizeOperands(Inst); + + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); + return true; + } + + return false; +} + void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); @@ -3871,7 +3924,10 @@ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg) + unsigned SubOp = ST.hasAddNoCarry() ? + AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32; + + BuildMI(MBB, MII, DL, get(SubOp), TmpReg) .addImm(0) .addReg(Src.getReg()); Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp =================================================================== --- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -481,9 +481,12 @@ if (CI.BaseOff) { BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; - BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg) - .addImm(CI.BaseOff) - .addReg(AddrReg->getReg()); + + unsigned AddOpc = STM->hasAddNoCarry() ? + AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32; + BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg) + .addImm(CI.BaseOff) + .addReg(AddrReg->getReg()); } MachineInstrBuilder Read2 = @@ -554,9 +557,12 @@ if (CI.BaseOff) { BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; - BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg) - .addImm(CI.BaseOff) - .addReg(Addr->getReg()); + + unsigned AddOpc = STM->hasAddNoCarry() ? 
+ AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32; + BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg) + .addImm(CI.BaseOff) + .addReg(Addr->getReg()); } MachineInstrBuilder Write2 = Index: test/CodeGen/AMDGPU/add.ll =================================================================== --- test/CodeGen/AMDGPU/add.ll +++ test/CodeGen/AMDGPU/add.ll @@ -1,14 +1,15 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SIVI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SIVI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -;FUNC-LABEL: {{^}}test1: -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; FUNC-LABEL: {{^}}s_add_i32: +; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;SI: s_add_i32 s[[REG:[0-9]+]], {{s[0-9]+, s[0-9]+}} -;SI: v_mov_b32_e32 v[[REG]], s[[REG]] -;SI: buffer_store_dword v[[REG]], -define amdgpu_kernel void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +; GCN: s_add_i32 s[[REG:[0-9]+]], {{s[0-9]+, s[0-9]+}} +; GCN: v_mov_b32_e32 v[[REG]], s[[REG]] +; GCN: buffer_store_dword v[[REG]], +define amdgpu_kernel void @s_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr @@ -17,14 +18,13 @@ ret void } -;FUNC-LABEL: {{^}}test2: -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; FUNC-LABEL: {{^}}s_add_v2i32: +; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} -;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} - -define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +define amdgpu_kernel void @s_add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1)* %in %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr @@ -33,18 +33,17 @@ ret void } -;FUNC-LABEL: {{^}}test4: -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} -;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} -;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} -;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} - -define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +; FUNC-LABEL: {{^}}s_add_v4i32: +; EG: ADD_INT {{[* 
]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +define amdgpu_kernel void @s_add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr @@ -53,7 +52,7 @@ ret void } -; FUNC-LABEL: {{^}}test8: +; FUNC-LABEL: {{^}}s_add_v8i32: ; EG: ADD_INT ; EG: ADD_INT ; EG: ADD_INT @@ -63,22 +62,22 @@ ; EG: ADD_INT ; EG: ADD_INT -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -define amdgpu_kernel void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) { +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +define amdgpu_kernel void @s_add_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) { entry: %0 = add <8 x i32> %a, %b store <8 x i32> %0, <8 x i32> addrspace(1)* %out ret void } -; FUNC-LABEL: {{^}}test16: +; FUNC-LABEL: {{^}}s_add_v16i32: ; EG: ADD_INT ; EG: ADD_INT ; EG: ADD_INT @@ -96,32 +95,62 @@ ; EG: ADD_INT ; EG: ADD_INT -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -define amdgpu_kernel void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) { +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +define amdgpu_kernel void @s_add_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) { entry: %0 = add <16 x i32> %a, %b store <16 x i32> %0, <16 x i32> addrspace(1)* %out ret void } +; FUNC-LABEL: {{^}}v_add_i32: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; SIVI: v_add_i32_e32 v{{[0-9]+}}, vcc, [[B]], [[A]] +; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @v_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid + %b_ptr = getelementptr i32, i32 addrspace(1)* %gep, i32 1 + %a = load volatile i32, i32 addrspace(1)* %gep + %b = load volatile i32, i32 addrspace(1)* %b_ptr + %result = add i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_add_imm_i32: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; SIVI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x7b, [[A]] +; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0x7b, [[A]] +define amdgpu_kernel void @v_add_imm_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %gep = 
getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid + %b_ptr = getelementptr i32, i32 addrspace(1)* %gep, i32 1 + %a = load volatile i32, i32 addrspace(1)* %gep + %result = add i32 %a, 123 + store i32 %result, i32 addrspace(1)* %out + ret void +} + ; FUNC-LABEL: {{^}}add64: -; SI: s_add_u32 -; SI: s_addc_u32 +; GCN: s_add_u32 +; GCN: s_addc_u32 ; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]] ; EG-DAG: ADD_INT {{[* ]*}} @@ -131,8 +160,8 @@ ; EG-NOT: SUB define amdgpu_kernel void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) { entry: - %0 = add i64 %a, %b - store i64 %0, i64 addrspace(1)* %out + %add = add i64 %a, %b + store i64 %add, i64 addrspace(1)* %out ret void } @@ -142,7 +171,7 @@ ; to a VGPR before doing the add. ; FUNC-LABEL: {{^}}add64_sgpr_vgpr: -; SI-NOT: v_addc_u32_e32 s +; GCN-NOT: v_addc_u32_e32 s ; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]] ; EG-DAG: ADD_INT {{[* ]*}} @@ -160,8 +189,8 @@ ; Test i64 add inside a branch. ; FUNC-LABEL: {{^}}add64_in_branch: -; SI: s_add_u32 -; SI: s_addc_u32 +; GCN: s_add_u32 +; GCN: s_addc_u32 ; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]] ; EG-DAG: ADD_INT {{[* ]*}} @@ -187,3 +216,8 @@ store i64 %3, i64 addrspace(1)* %out ret void } + +declare i32 @llvm.r600.read.tidig.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone speculatable } Index: test/CodeGen/AMDGPU/ds-combine-large-stride.ll =================================================================== --- test/CodeGen/AMDGPU/ds-combine-large-stride.ll +++ test/CodeGen/AMDGPU/ds-combine-large-stride.ll @@ -1,15 +1,18 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: ds_read32_combine_stride_400: ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] -; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] -; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] + +; VI-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] +; VI-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] +; VI-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] + +; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]] + ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:100 ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:100 ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:100 @@ -46,12 +49,15 @@ ; GCN-LABEL: ds_read32_combine_stride_400_back: ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] -; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] -; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] -; GFX9-DAG: 
v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] + +; VI-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] +; VI-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] +; VI-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] + +; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]] + ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:100 ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:100 ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:100 @@ -124,12 +130,13 @@ ; GCN-LABEL: ds_read32_combine_stride_8192_shifted: ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] -; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]] -; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]] +; VI-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] +; VI-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]] +; VI-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]] + +; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]] ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:32 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:32 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:32 @@ -160,8 +167,8 @@ ; GCN-LABEL: ds_read64_combine_stride_400: ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]] +; VI-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x960, [[BASE]] ; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:50 ; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:100 offset1:150 ; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:200 offset1:250 @@ -198,12 +205,13 @@ ; GCN-LABEL: ds_read64_combine_stride_8192_shifted: ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] -; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]] -; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]] +; VI-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] +; VI-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]] +; VI-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]] + +; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]] ; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:16 ; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:16 ; GCN-DAG: 
ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:16 @@ -234,12 +242,14 @@ ; GCN-LABEL: ds_write32_combine_stride_400: ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] -; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] -; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] +; VI-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] +; VI-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] +; VI-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] + +; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]] + ; GCN-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 ; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 ; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 @@ -267,12 +277,15 @@ ; GCN-LABEL: ds_write32_combine_stride_400_back: ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] -; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] -; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] + +; VI-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] +; VI-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] +; VI-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] + +; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]] + ; GCN-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 ; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 ; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 @@ -327,12 +340,15 @@ ; GCN-LABEL: ds_write32_combine_stride_8192_shifted: ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 4, [[BASE]] -; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4004, [[BASE]] -; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8004, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 4, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4004, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8004, [[BASE]] + +; VI-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 4, [[BASE]] +; VI-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4004, [[BASE]] +; VI-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8004, [[BASE]] + +; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 4, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4004, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8004, [[BASE]] + ; GCN-DAG: ds_write2st64_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32 ; GCN-DAG: ds_write2st64_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32 ; GCN-DAG: ds_write2st64_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32 @@ -356,8 +372,10 @@ ; GCN-LABEL: ds_write64_combine_stride_400: ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]] + +; VI-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x960, [[BASE]] + ; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50 ; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:100 offset1:150 ; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:200 offset1:250 @@ -385,12 +403,15 @@ ; GCN-LABEL: ds_write64_combine_stride_8192_shifted: ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] -; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]] -; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]] -; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]] + +; VI-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] +; VI-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]] +; VI-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]] + +; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]] +; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]] + ; GCN-DAG: ds_write2st64_b64 [[B1]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16 ; GCN-DAG: ds_write2st64_b64 [[B2]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16 ; GCN-DAG: ds_write2st64_b64 [[B3]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16 Index: test/CodeGen/AMDGPU/ds-sub-offset.ll =================================================================== --- test/CodeGen/AMDGPU/ds-sub-offset.ll +++ test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s declare i32 @llvm.amdgcn.workitem.id.x() #0 @@ -6,7 +7,8 @@ ; GCN-LABEL: {{^}}write_ds_sub0_offset0_global: ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v0 -; GCN: v_sub_i32_e32 [[BASEPTR:v[0-9]+]], vcc, 0, [[SHL]] +; CI: v_sub_i32_e32 [[BASEPTR:v[0-9]+]], vcc, 0, [[SHL]] +; GFX9: v_sub_u32_e32 [[BASEPTR:v[0-9]+]], 0, [[SHL]] ; GCN: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b ; GCN: ds_write_b32 [[BASEPTR]], [[VAL]] offset:12 define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 { @@ -21,7 +23,8 @@ ; GCN-LABEL: {{^}}add_x_shl_neg_to_sub_max_offset: ; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0 -; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]] +; CI-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]] +; GFX9-DAG: v_sub_u32_e32 [[NEG:v[0-9]+]], 0, [[SCALED]] ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13 ; GCN: ds_write_b8 [[NEG]], [[K]] offset:65535 define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset() #1 { @@ -36,7 +39,8 @@ ; GCN-LABEL: {{^}}add_x_shl_neg_to_sub_max_offset_p1: ; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0 -; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0x10000, [[SCALED]] +; CI-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0x10000, 
[[SCALED]] +; GFX9-DAG: v_sub_u32_e32 [[NEG:v[0-9]+]], 0x10000, [[SCALED]] ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13 ; GCN: ds_write_b8 [[NEG]], [[K]]{{$}} define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 { @@ -51,7 +55,8 @@ ; GCN-LABEL: {{^}}add_x_shl_neg_to_sub_multi_use: ; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0 -; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]] +; CI-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]] +; GFX9-DAG: v_sub_u32_e32 [[NEG:v[0-9]+]], 0, [[SCALED]] ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13 ; GCN-NOT: v_sub ; GCN: ds_write_b32 [[NEG]], [[K]] offset:123{{$}} @@ -73,7 +78,8 @@ ; GCN-LABEL: {{^}}add_x_shl_neg_to_sub_multi_use_same_offset: ; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0 -; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]] +; CI-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]] +; GFX9-DAG: v_sub_u32_e32 [[NEG:v[0-9]+]], 0, [[SCALED]] ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13 ; GCN-NOT: v_sub ; GCN: ds_write_b32 [[NEG]], [[K]] offset:123{{$}} @@ -93,7 +99,8 @@ ; GCN-LABEL: {{^}}add_x_shl_neg_to_sub_misaligned_i64_max_offset: ; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0 -; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]] +; CI-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]] +; GFX9-DAG: v_sub_u32_e32 [[NEG:v[0-9]+]], 0, [[SCALED]] ; GCN: ds_write2_b32 [[NEG]], {{v[0-9]+}}, {{v[0-9]+}} offset0:254 offset1:255 define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 { %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 @@ -107,7 +114,8 @@ ; GCN-LABEL: {{^}}add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1: ; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0 -; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0x3fc, [[SCALED]] +; CI-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0x3fc, [[SCALED]] +; GFX9-DAG: v_sub_u32_e32 [[NEG:v[0-9]+]], 0x3fc, [[SCALED]] ; GCN: ds_write2_b32 [[NEG]], {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}} define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 { %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 Index: test/CodeGen/AMDGPU/ds_read2.ll =================================================================== --- test/CodeGen/AMDGPU/ds_read2.ll +++ test/CodeGen/AMDGPU/ds_read2.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,CI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s ; FIXME: We don't get cases where the address was an SGPR because we ; get a copy to the address register for each one. 
@@ -6,12 +7,12 @@ @lds = addrspace(3) global [512 x float] undef, align 4 @lds.f64 = addrspace(3) global [512 x double] undef, align 8 -; SI-LABEL: @simple_read2_f32 -; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm +; GCN-LABEL: @simple_read2_f32 +; GCN: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8 +; GCN: s_waitcnt lgkmcnt(0) +; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GCN: s_endpgm define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i @@ -25,12 +26,12 @@ ret void } -; SI-LABEL: @simple_read2_f32_max_offset -; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:255 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm +; GCN-LABEL: @simple_read2_f32_max_offset +; GCN: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:255 +; GCN: s_waitcnt lgkmcnt(0) +; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GCN: s_endpgm define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i @@ -44,11 +45,11 @@ ret void } -; SI-LABEL: @simple_read2_f32_too_far -; SI-NOT ds_read2_b32 -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 -; SI: s_endpgm +; GCN-LABEL: @simple_read2_f32_too_far +; GCN-NOT ds_read2_b32 +; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} +; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 +; GCN: s_endpgm define amdgpu_kernel void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i @@ -62,10 +63,10 @@ ret void } -; SI-LABEL: @simple_read2_f32_x2 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 -; SI: s_endpgm +; GCN-LABEL: @simple_read2_f32_x2 +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8 +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 +; GCN: s_endpgm define amdgpu_kernel void @simple_read2_f32_x2(float addrspace(1)* %out) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 0 @@ -93,11 +94,11 @@ } ; Make sure there is an instruction between the two sets of reads. 
-; SI-LABEL: @simple_read2_f32_x2_barrier -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8 -; SI: s_barrier -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 -; SI: s_endpgm +; GCN-LABEL: @simple_read2_f32_x2_barrier +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8 +; GCN: s_barrier +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 +; GCN: s_endpgm define amdgpu_kernel void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 0 @@ -129,10 +130,10 @@ ; For some reason adding something to the base address for the first ; element results in only folding the inner pair. -; SI-LABEL: @simple_read2_f32_x2_nonzero_base -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 -; SI: s_endpgm +; GCN-LABEL: @simple_read2_f32_x2_nonzero_base +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8 +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 +; GCN: s_endpgm define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 @@ -165,11 +166,11 @@ ; Base pointers come from different subregister of same super ; register. We can't safely merge this. -; SI-LABEL: @read2_ptr_is_subreg_arg_f32 -; SI-NOT: ds_read2_b32 -; SI: ds_read_b32 -; SI: ds_read_b32 -; SI: s_endpgm +; GCN-LABEL: @read2_ptr_is_subreg_arg_f32 +; GCN-NOT: ds_read2_b32 +; GCN: ds_read_b32 +; GCN: ds_read_b32 +; GCN: s_endpgm define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 @@ -191,11 +192,11 @@ ; sure we are really rejecting it because of the different ; subregisters. 
-; SI-LABEL: @read2_ptr_is_subreg_arg_offset_f32 -; SI-NOT: ds_read2_b32 -; SI: ds_read_b32 -; SI: ds_read_b32 -; SI: s_endpgm +; GCN-LABEL: @read2_ptr_is_subreg_arg_offset_f32 +; GCN-NOT: ds_read2_b32 +; GCN: ds_read_b32 +; GCN: ds_read_b32 +; GCN: s_endpgm define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 @@ -216,9 +217,9 @@ ret void } -; SI-LABEL: {{^}}read2_ptr_is_subreg_f32: -; SI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:8{{$}} -; SI: s_endpgm +; GCN-LABEL: {{^}}read2_ptr_is_subreg_f32: +; GCN: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:8{{$}} +; GCN: s_endpgm define amdgpu_kernel void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0 @@ -238,11 +239,11 @@ ret void } -; SI-LABEL: @simple_read2_f32_volatile_0 -; SI-NOT ds_read2_b32 -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 -; SI: s_endpgm +; GCN-LABEL: @simple_read2_f32_volatile_0 +; GCN-NOT ds_read2_b32 +; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} +; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 +; GCN: s_endpgm define amdgpu_kernel void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i @@ -256,11 +257,11 @@ ret void } -; SI-LABEL: @simple_read2_f32_volatile_1 -; SI-NOT ds_read2_b32 -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 -; SI: s_endpgm +; GCN-LABEL: @simple_read2_f32_volatile_1 +; GCN-NOT ds_read2_b32 +; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} +; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 +; GCN: s_endpgm define amdgpu_kernel void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i @@ -277,9 +278,9 @@ ; Can't fold since not correctly aligned. ; XXX: This isn't really testing anything useful now. I think CI ; allows unaligned LDS accesses, which would be a problem here. 
-; SI-LABEL: @unaligned_read2_f32 -; SI-NOT: ds_read2_b32 -; SI: s_endpgm +; GCN-LABEL: @unaligned_read2_f32 +; GCN-NOT: ds_read2_b32 +; GCN: s_endpgm define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i @@ -293,9 +294,9 @@ ret void } -; SI-LABEL: @misaligned_2_simple_read2_f32 -; SI-NOT: ds_read2_b32 -; SI: s_endpgm +; GCN-LABEL: @misaligned_2_simple_read2_f32 +; GCN-NOT: ds_read2_b32 +; GCN: s_endpgm define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i @@ -309,12 +310,12 @@ ret void } -; SI-LABEL: @simple_read2_f64 -; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}} -; SI: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8 -; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} -; SI: buffer_store_dwordx2 [[RESULT]] -; SI: s_endpgm +; GCN-LABEL: @simple_read2_f64 +; GCN: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}} +; GCN: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8 +; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GCN: s_endpgm define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i @@ -328,9 +329,9 @@ ret void } -; SI-LABEL: @simple_read2_f64_max_offset -; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255 -; SI: s_endpgm +; GCN-LABEL: @simple_read2_f64_max_offset +; GCN: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255 +; GCN: s_endpgm define amdgpu_kernel void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i @@ -344,11 +345,11 @@ ret void } -; SI-LABEL: @simple_read2_f64_too_far -; SI-NOT ds_read2_b64 -; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} -; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056 -; SI: s_endpgm +; GCN-LABEL: @simple_read2_f64_too_far +; GCN-NOT ds_read2_b64 +; GCN: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GCN: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056 +; GCN: s_endpgm define amdgpu_kernel void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i @@ -363,10 +364,10 @@ } ; Alignment only 4 -; SI-LABEL: @misaligned_read2_f64 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15 -; SI: s_endpgm +; GCN-LABEL: @misaligned_read2_f64 +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1 +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15 +; GCN: s_endpgm define amdgpu_kernel void @misaligned_read2_f64(double 
addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i @@ -382,9 +383,9 @@ @foo = addrspace(3) global [4 x i32] undef, align 4 -; SI-LABEL: @load_constant_adjacent_offsets -; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 +; GCN-LABEL: @load_constant_adjacent_offsets +; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) { %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 @@ -393,9 +394,9 @@ ret void } -; SI-LABEL: @load_constant_disjoint_offsets -; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2 +; GCN-LABEL: @load_constant_disjoint_offsets +; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2 define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) { %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 @@ -406,10 +407,10 @@ @bar = addrspace(3) global [4 x i64] undef, align 4 -; SI-LABEL: @load_misaligned64_constant_offsets -; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3 +; GCN-LABEL: @load_misaligned64_constant_offsets +; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3 define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) { %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 @@ -420,12 +421,12 @@ @bar.large = addrspace(3) global [4096 x i64] undef, align 4 -; SI-LABEL: @load_misaligned64_constant_large_offsets -; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}} -; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000 -; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1 -; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1 -; SI: s_endpgm +; GCN-LABEL: @load_misaligned64_constant_large_offsets +; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}} +; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000 +; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1 +; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1 +; GCN: s_endpgm define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) { %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 @@ -493,8 
+494,8 @@ ret void } -; SI-LABEL: ds_read_diff_base_interleaving -; SI-NOT: ds_read_b32 +; GCN-LABEL: ds_read_diff_base_interleaving +; GCN-NOT: ds_read_b32 define amdgpu_kernel void @ds_read_diff_base_interleaving( float addrspace(1)* nocapture %arg, [4 x [4 x float]] addrspace(3)* %arg1, @@ -533,21 +534,13 @@ ret void } -; Function Attrs: nounwind readnone declare i32 @llvm.amdgcn.workgroup.id.x() #1 - -; Function Attrs: nounwind readnone declare i32 @llvm.amdgcn.workgroup.id.y() #1 - -; Function Attrs: nounwind readnone declare i32 @llvm.amdgcn.workitem.id.x() #1 - -; Function Attrs: nounwind readnone declare i32 @llvm.amdgcn.workitem.id.y() #1 -; Function Attrs: convergent nounwind declare void @llvm.amdgcn.s.barrier() #2 attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } +attributes #1 = { nounwind readnone speculatable } attributes #2 = { convergent nounwind } Index: test/CodeGen/AMDGPU/ds_write2.ll =================================================================== --- test/CodeGen/AMDGPU/ds_write2.ll +++ test/CodeGen/AMDGPU/ds_write2.ll @@ -1,14 +1,14 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,CI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s @lds = addrspace(3) global [512 x float] undef, align 4 @lds.f64 = addrspace(3) global [512 x double] undef, align 8 - -; SI-LABEL: @simple_write2_one_val_f32 -; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]] -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8 -; SI: s_endpgm +; GCN-LABEL: @simple_write2_one_val_f32 +; GCN-DAG: {{flat|global}}_load_dword [[VAL:v[0-9]+]] +; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; GCN: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i @@ -21,12 +21,13 @@ ret void } -; SI-LABEL: @simple_write2_two_val_f32 -; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 -; SI: s_endpgm +; GCN-LABEL: @simple_write2_two_val_f32 +; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; GCN: {{flat|global}}_load_dword [[VAL0:v[0-9]+]], +; GCN-DAG: {{flat|global}}_load_dword [[VAL1:v[0-9]+]], + +; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i @@ -41,11 +42,11 @@ ret void } -; SI-LABEL: @simple_write2_two_val_f32_volatile_0 -; SI-NOT: ds_write2_b32 -; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} -; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} 
offset:32 -; SI: s_endpgm +; GCN-LABEL: @simple_write2_two_val_f32_volatile_0 +; GCN-NOT: ds_write2_b32 +; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} +; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i @@ -60,11 +61,11 @@ ret void } -; SI-LABEL: @simple_write2_two_val_f32_volatile_1 -; SI-NOT: ds_write2_b32 -; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} -; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 -; SI: s_endpgm +; GCN-LABEL: @simple_write2_two_val_f32_volatile_1 +; GCN-NOT: ds_write2_b32 +; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} +; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i @@ -80,12 +81,12 @@ } ; 2 data subregisters from different super registers. -; SI-LABEL: @simple_write2_two_val_subreg2_mixed_f32 -; SI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}} -; SI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}} -; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 -; SI: s_endpgm +; GCN-LABEL: @simple_write2_two_val_subreg2_mixed_f32 +; GCN: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; GCN: {{flat|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}} +; GCN: {{flat|global}}_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}} +; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i @@ -102,11 +103,11 @@ ret void } -; SI-LABEL: @simple_write2_two_val_subreg2_f32 -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 -; SI: s_endpgm +; GCN-LABEL: @simple_write2_two_val_subreg2_f32 +; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} +; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i @@ -121,11 +122,11 @@ ret void } -; SI-LABEL: @simple_write2_two_val_subreg4_f32 -; SI-DAG: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 -; SI: s_endpgm +; GCN-LABEL: @simple_write2_two_val_subreg4_f32 +; GCN-DAG: {{flat|global}}_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} +; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 +; 
GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i @@ -140,12 +141,12 @@ ret void } -; SI-LABEL: @simple_write2_two_val_max_offset_f32 -; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 -; SI: s_endpgm +; GCN-LABEL: @simple_write2_two_val_max_offset_f32 +; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; GCN: {{flat|global}}_load_dword [[VAL0:v[0-9]+]], +; GCN: {{flat|global}}_load_dword [[VAL1:v[0-9]+]], +; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i @@ -160,10 +161,10 @@ ret void } -; SI-LABEL: @simple_write2_two_val_too_far_f32 -; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} -; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 -; SI: s_endpgm +; GCN-LABEL: @simple_write2_two_val_too_far_f32 +; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} +; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i @@ -178,10 +179,10 @@ ret void } -; SI-LABEL: @simple_write2_two_val_f32_x2 -; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8 -; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 -; SI: s_endpgm +; GCN-LABEL: @simple_write2_two_val_f32_x2 +; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8 +; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x @@ -208,10 +209,10 @@ ret void } -; SI-LABEL: @simple_write2_two_val_f32_x2_nonzero_base -; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8 -; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 -; SI: s_endpgm +; GCN-LABEL: @simple_write2_two_val_f32_x2_nonzero_base +; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8 +; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x @@ -238,11 +239,11 @@ ret void } -; SI-LABEL: @write2_ptr_subreg_arg_two_val_f32 -; SI-NOT: 
ds_write2_b32 -; SI: ds_write_b32 -; SI: ds_write_b32 -; SI: s_endpgm +; GCN-LABEL: @write2_ptr_subreg_arg_two_val_f32 +; GCN-NOT: ds_write2_b32 +; GCN: ds_write_b32 +; GCN: ds_write_b32 +; GCN: s_endpgm define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i @@ -265,11 +266,11 @@ ret void } -; SI-LABEL: @simple_write2_one_val_f64 -; SI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} -; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8 -; SI: s_endpgm +; GCN-LABEL: @simple_write2_one_val_f64 +; GCN-DAG: {{flat|global}}_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], +; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} +; GCN: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i @@ -282,12 +283,12 @@ ret void } -; SI-LABEL: @misaligned_simple_write2_one_val_f64 -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1 -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15 -; SI: s_endpgm +; GCN-LABEL: @misaligned_simple_write2_one_val_f64 +; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} +; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} +; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1 +; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15 +; GCN: s_endpgm define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i @@ -300,12 +301,12 @@ ret void } -; SI-LABEL: @simple_write2_two_val_f64 -; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} -; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 -; SI: s_endpgm +; GCN-LABEL: @simple_write2_two_val_f64 +; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} +; GCN: {{flat|global}}_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], +; GCN: {{flat|global}}_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], +; GCN: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i @@ -322,19 +323,19 @@ @foo = addrspace(3) global [4 x i32] undef, align 4 -; SI-LABEL: @store_constant_adjacent_offsets -; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; GCN-LABEL: @store_constant_adjacent_offsets +; GCN: v_mov_b32_e32 
[[ZERO:v[0-9]+]], 0{{$}} +; GCN: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 define amdgpu_kernel void @store_constant_adjacent_offsets() { store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 ret void } -; SI-LABEL: @store_constant_disjoint_offsets -; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}} -; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2 +; GCN-LABEL: @store_constant_disjoint_offsets +; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}} +; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GCN: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2 define amdgpu_kernel void @store_constant_disjoint_offsets() { store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 @@ -343,11 +344,11 @@ @bar = addrspace(3) global [4 x i64] undef, align 4 -; SI-LABEL: @store_misaligned64_constant_offsets -; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 -; SI: s_endpgm +; GCN-LABEL: @store_misaligned64_constant_offsets +; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 +; GCN: s_endpgm define amdgpu_kernel void @store_misaligned64_constant_offsets() { store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 @@ -356,12 +357,12 @@ @bar.large = addrspace(3) global [4096 x i64] undef, align 4 -; SI-LABEL: @store_misaligned64_constant_large_offsets -; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}} -; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}} -; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; SI: s_endpgm +; GCN-LABEL: @store_misaligned64_constant_large_offsets +; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}} +; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}} +; GCN-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; GCN-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; GCN: s_endpgm define amdgpu_kernel void @store_misaligned64_constant_large_offsets() { store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 @@ -406,10 +407,10 @@ ret void } -; CI-LABEL: {{^}}simple_write2_v4f32_superreg_align4: -; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:2{{$}} -; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:1{{$}} -; CI: s_endpgm +; GCN: {{^}}simple_write2_v4f32_superreg_align4: +; GCN: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:2 offset1:3{{$}} +; GCN: ds_write2_b32 
{{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}} +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in @@ -419,18 +420,11 @@ ret void } -; Function Attrs: nounwind readnone declare i32 @llvm.amdgcn.workgroup.id.x() #1 - -; Function Attrs: nounwind readnone declare i32 @llvm.amdgcn.workgroup.id.y() #1 - -; Function Attrs: nounwind readnone declare i32 @llvm.amdgcn.workitem.id.x() #1 - -; Function Attrs: nounwind readnone declare i32 @llvm.amdgcn.workitem.id.y() #1 attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } +attributes #1 = { nounwind readnone speculatable } attributes #2 = { convergent nounwind } Index: test/CodeGen/AMDGPU/function-args.ll =================================================================== --- test/CodeGen/AMDGPU/function-args.ll +++ test/CodeGen/AMDGPU/function-args.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI,GFX89 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s ; GCN-LABEL: {{^}}void_func_i1: ; GCN: v_and_b32_e32 v0, 1, v0 @@ -24,7 +24,7 @@ ; GCN-LABEL: {{^}}void_func_i1_signext: ; GCN: s_waitcnt -; GCN-NEXT: v_add_i32_e32 v0, vcc, 12, v0 +; GCN-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}12, v0 ; GCN-NOT: v0 ; GCN: buffer_store_dword v0, off define void @void_func_i1_signext(i1 signext %arg0) #0 { @@ -60,7 +60,7 @@ ; GCN-LABEL: {{^}}void_func_i8_zeroext: ; GCN-NOT: and_b32 -; GCN: v_add_i32_e32 v0, vcc, 12, v0 +; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}12, v0 define void @void_func_i8_zeroext(i8 zeroext %arg0) #0 { %ext = zext i8 %arg0 to i32 %add = add i32 %ext, 12 @@ -70,7 +70,7 @@ ; GCN-LABEL: {{^}}void_func_i8_signext: ; GCN-NOT: v_bfe_i32 -; GCN: v_add_i32_e32 v0, vcc, 12, v0 +; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}12, v0 define void @void_func_i8_signext(i8 signext %arg0) #0 { %ext = sext i8 %arg0 to i32 %add = add i32 %ext, 12 @@ -87,7 +87,7 @@ ; GCN-LABEL: {{^}}void_func_i16_zeroext: ; GCN-NOT: v0 -; GCN: v_add_i32_e32 v0, vcc, 12, v0 +; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}12, v0 define void @void_func_i16_zeroext(i16 zeroext %arg0) #0 { %ext = zext i16 %arg0 to i32 %add = add i32 %ext, 12 @@ -97,7 +97,7 @@ ; GCN-LABEL: {{^}}void_func_i16_signext: ; GCN-NOT: v0 -; GCN: v_add_i32_e32 v0, vcc, 12, v0 +; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}12, v0 define void @void_func_i16_signext(i16 signext %arg0) #0 { %ext = sext i16 %arg0 to i32 %add = add i32 %ext, 12 @@ -582,7 +582,7 @@ ; GCN: buffer_store_byte [[TRUNC_ARG1_I1]], off ; GCN: buffer_store_byte [[LOAD_ARG2]], off ; GCN: buffer_store_short [[LOAD_ARG3]], off -; VI: 
buffer_store_short [[LOAD_ARG4]], off +; GFX89: buffer_store_short [[LOAD_ARG4]], off ; CI: buffer_store_short [[CVT_ARG4]], off define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i16 %arg3, half %arg4) #0 { Index: test/CodeGen/AMDGPU/function-returns.ll =================================================================== --- test/CodeGen/AMDGPU/function-returns.ll +++ test/CodeGen/AMDGPU/function-returns.ll @@ -396,100 +396,100 @@ ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_4:v[0-9]+]], vcc, 4, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_4:v[0-9]+]], {{(vcc, )?}}4, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_4]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_8:v[0-9]+]], vcc, 8, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_8:v[0-9]+]], {{(vcc, )?}}8, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_8]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_12:v[0-9]+]], vcc, 12, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_12:v[0-9]+]], {{(vcc, )?}}12, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_12]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_16:v[0-9]+]], vcc, 16, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_16:v[0-9]+]], {{(vcc, )?}}16, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_16]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_20:v[0-9]+]], vcc, 20, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_20:v[0-9]+]], {{(vcc, )?}}20, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_20]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_24:v[0-9]+]], vcc, 24, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_24:v[0-9]+]], {{(vcc, )?}}24, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_24]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_28:v[0-9]+]], vcc, 28, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_28:v[0-9]+]], {{(vcc, )?}}28, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_28]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_32:v[0-9]+]], vcc, 32, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_32:v[0-9]+]], {{(vcc, )?}}32, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_32]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_36:v[0-9]+]], vcc, 36, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_36:v[0-9]+]], {{(vcc, )?}}36, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_36]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_40:v[0-9]+]], vcc, 40, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_40:v[0-9]+]], {{(vcc, )?}}40, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_40]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_44:v[0-9]+]], vcc, 44, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_44:v[0-9]+]], {{(vcc, )?}}44, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_44]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_48:v[0-9]+]], vcc, 48, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_48:v[0-9]+]], {{(vcc, )?}}48, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_48]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_52:v[0-9]+]], vcc, 52, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_52:v[0-9]+]], {{(vcc, )?}}52, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_52]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_56:v[0-9]+]], vcc, 56, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_56:v[0-9]+]], {{(vcc, )?}}56, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_56]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_60:v[0-9]+]], vcc, 60, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_60:v[0-9]+]], {{(vcc, )?}}60, v0 ; GCN-DAG: 
buffer_store_dword v{{[0-9]+}}, [[ADD_60]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_64:v[0-9]+]], vcc, 64, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_64:v[0-9]+]], {{(vcc, )?}}64, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_64]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_68:v[0-9]+]], vcc, 0x44, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_68:v[0-9]+]], {{(vcc, )?}}0x44, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_68]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_72:v[0-9]+]], vcc, 0x48, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_72:v[0-9]+]], {{(vcc, )?}}0x48, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_72]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_76:v[0-9]+]], vcc, 0x4c, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_76:v[0-9]+]], {{(vcc, )?}}0x4c, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_76]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_80:v[0-9]+]], vcc, 0x50, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_80:v[0-9]+]], {{(vcc, )?}}0x50, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_80]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_84:v[0-9]+]], vcc, 0x54, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_84:v[0-9]+]], {{(vcc, )?}}0x54, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_84]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_88:v[0-9]+]], vcc, 0x58, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_88:v[0-9]+]], {{(vcc, )?}}0x58, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_88]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_92:v[0-9]+]], vcc, 0x5c, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_92:v[0-9]+]], {{(vcc, )?}}0x5c, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_92]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_96:v[0-9]+]], vcc, 0x60, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_96:v[0-9]+]], {{(vcc, )?}}0x60, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_96]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_100:v[0-9]+]], vcc, 0x64, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_100:v[0-9]+]], {{(vcc, )?}}0x64, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_100]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_104:v[0-9]+]], vcc, 0x68, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_104:v[0-9]+]], {{(vcc, )?}}0x68, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_104]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_108:v[0-9]+]], vcc, 0x6c, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_108:v[0-9]+]], {{(vcc, )?}}0x6c, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_108]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_112:v[0-9]+]], vcc, 0x70, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_112:v[0-9]+]], {{(vcc, )?}}0x70, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_112]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_116:v[0-9]+]], vcc, 0x74, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_116:v[0-9]+]], {{(vcc, )?}}0x74, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_116]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_120:v[0-9]+]], vcc, 0x78, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_120:v[0-9]+]], {{(vcc, )?}}0x78, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_120]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_124:v[0-9]+]], vcc, 0x7c, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_124:v[0-9]+]], {{(vcc, )?}}0x7c, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_124]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_128:v[0-9]+]], vcc, 0x80, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 
[[ADD_128:v[0-9]+]], {{(vcc, )?}}0x80, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_128]], s[0:3], s4 offen{{$}} ; GCN: buffer_load_dword v34 @@ -510,100 +510,100 @@ ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_4:v[0-9]+]], vcc, 4, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_4:v[0-9]+]], {{(vcc, )?}}4, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_4]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_8:v[0-9]+]], vcc, 8, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_8:v[0-9]+]], {{(vcc, )?}}8, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_8]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_12:v[0-9]+]], vcc, 12, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_12:v[0-9]+]], {{(vcc, )?}}12, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_12]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_16:v[0-9]+]], vcc, 16, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_16:v[0-9]+]], {{(vcc, )?}}16, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_16]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_20:v[0-9]+]], vcc, 20, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_20:v[0-9]+]], {{(vcc, )?}}20, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_20]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_24:v[0-9]+]], vcc, 24, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_24:v[0-9]+]], {{(vcc, )?}}24, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_24]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_28:v[0-9]+]], vcc, 28, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_28:v[0-9]+]], {{(vcc, )?}}28, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_28]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_32:v[0-9]+]], vcc, 32, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_32:v[0-9]+]], {{(vcc, )?}}32, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_32]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_36:v[0-9]+]], vcc, 36, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_36:v[0-9]+]], {{(vcc, )?}}36, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_36]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_40:v[0-9]+]], vcc, 40, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_40:v[0-9]+]], {{(vcc, )?}}40, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_40]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_44:v[0-9]+]], vcc, 44, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_44:v[0-9]+]], {{(vcc, )?}}44, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_44]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_48:v[0-9]+]], vcc, 48, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_48:v[0-9]+]], {{(vcc, )?}}48, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_48]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_52:v[0-9]+]], vcc, 52, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_52:v[0-9]+]], {{(vcc, )?}}52, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_52]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_56:v[0-9]+]], vcc, 56, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_56:v[0-9]+]], {{(vcc, )?}}56, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_56]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_60:v[0-9]+]], vcc, 60, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_60:v[0-9]+]], {{(vcc, )?}}60, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_60]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_64:v[0-9]+]], vcc, 64, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_64:v[0-9]+]], {{(vcc, )?}}64, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_64]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 
[[ADD_68:v[0-9]+]], vcc, 0x44, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_68:v[0-9]+]], {{(vcc, )?}}0x44, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_68]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_72:v[0-9]+]], vcc, 0x48, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_72:v[0-9]+]], {{(vcc, )?}}0x48, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_72]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_76:v[0-9]+]], vcc, 0x4c, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_76:v[0-9]+]], {{(vcc, )?}}0x4c, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_76]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_80:v[0-9]+]], vcc, 0x50, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_80:v[0-9]+]], {{(vcc, )?}}0x50, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_80]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_84:v[0-9]+]], vcc, 0x54, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_84:v[0-9]+]], {{(vcc, )?}}0x54, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_84]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_88:v[0-9]+]], vcc, 0x58, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_88:v[0-9]+]], {{(vcc, )?}}0x58, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_88]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_92:v[0-9]+]], vcc, 0x5c, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_92:v[0-9]+]], {{(vcc, )?}}0x5c, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_92]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_96:v[0-9]+]], vcc, 0x60, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_96:v[0-9]+]], {{(vcc, )?}}0x60, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_96]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_100:v[0-9]+]], vcc, 0x64, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_100:v[0-9]+]], {{(vcc, )?}}0x64, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_100]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_104:v[0-9]+]], vcc, 0x68, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_104:v[0-9]+]], {{(vcc, )?}}0x68, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_104]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_108:v[0-9]+]], vcc, 0x6c, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_108:v[0-9]+]], {{(vcc, )?}}0x6c, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_108]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_112:v[0-9]+]], vcc, 0x70, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_112:v[0-9]+]], {{(vcc, )?}}0x70, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_112]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_116:v[0-9]+]], vcc, 0x74, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_116:v[0-9]+]], {{(vcc, )?}}0x74, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_116]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_120:v[0-9]+]], vcc, 0x78, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_120:v[0-9]+]], {{(vcc, )?}}0x78, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_120]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_124:v[0-9]+]], vcc, 0x7c, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_124:v[0-9]+]], {{(vcc, )?}}0x7c, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_124]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_128:v[0-9]+]], vcc, 0x80, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_128:v[0-9]+]], {{(vcc, )?}}0x80, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_128]], s[0:3], s4 offen{{$}} ; GCN: buffer_load_dword v34 @@ -623,11 +623,11 @@ ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_128:v[0-9]+]], vcc, 0x80, v0 +; GCN-DAG: 
v_add_{{i|u}}32_e32 [[ADD_128:v[0-9]+]], {{(vcc, )?}}0x80, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_128]], s[0:3], s4 offen{{$}} -; GCN-DAG: v_add_i32_e32 [[ADD_256:v[0-9]+]], vcc, 0xfc, v0 +; GCN-DAG: v_add_{{i|u}}32_e32 [[ADD_256:v[0-9]+]], {{(vcc, )?}}0xfc, v0 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_256]], s[0:3], s4 offen{{$}} ; GCN: buffer_load_dword v33 Index: test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll @@ -51,7 +51,7 @@ ; GCN: s_bfm_b64 exec, s1, 0 ; GCN: s_cmp_eq_u32 s1, 64 ; GCN: s_cmov_b64 exec, -1 -; GCN: v_add_i32_e32 v0, vcc, s0, v0 +; GCN: v_add_u32_e32 v0, s0, v0 define amdgpu_ps float @reuse_input(i32 inreg %count, i32 %a) { main_body: call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19) @@ -65,7 +65,7 @@ ; GCN: s_bfm_b64 exec, s1, 0 ; GCN: s_cmp_eq_u32 s1, 64 ; GCN: s_cmov_b64 exec, -1 -; GCN: v_add_i32_e32 v0, vcc, s0, v0 +; GCN: v_add_u32_e32 v0, s0, v0 define amdgpu_ps float @reuse_input2(i32 inreg %count, i32 %a) { main_body: %s = add i32 %a, %count Index: test/CodeGen/AMDGPU/pack.v2f16.ll =================================================================== --- test/CodeGen/AMDGPU/pack.v2f16.ll +++ test/CodeGen/AMDGPU/pack.v2f16.ll @@ -87,7 +87,7 @@ ; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VAL0]] ; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[ELT0]] -; GFX9: v_add_i32_e32 v{{[0-9]+}}, vcc, 9, [[PACKED]] +; GFX9: v_add_u32_e32 v{{[0-9]+}}, 9, [[PACKED]] define amdgpu_kernel void @v_pack_v2f16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 Index: test/CodeGen/AMDGPU/pack.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/pack.v2i16.ll +++ test/CodeGen/AMDGPU/pack.v2i16.ll @@ -81,7 +81,7 @@ ; GFX9: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL0]] ; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[MASKED]] -; GFX9: v_add_i32_e32 v{{[0-9]+}}, vcc, 9, [[PACKED]] +; GFX9: v_add_u32_e32 v{{[0-9]+}}, 9, [[PACKED]] define amdgpu_kernel void @v_pack_v2i16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 Index: test/CodeGen/AMDGPU/sdwa-peephole.ll =================================================================== --- test/CodeGen/AMDGPU/sdwa-peephole.ll +++ test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -7,7 +7,8 @@ ; NOSDWA: v_add_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]] ; NOSDWA-NOT: v_add_i32_sdwa -; SDWA: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9: v_add_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 define amdgpu_kernel void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %a = load i32, i32 addrspace(1)* %in, align 4 @@ -22,7 +23,8 @@ ; NOSDWA: v_subrev_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]] ; NOSDWA-NOT: v_subrev_i32_sdwa -; SDWA: v_subrev_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI: v_subrev_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9: v_sub_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD define amdgpu_kernel void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %a = load i32, i32 addrspace(1)* %in, align 4 Index: test/CodeGen/AMDGPU/sibling-call.ll =================================================================== --- test/CodeGen/AMDGPU/sibling-call.ll +++ test/CodeGen/AMDGPU/sibling-call.ll @@ -1,10 +1,12 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,MESA %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MESA %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VI,MESA %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI,MESA %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI,MESA %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,MESA %s +; FIXME: Why is this commuted only sometimes? ; GCN-LABEL: {{^}}i32_fastcc_i32_i32: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CIVI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 { %add0 = add i32 %arg0, %arg1 @@ -13,7 +15,8 @@ ; GCN-LABEL: {{^}}i32_fastcc_i32_i32_stack_object: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN: v_add_i32_e32 v0, vcc, v1, v +; CIVI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GCN: s_mov_b32 s5, s32 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:24 ; GCN: s_waitcnt vmcnt(0) @@ -83,7 +86,10 @@ ; GCN-NEXT: s_mov_b32 s5, s32 ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s5 offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 + +; CIVI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 + ; GCN-NEXT: s_setpc_b64 s[30:31] define fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, i32* byval align 4 %arg1) #1 { %arg1.load = load i32, i32* %arg1, align 4 @@ -122,9 +128,16 @@ ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s5 offset:4 ; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s5 offset:8 -; GCN-DAG: v_add_i32_e32 v0, vcc, v1, v0 -; GCN: v_add_i32_e32 v0, vcc, [[LOAD_0]], v0 -; GCN: v_add_i32_e32 v0, vcc, [[LOAD_1]], v0 + +; CIVI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CIVI: v_add_i32_e32 v0, vcc, [[LOAD_0]], v0 +; CIVI: v_add_i32_e32 v0, vcc, [[LOAD_1]], v0 + + +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9: v_add_u32_e32 v0, v0, [[LOAD_0]] +; GFX9: v_add_u32_e32 v0, v0, [[LOAD_1]] + ; GCN-NEXT: s_setpc_b64 define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 { %val_firststack = extractvalue [32 x i32] %large, 30 Index: 
test/CodeGen/AMDGPU/sminmax.ll =================================================================== --- test/CodeGen/AMDGPU/sminmax.ll +++ test/CodeGen/AMDGPU/sminmax.ll @@ -1,6 +1,7 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s ; FUNC-LABEL: {{^}}s_abs_i32: ; GCN: s_abs_i32 @@ -17,9 +18,13 @@ } ; FUNC-LABEL: {{^}}v_abs_i32: -; GCN: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]] +; SIVI: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]] +; GFX9: v_sub_u32_e32 [[NEG:v[0-9]+]], 0, [[SRC:v[0-9]+]] + ; GCN: v_max_i32_e32 {{v[0-9]+}}, [[SRC]], [[NEG]] -; GCN: v_add_i32 + +; SIVI: v_add_i32 +; GFX9: v_add_u32 ; EG: MAX_INT define amdgpu_kernel void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { @@ -33,7 +38,8 @@ } ; GCN-LABEL: {{^}}v_abs_i32_repeat_user: -; GCN: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]] +; SIVI: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]] +; GFX9: v_sub_u32_e32 [[NEG:v[0-9]+]], 0, [[SRC:v[0-9]+]] ; GCN: v_max_i32_e32 [[MAX:v[0-9]+]], [[SRC]], [[NEG]] ; GCN: v_mul_lo_i32 v{{[0-9]+}}, [[MAX]], [[MAX]] define amdgpu_kernel void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { @@ -68,14 +74,20 @@ } ; FUNC-LABEL: {{^}}v_abs_v2i32: -; GCN-DAG: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]] -; GCN-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]] +; SIVI-DAG: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]] +; SIVI-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]] + +; GFX9-DAG: v_sub_u32_e32 [[NEG0:v[0-9]+]], 0, [[SRC0:v[0-9]+]] +; GFX9-DAG: v_sub_u32_e32 [[NEG1:v[0-9]+]], 0, [[SRC1:v[0-9]+]] ; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC0]], [[NEG0]] ; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC1]], [[NEG1]] -; GCN: v_add_i32 -; GCN: v_add_i32 +; SIVI: v_add_i32_e32 v{{[0-9]+}}, vcc +; SIVI: v_add_i32_e32 v{{[0-9]+}}, vcc + +; GFX9: v_add_u32_e32 v{{[0-9]+}}, 2, +; GFX9: v_add_u32_e32 v{{[0-9]+}}, 2, ; EG: MAX_INT ; EG: MAX_INT @@ -127,20 +139,30 @@ } ; FUNC-LABEL: {{^}}v_abs_v4i32: -; GCN-DAG: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]] -; GCN-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]] -; GCN-DAG: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]] -; GCN-DAG: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]] +; SIVI-DAG: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]] +; SIVI-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]] +; SIVI-DAG: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 
0, [[SRC2:v[0-9]+]] +; SIVI-DAG: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]] + +; GFX9-DAG: v_sub_u32_e32 [[NEG0:v[0-9]+]], 0, [[SRC0:v[0-9]+]] +; GFX9-DAG: v_sub_u32_e32 [[NEG1:v[0-9]+]], 0, [[SRC1:v[0-9]+]] +; GFX9-DAG: v_sub_u32_e32 [[NEG2:v[0-9]+]], 0, [[SRC2:v[0-9]+]] +; GFX9-DAG: v_sub_u32_e32 [[NEG3:v[0-9]+]], 0, [[SRC3:v[0-9]+]] ; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC0]], [[NEG0]] ; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC1]], [[NEG1]] ; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC2]], [[NEG2]] ; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC3]], [[NEG3]] -; GCN: v_add_i32 -; GCN: v_add_i32 -; GCN: v_add_i32 -; GCN: v_add_i32 +; SIVI: v_add_i32_e32 v{{[0-9]+}}, vcc, +; SIVI: v_add_i32_e32 v{{[0-9]+}}, vcc, +; SIVI: v_add_i32_e32 v{{[0-9]+}}, vcc, +; SIVI: v_add_i32_e32 v{{[0-9]+}}, vcc, + +; GFX9: v_add_u32_e32 v{{[0-9]+}}, 2, +; GFX9: v_add_u32_e32 v{{[0-9]+}}, 2, +; GFX9: v_add_u32_e32 v{{[0-9]+}}, 2, +; GFX9: v_add_u32_e32 v{{[0-9]+}}, 2, ; EG: MAX_INT ; EG: MAX_INT @@ -181,8 +203,8 @@ } ; FUNC-LABEL: {{^}}v_min_max_i32: -; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[VAL0:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[VAL1:v[0-9]+]] ; GCN-DAG: v_min_i32_e32 v{{[0-9]+}}, [[VAL0]], [[VAL1]] ; GCN-DAG: v_max_i32_e32 v{{[0-9]+}}, [[VAL0]], [[VAL1]] Index: test/CodeGen/AMDGPU/smrd.ll =================================================================== --- test/CodeGen/AMDGPU/smrd.ll +++ test/CodeGen/AMDGPU/smrd.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SICI -check-prefix=SIVIGFX9 %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=SICI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICI,SICIVI,SIVIGFX9 %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,SICI,SICIVI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SICIVI,VIGFX9,SIVIGFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VIGFX9,SIVIGFX9 %s ; SMRD load with an immediate offset. 
; GCN-LABEL: {{^}}smrd0: @@ -194,7 +194,11 @@ ; GCN-LABEL: {{^}}smrd_vgpr_offset_imm: ; GCN-NEXT: BB# -; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4095 ; + +; SICIVI-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4095 ; + +; GFX9-NEXT: v_add_u32_e32 [[ADD:v[0-9]+]], 0xfff, v0 +; GFX9-NEXT: buffer_load_dword v{{[0-9]}}, [[ADD]], s[0:3], 0 offen ; define amdgpu_ps float @smrd_vgpr_offset_imm(<4 x i32> inreg %desc, i32 %offset) #0 { main_body: %off = add i32 %offset, 4095 @@ -204,7 +208,8 @@ ; GCN-LABEL: {{^}}smrd_vgpr_offset_imm_too_large: ; GCN-NEXT: BB# -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x1000, v0 +; SICIVI-NEXT: v_add_i32_e32 v0, vcc, 0x1000, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x1000, v0 ; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ; define amdgpu_ps float @smrd_vgpr_offset_imm_too_large(<4 x i32> inreg %desc, i32 %offset) #0 { main_body: @@ -240,8 +245,15 @@ ; GCN-LABEL: {{^}}smrd_vgpr_merged: ; GCN-NEXT: BB# -; GCN-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -; GCN-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 +; SICIVI-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 +; SICIVI-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 + +; GFX9: buffer_load_dword +; GFX9: buffer_load_dword +; GFX9: buffer_load_dword +; GFX9: buffer_load_dword +; GFX9: buffer_load_dword +; GFX9: buffer_load_dword define amdgpu_ps void @smrd_vgpr_merged(<4 x i32> inreg %desc, i32 %a) #0 { main_body: %a1 = add i32 %a, 4 Index: test/CodeGen/AMDGPU/sub.ll =================================================================== --- test/CodeGen/AMDGPU/sub.ll +++ test/CodeGen/AMDGPU/sub.ll @@ -1,13 +1,33 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=EG,FUNC %s declare i32 @llvm.r600.read.tidig.x() readnone +; FUNC-LABEL: {{^}}s_sub_i32: +; GCN: s_load_dword [[A:s[0-9]+]] +; GCN: s_load_dword [[B:s[0-9]+]] +; GCN: s_sub_i32 s{{[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @s_sub_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %result = sub i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_sub_imm_i32: +; GCN: s_load_dword [[A:s[0-9]+]] +; GCN: s_sub_i32 s{{[0-9]+}}, 0x4d2, [[A]] +define amdgpu_kernel void @s_sub_imm_i32(i32 addrspace(1)* %out, i32 %a) { + %result = sub i32 1234, %a + store i32 %result, i32 addrspace(1)* %out + ret void +} + ; FUNC-LABEL: {{^}}test_sub_i32: ; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; SI: v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} +; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} define amdgpu_kernel void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = 
getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in @@ -17,6 +37,17 @@ ret void } +; FUNC-LABEL: {{^}}test_sub_imm_i32: +; EG: SUB_INT + +; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0x7b, v{{[0-9]+}} +; GFX9: v_sub_u32_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}} +define amdgpu_kernel void @test_sub_imm_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %a = load i32, i32 addrspace(1)* %in + %result = sub i32 123, %a + store i32 %result, i32 addrspace(1)* %out + ret void +} ; FUNC-LABEL: {{^}}test_sub_v2i32: ; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} @@ -25,6 +56,8 @@ ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} +; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} define amdgpu_kernel void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1) * %in @@ -45,6 +78,10 @@ ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} +; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} define amdgpu_kernel void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1) * %in @@ -95,8 +132,8 @@ } ; FUNC-LABEL: {{^}}s_sub_i64: -; SI: s_sub_u32 -; SI: s_subb_u32 +; GCN: s_sub_u32 +; GCN: s_subb_u32 ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY ; EG-DAG: SUB_INT {{[* ]*}} @@ -110,8 +147,8 @@ } ; FUNC-LABEL: {{^}}v_sub_i64: -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 +; GCN: v_sub_i32_e32 +; GCN: v_subb_u32_e32 ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY ; EG-DAG: SUB_INT {{[* ]*}} @@ -130,10 +167,10 @@ } ; FUNC-LABEL: {{^}}v_test_sub_v2i64: -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 +; GCN: v_sub_i32_e32 +; GCN: v_subb_u32_e32 +; GCN: v_sub_i32_e32 +; GCN: v_subb_u32_e32 define amdgpu_kernel void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) { %tid = call i32 @llvm.r600.read.tidig.x() readnone %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid @@ -146,14 +183,14 @@ } ; FUNC-LABEL: {{^}}v_test_sub_v4i64: -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 +; GCN: v_sub_i32_e32 +; GCN: v_subb_u32_e32 +; GCN: v_sub_i32_e32 +; GCN: v_subb_u32_e32 +; GCN: v_sub_i32_e32 +; GCN: v_subb_u32_e32 +; GCN: v_sub_i32_e32 +; GCN: v_subb_u32_e32 define amdgpu_kernel void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) { %tid = call i32 @llvm.r600.read.tidig.x() readnone %a_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inA, i32 %tid