Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -350,10 +350,8 @@ // We are selecting i64 ADD here instead of custom lower it during // DAG legalization, so we can fold some i64 ADDs used for address // calculation into the LOAD and STORE instructions. - case ISD::ADD: case ISD::ADDC: case ISD::ADDE: - case ISD::SUB: case ISD::SUBC: case ISD::SUBE: { if (N->getValueType(0) != MVT::i64 || Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -2127,13 +2127,57 @@ } switch (MI.getOpcode()) { - case AMDGPU::SI_INIT_M0: + case AMDGPU::S_ADD_U64_PSEUDO: + case AMDGPU::S_SUB_U64_PSEUDO: { + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const DebugLoc &DL = MI.getDebugLoc(); + + MachineOperand &Dest = MI.getOperand(0); + MachineOperand &Src0 = MI.getOperand(1); + MachineOperand &Src1 = MI.getOperand(2); + + unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + + MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI, + Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0, + &AMDGPU::SReg_32_XM0RegClass); + MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI, + Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1, + &AMDGPU::SReg_32_XM0RegClass); + + MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI, + Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0, + &AMDGPU::SReg_32_XM0RegClass); + MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI, + Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1, + &AMDGPU::SReg_32_XM0RegClass); + + bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); + + unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; + unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; + BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0) + .add(Src0Sub0) + .add(Src1Sub0); + BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1) + .add(Src0Sub1) + .add(Src1Sub1); + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + MI.eraseFromParent(); + return BB; + } + case AMDGPU::SI_INIT_M0: { BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) .add(MI.getOperand(0)); MI.eraseFromParent(); return BB; - + } case AMDGPU::SI_INIT_EXEC: // This should be before all vector instructions. 
BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), Index: lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.h +++ lib/Target/AMDGPU/SIInstrInfo.h @@ -44,6 +44,7 @@ static unsigned getBranchOpcode(BranchPredicate Cond); static BranchPredicate getBranchPredicate(unsigned Opcode); +public: unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, MachineOperand &SuperReg, @@ -56,7 +57,7 @@ const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const; - +private: void swapOperands(MachineInstr &Inst) const; void lowerScalarAbs(SetVectorType &Worklist, @@ -65,6 +66,9 @@ void splitScalar64BitUnaryOp(SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const; + void splitScalar64BitAddSub(SetVectorType &Worklist, + MachineInstr &Inst) const; + void splitScalar64BitBinaryOp(SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const; Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3423,6 +3423,11 @@ switch (Opcode) { default: break; + case AMDGPU::S_ADD_U64_PSEUDO: + case AMDGPU::S_SUB_U64_PSEUDO: + splitScalar64BitAddSub(Worklist, Inst); + Inst.eraseFromParent(); + continue; case AMDGPU::S_AND_B64: splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64); Inst.eraseFromParent(); @@ -3685,6 +3690,74 @@ addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } +void SIInstrInfo::splitScalar64BitAddSub( + SetVectorType &Worklist, MachineInstr &Inst) const { + bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); + + MachineBasicBlock &MBB = *Inst.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned DeadCarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + const DebugLoc &DL = Inst.getDebugLoc(); + MachineBasicBlock::iterator MII = Inst; + + const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg()); + const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg()); + const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); + const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); + + MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub0, Src0SubRC); + MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, + AMDGPU::sub0, Src1SubRC); + + + MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub1, Src0SubRC); + MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, + AMDGPU::sub1, Src1SubRC); + + unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64; + MachineInstr *LoHalf = + BuildMI(MBB, MII, DL, get(LoOpc), DestSub0) + .addReg(CarryReg, RegState::Define) + .add(SrcReg0Sub0) + .add(SrcReg1Sub0); + + unsigned HiOpc = IsAdd ? 
AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
+  MachineInstr *HiHalf =
+    BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
+    .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
+    .add(SrcReg0Sub1)
+    .add(SrcReg1Sub1)
+    .addReg(CarryReg, RegState::Kill);
+
+  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+    .addReg(DestSub0)
+    .addImm(AMDGPU::sub0)
+    .addReg(DestSub1)
+    .addImm(AMDGPU::sub1);
+
+  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+  // Try to legalize the operands in case we need to swap the order to keep it
+  // valid.
+  legalizeOperands(*LoHalf);
+  legalizeOperands(*HiHalf);
+
+  // Move all users of this moved value.
+  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
+}
+
 void SIInstrInfo::splitScalar64BitBinaryOp(
   SetVectorType &Worklist, MachineInstr &Inst,
   unsigned Opcode) const {
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -111,12 +111,31 @@
   let usesCustomInserter = 1;
 }
 
-// 64-bit vector move instruction. This is mainly used by the SIFoldOperands
-// pass to enable folding of inline immediates.
+// 64-bit vector move instruction. This is mainly used by the
+// SIFoldOperands pass to enable folding of inline immediates.
 def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
                                       (ins VSrc_b64:$src0)>;
+
 } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
+
+let usesCustomInserter = 1, Defs = [SCC] in {
+def S_ADD_U64_PSEUDO : SPseudoInstSI <
+  (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
+  [(set SReg_64:$vdst, (add i64:$src0, i64:$src1))]
+>;
+
+def S_SUB_U64_PSEUDO : SPseudoInstSI <
+  (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
+  [(set SReg_64:$vdst, (sub i64:$src0, i64:$src1))]
+>;
+
+def S_ADDC_U64_PSEUDO : SPseudoInstSI <(outs SReg_64:$vdst, SReg_64:$sdst),
+                                       (ins SSrc_b64:$src0, SSrc_b64:$src1)>;
+def S_SUBC_U64_PSEUDO : SPseudoInstSI <(outs SReg_64:$vdst, SReg_64:$sdst),
+                                       (ins SSrc_b64:$src0, SSrc_b64:$src1)>;
+} // End usesCustomInserter = 1, Defs = [SCC]
+
 let usesCustomInserter = 1, SALU = 1 in {
 def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
   [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
Index: test/CodeGen/AMDGPU/add.v2i16.ll
===================================================================
--- test/CodeGen/AMDGPU/add.v2i16.ll
+++ test/CodeGen/AMDGPU/add.v2i16.ll
@@ -163,10 +163,10 @@
 ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
 ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
 
-; VI: flat_load_ushort v[[A_HI:[0-9]+]]
 ; VI: flat_load_ushort v[[A_LO:[0-9]+]]
-; VI: flat_load_ushort v[[B_HI:[0-9]+]]
+; VI: flat_load_ushort v[[A_HI:[0-9]+]]
 ; VI: flat_load_ushort v[[B_LO:[0-9]+]]
+; VI: flat_load_ushort v[[B_HI:[0-9]+]]
 ; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
 ; VI-NOT: and
Index: test/CodeGen/AMDGPU/clamp.ll
===================================================================
--- test/CodeGen/AMDGPU/clamp.ll
+++ test/CodeGen/AMDGPU/clamp.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck
-enable-var-scope -check-prefixes=GCN,VI %s ; GCN-LABEL: {{^}}v_clamp_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] @@ -397,7 +397,8 @@ ; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0 +; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0 define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid Index: test/CodeGen/AMDGPU/ctpop.ll =================================================================== --- test/CodeGen/AMDGPU/ctpop.ll +++ test/CodeGen/AMDGPU/ctpop.ll @@ -56,8 +56,8 @@ %tid = call i32 @llvm.r600.read.tidig.x() %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %tid %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %tid - %val0 = load i32, i32 addrspace(1)* %in0.gep, align 4 - %val1 = load i32, i32 addrspace(1)* %in1.gep, align 4 + %val0 = load volatile i32, i32 addrspace(1)* %in0.gep, align 4 + %val1 = load volatile i32, i32 addrspace(1)* %in1.gep, align 4 %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone %ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone %add = add i32 %ctpop0, %ctpop1 Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -260,13 +260,15 @@ ; GCN-LABEL: {{^}}v_insertelement_v2i16_1: ; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e70000 ; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]] -; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[VEC]] -; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7 ; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]] ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]] +; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[VEC]] +; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[AND]] +; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD + ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -344,13 +346,16 @@ ; GCN-LABEL: {{^}}v_insertelement_v2f16_1: ; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x45000000 ; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]] -; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[VEC]] -; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4500 ; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]] ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]] +; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[VEC]] +; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[AND]] + +; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD + ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -421,10 +426,11 @@ } ; GCN-LABEL: 
{{^}}v_insertelement_v2i16_dynamic_vgpr: +; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7 + ; GCN: flat_load_dword [[IDX:v[0-9]+]] ; GCN: flat_load_dword [[VEC:v[0-9]+]] -; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]] ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] @@ -448,11 +454,12 @@ } ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr: -; GCN: flat_load_dword [[IDX:v[0-9]+]] -; GCN: flat_load_dword [[VEC:v[0-9]+]] ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234 +; GCN: flat_load_dword [[IDX:v[0-9]+]] +; GCN: flat_load_dword [[VEC:v[0-9]+]] + ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]] ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] Index: test/CodeGen/AMDGPU/load-global-i32.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-i32.ll +++ test/CodeGen/AMDGPU/load-global-i32.ll @@ -360,7 +360,6 @@ ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA-DAG: buffer_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 @@ -371,38 +370,38 @@ ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 -; GCN-DAG: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 + +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 + +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 + +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: buffer_load_dwordx4 ; GCN-NOHSA: buffer_store_dwordx4 ; GCN-NOHSA: buffer_store_dwordx4 @@ -421,29 +420,73 @@ ; GCN-NOHSA: buffer_store_dwordx4 ; GCN-NOHSA: buffer_store_dwordx4 + +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 +; GCN-NOHSA: v_ashrrev_i32 ; GCN-NOHSA: buffer_store_dwordx4 ; GCN-NOHSA: buffer_store_dwordx4 + ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 -; GCN-HSA: flat_store_dwordx4 + +; GCN-HSA: 
v_ashrrev_i32 +; GCN-HSA: v_ashrrev_i32 ; GCN-HSA: flat_store_dwordx4 +; GCN-HSA: v_ashrrev_i32 +; GCN-HSA: v_ashrrev_i32 ; GCN-HSA: flat_store_dwordx4 + +; GCN-HSA: v_ashrrev_i32 +; GCN-HSA: v_ashrrev_i32 ; GCN-HSA: flat_store_dwordx4 + +; GCN-HSA: v_ashrrev_i32 +; GCN-HSA: v_ashrrev_i32 ; GCN-HSA: flat_store_dwordx4 + +; GCN-HSA: v_ashrrev_i32 +; GCN-HSA: v_ashrrev_i32 ; GCN-HSA: flat_store_dwordx4 +; GCN-HSA: v_ashrrev_i32 +; GCN-HSA: v_ashrrev_i32 ; GCN-HSA: flat_store_dwordx4 + +; GCN-HSA: v_ashrrev_i32 +; GCN-HSA: v_ashrrev_i32 ; GCN-HSA: flat_store_dwordx4 + +; GCN-HSA: v_ashrrev_i32 +; GCN-HSA: v_ashrrev_i32 ; GCN-HSA: flat_store_dwordx4 + +; GCN-HSA: v_ashrrev_i32 +; GCN-HSA: v_ashrrev_i32 ; GCN-HSA: flat_store_dwordx4 +; GCN-HSA: v_ashrrev_i32 +; GCN-HSA: v_ashrrev_i32 ; GCN-HSA: flat_store_dwordx4 + +; GCN-HSA: v_ashrrev_i32 +; GCN-HSA: v_ashrrev_i32 ; GCN-HSA: flat_store_dwordx4 + +; GCN-HSA: v_ashrrev_i32 +; GCN-HSA: v_ashrrev_i32 ; GCN-HSA: flat_store_dwordx4 + +; GCN-HSA: v_ashrrev_i32 +; GCN-HSA: v_ashrrev_i32 ; GCN-HSA: flat_store_dwordx4 +; GCN-HSA: v_ashrrev_i32 +; GCN-HSA: v_ashrrev_i32 +; GCN-HSA: flat_store_dwordx4 define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in %ext = sext <32 x i32> %ld to <32 x i64> Index: test/CodeGen/AMDGPU/lshr.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/lshr.v2i16.ll +++ test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -8,10 +8,10 @@ ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]] ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]] -; VI: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; CIVI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16 +; CIVI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16 ; CIVI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = lshr <2 x i16> %lhs, %rhs Index: test/CodeGen/AMDGPU/mul.ll =================================================================== --- test/CodeGen/AMDGPU/mul.ll +++ test/CodeGen/AMDGPU/mul.ll @@ -207,20 +207,21 @@ ; SI: s_load_dwordx2 ; SI: v_mul_hi_u32 -; SI: v_mul_hi_u32 ; SI: s_mul_i32 ; SI: v_mul_hi_u32 ; SI: s_mul_i32 -; SI-DAG: s_mul_i32 -; SI-DAG: v_mul_hi_u32 -; SI-DAG: v_mul_hi_u32 -; SI-DAG: s_mul_i32 -; SI-DAG: s_mul_i32 -; SI-DAG: v_mul_hi_u32 ; SI: s_mul_i32 ; SI: s_mul_i32 ; SI: s_mul_i32 ; SI: s_mul_i32 + +; SI: v_mul_hi_u32 +; SI: v_mul_hi_u32 +; SI: v_mul_hi_u32 +; SI: s_mul_i32 +; SI: s_mul_i32 +; SI: v_mul_hi_u32 +; SI: s_mul_i32 ; SI: s_mul_i32 ; SI: buffer_store_dwordx4 @@ -242,7 +243,7 @@ ; SI-DAG: v_mul_hi_u32 ; SI-DAG: v_mul_lo_i32 ; SI-DAG: v_mul_lo_i32 -; SI: v_add_i32_e32 +; SI-DAG: v_add_i32_e32 ; SI-DAG: v_mul_hi_u32 ; SI-DAG: v_mul_lo_i32 ; SI-DAG: v_mul_hi_u32 Index: test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll =================================================================== --- test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll +++ test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll @@ -7,7 +7,7 @@ ; SI: NumVgprs: {{[1-9]$}} ; 
stores may alias loads -; VI: NumSgprs: {{[1-5][0-9]$}} +; VI: NumSgprs: {{[0-9]$}} ; VI: NumVgprs: {{[1-3][0-9]$}} define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %in_arg, float addrspace(1)* nocapture %out_arg) { Index: test/CodeGen/AMDGPU/split-scalar-i64-add.ll =================================================================== --- test/CodeGen/AMDGPU/split-scalar-i64-add.ll +++ test/CodeGen/AMDGPU/split-scalar-i64-add.ll @@ -7,8 +7,11 @@ ; set in vcc, which is undefined since the low scalar half add sets ; scc instead. +; FIXME: SIShrinkInstructions should force immediate fold. + ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_0: -; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x18f, v{{[0-9]+}} +; SI: s_movk_i32 [[K:s[0-9]+]], 0x18f +; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, [[K]], v{{[0-9]+}} ; SI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc define amdgpu_kernel void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %s.val) { %v.val = load volatile i32, i32 addrspace(1)* %in Index: test/CodeGen/AMDGPU/sub.i16.ll =================================================================== --- test/CodeGen/AMDGPU/sub.i16.ll +++ test/CodeGen/AMDGPU/sub.i16.ll @@ -85,9 +85,9 @@ ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64: -; VI: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] +; VI: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI-DAG: v_sub_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]] ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { Index: test/CodeGen/AMDGPU/sub.ll =================================================================== --- test/CodeGen/AMDGPU/sub.ll +++ test/CodeGen/AMDGPU/sub.ll @@ -57,7 +57,7 @@ ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 - %a = load i16, i16 addrspace(1)* %in + %a = load i16, i16 addrspace(1)* %in %b = load i16, i16 addrspace(1)* %b_ptr %result = sub i16 %a, %b store i16 %result, i16 addrspace(1)* %out @@ -71,7 +71,7 @@ define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1 - %a = load <2 x i16>, <2 x i16> addrspace(1) * %in + %a = load <2 x i16>, <2 x i16> addrspace(1) * %in %b = load <2 x i16>, <2 x i16> addrspace(1) * %b_ptr %result = sub <2 x i16> %a, %b store <2 x i16> %result, <2 x i16> addrspace(1)* %out @@ -87,7 +87,7 @@ define amdgpu_kernel void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1 - %a = load <4 x i16>, <4 x i16> addrspace(1) * %in + %a = load <4 x i16>, <4 x i16> addrspace(1) * %in %b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr %result = sub <4 x i16> %a, %b store <4 x i16> %result, <4 x i16> addrspace(1)* %out @@ -146,13 +146,13 @@ } ; FUNC-LABEL: {{^}}v_test_sub_v4i64: -; SI: v_subrev_i32_e32 +; SI: v_sub_i32_e32 ; SI: v_subb_u32_e32 -; SI: v_subrev_i32_e32 +; SI: v_sub_i32_e32 ; SI: v_subb_u32_e32 -; SI: v_subrev_i32_e32 +; SI: v_sub_i32_e32 ; SI: v_subb_u32_e32 -; SI: v_subrev_i32_e32 +; SI: v_sub_i32_e32 ; SI: 
v_subb_u32_e32 define amdgpu_kernel void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) { %tid = call i32 @llvm.r600.read.tidig.x() readnone Index: test/CodeGen/AMDGPU/sub.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/sub.v2i16.ll +++ test/CodeGen/AMDGPU/sub.v2i16.ll @@ -160,10 +160,11 @@ ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}} -; VI: flat_load_ushort v[[A_HI:[0-9]+]] ; VI: flat_load_ushort v[[A_LO:[0-9]+]] -; VI: flat_load_ushort v[[B_HI:[0-9]+]] +; VI: flat_load_ushort v[[A_HI:[0-9]+]] + ; VI: flat_load_ushort v[[B_LO:[0-9]+]] +; VI: flat_load_ushort v[[B_HI:[0-9]+]] ; VI: v_sub_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]] ; VI-NOT: and
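
For reference, a minimal sketch of the kind of input the new pseudos are aimed at. This is an illustrative standalone test, not one of the files touched by this patch; the function name s_add_u64_sketch is made up, and the CHECK lines state the expected expansion rather than verified output. A uniform i64 add of two SGPR kernel arguments should select S_ADD_U64_PSEUDO, which the custom inserter in SIISelLowering.cpp expands to an s_add_u32 / s_addc_u32 pair:

; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
; Illustrative only: expects the SALU expansion of S_ADD_U64_PSEUDO.
; CHECK-LABEL: {{^}}s_add_u64_sketch:
; CHECK: s_add_u32
; CHECK: s_addc_u32
define amdgpu_kernel void @s_add_u64_sketch(i64 addrspace(1)* %out, i64 %a, i64 %b) {
  %add = add i64 %a, %b
  store i64 %add, i64 addrspace(1)* %out
  ret void
}

The Defs = [SCC] on the new pseudos reflects the SCC carry chain of that s_add_u32 / s_addc_u32 expansion; when the result is moved to the VALU instead, splitScalar64BitAddSub threads the carry through a 64-bit SGPR carry register feeding v_add_i32_e64 / v_addc_u32_e64.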