Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -496,6 +496,7 @@ // XXX - Can we get away without running DeadMachineInstructionElim again? addPass(&SIFoldOperandsID); addPass(&DeadMachineInstructionElimID); + addPass(&SILoadStoreOptimizerID); } void GCNPassConfig::addIRPasses() { @@ -539,18 +540,6 @@ insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID); } - if (getOptLevel() > CodeGenOpt::None) { - // Don't do this with no optimizations since it throws away debug info by - // merging nonadjacent loads. - - // This should be run after scheduling, but before register allocation. It - // also need extra copies to the address operand to be eliminated. - - // FIXME: Move pre-RA and remove extra reg coalescer run. - insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); - insertPass(&MachineSchedulerID, &RegisterCoalescerID); - } - addPass(createSIShrinkInstructionsPass()); addPass(createSIWholeQuadModePass()); } Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp =================================================================== --- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -60,31 +60,35 @@ const SIInstrInfo *TII; const SIRegisterInfo *TRI; MachineRegisterInfo *MRI; - LiveIntervals *LIS; + AliasAnalysis *AA; static bool offsetsCanBeCombined(unsigned Offset0, unsigned Offset1, unsigned EltSize); - MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I, - unsigned EltSize); + MachineBasicBlock::iterator findMatchingDSInst( + MachineBasicBlock::iterator I, + unsigned EltSize, + std::vector &InstsToMove); MachineBasicBlock::iterator mergeRead2Pair( MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, - unsigned EltSize); + unsigned EltSize, + ArrayRef InstsToMove); MachineBasicBlock::iterator mergeWrite2Pair( MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, - unsigned EltSize); + unsigned EltSize, + ArrayRef InstsToMove); public: static char ID; SILoadStoreOptimizer() : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr), - LIS(nullptr) {} + AA(nullptr) {} SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) { initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); @@ -100,10 +104,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addPreserved(); - AU.addPreserved(); - AU.addPreserved(); - AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -113,9 +114,7 @@ INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load / Store Optimizer", false, false) -INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_DEPENDENCY(LiveVariables) -INITIALIZE_PASS_DEPENDENCY(SlotIndexes) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load / Store Optimizer", false, false) @@ -127,6 +126,40 @@ return new SILoadStoreOptimizer(TM); } +static void moveInstsAfter(MachineBasicBlock::iterator I, + ArrayRef InstsToMove) { + MachineBasicBlock *MBB = I->getParent(); + ++I; + for (MachineInstr *MI : InstsToMove) { + MI->removeFromParent(); + MBB->insert(I, MI); + } +} + +static void addDefsToList(const MachineInstr &MI, + SmallVectorImpl &Defs) { + for (const MachineOperand &Def : MI.defs()) { + Defs.push_back(&Def); + } +} + +static bool +canMoveInstsAcrossMemOp(MachineInstr &MemOp, + ArrayRef InstsToMove, + const TargetInstrInfo *TII, + AliasAnalysis *AA) { + + assert(MemOp.mayLoad() || MemOp.mayStore()); + + for (MachineInstr *InstToMove : InstsToMove) { + if (!InstToMove->mayLoad() && !InstToMove->mayStore()) + continue; + if (!TII->areMemAccessesTriviallyDisjoint(MemOp, *InstToMove, AA)) + return false; + } + return true; +} + bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0, unsigned Offset1, unsigned Size) { @@ -156,44 +189,98 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I, - unsigned EltSize){ + unsigned EltSize, + std::vector &InstsToMove){ MachineBasicBlock::iterator E = I->getParent()->end(); - MachineBasicBlock &MBB = *I->getParent(); MachineBasicBlock::iterator MBBI = I; ++MBBI; - if (MBBI == MBB.end() || MBBI->getOpcode() != I->getOpcode()) - return E; - - // Don't merge volatiles. - if (MBBI->hasOrderedMemoryRef()) - return E; - - int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr); - const MachineOperand &AddrReg0 = I->getOperand(AddrIdx); - const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx); - - // Check same base pointer. Be careful of subregisters, which can occur with - // vectors of pointers. - if (AddrReg0.getReg() == AddrReg1.getReg() && - AddrReg0.getSubReg() == AddrReg1.getSubReg()) { - int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), - AMDGPU::OpName::offset); - unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff; - unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff; - - // Check both offsets fit in the reduced range. - if (offsetsCanBeCombined(Offset0, Offset1, EltSize)) - return MBBI; - } + SmallVector DefsToMove; + addDefsToList(*I, DefsToMove); + + for ( ; MBBI != E; ++MBBI) { + + if (MBBI->getOpcode() != I->getOpcode()) { + + // This is not a matching DS instruction, but we can keep looking as + // long as one of these conditions are met: + // 1. It is safe to move I down past MBBI. + // 2. It is safe to move MBBI down past the instruction that I will + // be merged into. + + if (MBBI->hasUnmodeledSideEffects()) + // We can't re-order this instruction with respect to other memory + // opeations, so we fail both conditions mentioned above. + return E; + + if ((MBBI->mayLoad() || MBBI->mayStore()) && + !TII->areMemAccessesTriviallyDisjoint(*I, *MBBI, AA)) { + // We fail condition #1, but we may still be able to satisfy condition + // #2. Add this instruction to the move list and then we will check + // if condition #2 holds once we have selected the matching instruction. + InstsToMove.push_back(&*MBBI); + addDefsToList(*MBBI, DefsToMove); + continue; + } + + // When we match I with another DS instruction we will be moving I down + // to the location of the matched instruction any uses of I will need to + // be moved down as well. + for (const MachineOperand *Def : DefsToMove) { + bool ReadDef = MBBI->readsVirtualRegister(Def->getReg()); + // If ReadDef is true, then there is a use of Def between I + // and the instruction that I will potentially be merged with. We + // will need to move this instruction after the merged instructions. + if (ReadDef) { + InstsToMove.push_back(&*MBBI); + addDefsToList(*MBBI, DefsToMove); + break; + } + } + continue; + } + // Don't merge volatiles. + if (MBBI->hasOrderedMemoryRef()) + return E; + + int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr); + const MachineOperand &AddrReg0 = I->getOperand(AddrIdx); + const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx); + + // Check same base pointer. Be careful of subregisters, which can occur with + // vectors of pointers. + if (AddrReg0.getReg() == AddrReg1.getReg() && + AddrReg0.getSubReg() == AddrReg1.getSubReg()) { + int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), + AMDGPU::OpName::offset); + unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff; + unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff; + + // Check both offsets fit in the reduced range. + // We also need to go through the list of instructions that we plan to + // move and make sure they are all safe to move down past the merged + // instruction. + if (offsetsCanBeCombined(Offset0, Offset1, EltSize) && + canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA)) + return MBBI; + } + + // We've found a load/store that we couldn't merge for some reason. + // We could potentially keep looking, but we'd need to make sure that + // it was safe to move I and also all the instruction in InstsToMove + // down past this instruction. + // FIXME: This is too conservative. + break; + } return E; } MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, - unsigned EltSize) { + unsigned EltSize, + ArrayRef InstsToMove) { MachineBasicBlock *MBB = I->getParent(); // Be careful, since the addresses could be subregisters themselves in weird @@ -242,7 +329,7 @@ DebugLoc DL = I->getDebugLoc(); MachineInstrBuilder Read2 - = BuildMI(*MBB, I, DL, Read2Desc, DestReg) + = BuildMI(*MBB, Paired, DL, Read2Desc, DestReg) .addOperand(*AddrReg) // addr .addImm(NewOffset0) // offset0 .addImm(NewOffset1) // offset1 @@ -253,48 +340,28 @@ const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); // Copy to the old destination registers. - MachineInstr *Copy0 = BuildMI(*MBB, I, DL, CopyDesc) + BuildMI(*MBB, Paired, DL, CopyDesc) .addOperand(*Dest0) // Copy to same destination including flags and sub reg. .addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, I, DL, CopyDesc) + MachineInstr *Copy1 = BuildMI(*MBB, Paired, DL, CopyDesc) .addOperand(*Dest1) .addReg(DestReg, RegState::Kill, SubRegIdx1); - LIS->InsertMachineInstrInMaps(*Read2); - - // repairLiveintervalsInRange() doesn't handle physical register, so we have - // to update the M0 range manually. - SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired); - LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI)); - LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex); - bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot(); - - // The new write to the original destination register is now the copy. Steal - // the old SlotIndex. - LIS->ReplaceMachineInstrInMaps(*I, *Copy0); - LIS->ReplaceMachineInstrInMaps(*Paired, *Copy1); + moveInstsAfter(Copy1, InstsToMove); + MachineBasicBlock::iterator Next = std::next(I); I->eraseFromParent(); Paired->eraseFromParent(); - LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg()); - LIS->shrinkToUses(&AddrRegLI); - - LIS->createAndComputeVirtRegInterval(DestReg); - - if (UpdateM0Range) { - SlotIndex Read2Index = LIS->getInstructionIndex(*Read2); - M0Segment->end = Read2Index.getRegSlot(); - } - DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); - return Read2.getInstr(); + return Next; } MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, - unsigned EltSize) { + unsigned EltSize, + ArrayRefInstsToMove) { MachineBasicBlock *MBB = I->getParent(); // Be sure to use .addOperand(), and not .addReg() with these. We want to be @@ -336,15 +403,8 @@ const MCInstrDesc &Write2Desc = TII->get(Opc); DebugLoc DL = I->getDebugLoc(); - // repairLiveintervalsInRange() doesn't handle physical register, so we have - // to update the M0 range manually. - SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired); - LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI)); - LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex); - bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot(); - MachineInstrBuilder Write2 - = BuildMI(*MBB, I, DL, Write2Desc) + = BuildMI(*MBB, Paired, DL, Write2Desc) .addOperand(*Addr) // addr .addOperand(*Data0) // data0 .addOperand(*Data1) // data1 @@ -354,24 +414,14 @@ .addMemOperand(*I->memoperands_begin()) .addMemOperand(*Paired->memoperands_begin()); - // XXX - How do we express subregisters here? - unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() }; + moveInstsAfter(Write2, InstsToMove); - LIS->RemoveMachineInstrFromMaps(*I); - LIS->RemoveMachineInstrFromMaps(*Paired); + MachineBasicBlock::iterator Next = std::next(I); I->eraseFromParent(); Paired->eraseFromParent(); - // This doesn't handle physical registers like M0 - LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs); - - if (UpdateM0Range) { - SlotIndex Write2Index = LIS->getInstructionIndex(*Write2); - M0Segment->end = Write2Index.getRegSlot(); - } - DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); - return Write2.getInstr(); + return Next; } // Scan through looking for adjacent LDS operations with constant offsets from @@ -389,13 +439,15 @@ continue; } + std::vector InstsToMove; unsigned Opc = MI.getOpcode(); if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) { unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4; - MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size); + MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size, + InstsToMove); if (Match != E) { Modified = true; - I = mergeRead2Pair(I, Match, Size); + I = mergeRead2Pair(I, Match, Size, InstsToMove); } else { ++I; } @@ -403,10 +455,11 @@ continue; } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) { unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4; - MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size); + MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size, + InstsToMove); if (Match != E) { Modified = true; - I = mergeWrite2Pair(I, Match, Size); + I = mergeWrite2Pair(I, Match, Size, InstsToMove); } else { ++I; } @@ -432,13 +485,10 @@ TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); - - LIS = &getAnalysis(); + AA = &getAnalysis().getAAResults(); DEBUG(dbgs() << "Running SILoadStoreOptimizer\n"); - assert(!MRI->isSSA()); - bool Modified = false; for (MachineBasicBlock &MBB : MF) Index: test/CodeGen/AMDGPU/ds_read2_offset_order.ll =================================================================== --- test/CodeGen/AMDGPU/ds_read2_offset_order.ll +++ test/CodeGen/AMDGPU/ds_read2_offset_order.ll @@ -9,8 +9,8 @@ ; SI-LABEL: {{^}}offset_order: ; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3 -; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14 -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44 +; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14 +; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44 define void @offset_order(float addrspace(1)* %out) { entry: Index: test/CodeGen/AMDGPU/ds_write2.ll =================================================================== --- test/CodeGen/AMDGPU/ds_write2.ll +++ test/CodeGen/AMDGPU/ds_write2.ll @@ -179,8 +179,8 @@ } ; SI-LABEL: @simple_write2_two_val_f32_x2 -; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL0]] offset1:11 -; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL1:v[0-9]+]], [[VAL1]] offset0:8 offset1:27 +; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8 +; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 ; SI: s_endpgm define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -209,8 +209,8 @@ } ; SI-LABEL: @simple_write2_two_val_f32_x2_nonzero_base -; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL0]] offset0:3 offset1:11 -; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL1:v[0-9]+]], [[VAL1]] offset0:8 offset1:27 +; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8 +; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 ; SI: s_endpgm define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 Index: test/CodeGen/AMDGPU/fceil64.ll =================================================================== --- test/CodeGen/AMDGPU/fceil64.ll +++ test/CodeGen/AMDGPU/fceil64.ll @@ -13,8 +13,8 @@ ; CI: v_ceil_f64_e32 ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 -; SI-DAG: s_add_i32 [[SEXP1:s[0-9]+]], [[SEXP]], 0xfffffc01 -; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP1]] +; SI-DAG: s_addk_i32 [[SEXP]], 0xfc01 +; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP]] ; SI-DAG: s_not_b64 ; SI-DAG: s_and_b64 ; SI-DAG: cmp_gt_i32 Index: test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll @@ -25,7 +25,8 @@ ; SI: v_rsq_clamp_f64_e32 ; TODO: this constant should be folded: -; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], -1 +; VI-DAG: s_mov_b32 [[NEG1:s[0-9+]]], -1 +; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], [[NEG1]] ; VI-DAG: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff ; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff ; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}} Index: test/CodeGen/AMDGPU/load-local-i16.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i16.ll +++ test/CodeGen/AMDGPU/load-local-i16.ll @@ -65,8 +65,8 @@ } ; FUNC-LABEL: {{^}}local_load_v16i16: -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:3{{$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}} ; EG: LDS_READ_RET @@ -237,11 +237,9 @@ ret void } -; FIXME: Should have 2 ds_read_b64 ; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i32: -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}} -; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}} -; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:24 +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} ; GCN: ds_write2_b64 ; GCN: ds_write2_b64 @@ -255,9 +253,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i32: -; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}} -; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16{{$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} define void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(3)* %in %ext = sext <16 x i16> %load to <16 x i32> @@ -278,10 +275,18 @@ } ; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32: -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:4 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:5{{$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5 +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1 define void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(3)* %in %ext = sext <32 x i16> %load to <32 x i32> @@ -289,17 +294,31 @@ ret void } -; FIXME: Missed read2 ; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32: -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:11 offset1:15 +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:14 offset1:15 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7 -; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:64 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:9 offset1:10 +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:9 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:12 offset1:13 -; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:112 +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:10 offset1:11 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:26 offset1:27 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:24 offset1:25 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:22 offset1:23 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:20 offset1:21 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:18 offset1:19 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:16 offset1:17 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1 define void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(3)* %in %ext = zext <64 x i16> %load to <64 x i32> Index: test/CodeGen/AMDGPU/load-local-i32.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i32.ll +++ test/CodeGen/AMDGPU/load-local-i32.ll @@ -56,10 +56,14 @@ } ; FUNC-LABEL: {{^}}local_load_v16i32: -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:4{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:5 offset1:6{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:7{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7{{$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5{{$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1 define void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 { entry: %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in Index: test/CodeGen/AMDGPU/local-memory.amdgcn.ll =================================================================== --- test/CodeGen/AMDGPU/local-memory.amdgcn.ll +++ test/CodeGen/AMDGPU/local-memory.amdgcn.ll @@ -44,8 +44,7 @@ ; GCN-LABEL: {{^}}local_memory_two_objects: ; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0 -; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*}} offset:16 -; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*$}} +; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4 ; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]] Index: test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll =================================================================== --- test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -156,7 +156,8 @@ ; FUNC-LABEL: @reorder_local_offsets ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:100 offset1:102 -; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400 +; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:100 +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 ; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408 ; CI: buffer_store_dword ; CI: s_endpgm Index: test/CodeGen/AMDGPU/store-v3i64.ll =================================================================== --- test/CodeGen/AMDGPU/store-v3i64.ll +++ test/CodeGen/AMDGPU/store-v3i64.ll @@ -46,8 +46,7 @@ } ; GCN-LABEL: {{^}}local_store_v3i64: -; GCN: ds_write_b64 -; GCN: ds_write_b64 +; GCN: ds_write2_b64 ; GCN: ds_write_b64 define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) { store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32 Index: test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll =================================================================== --- test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll +++ test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll @@ -42,12 +42,10 @@ } ; GCN-LABEL: {{^}}test_use_s_v_s: -; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} - ; GCN: buffer_load_dword [[VA0:v[0-9]+]] -; GCN-NOT: v_mov_b32 ; GCN: buffer_load_dword [[VA1:v[0-9]+]] +; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} ; GCN-NOT: v_mov_b32 ; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]