# Changeset View

Changeset View

# Standalone View

Standalone View

# llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

Show First 20 Lines • Show All 91 Lines • ▼ Show 20 Line(s) | |||||

92 | namespace { | 92 | namespace { | ||

/// Coarse class of a memory instruction handled by this pass. A candidate
/// pair must share the same class (setPaired asserts this).
enum InstClassEnum {
  UNKNOWN, // Returned by getInstClass for instructions the pass does not merge.
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG, // Image loads; paired through their dmask operands, not offsets.
};

101 | 102 | | |||

/// Bit flags identifying which address operands an instruction carries.
/// Values are distinct single bits so they can be OR-ed into one mask and
/// tested individually (e.g. `Regs & SOFFSET`).
enum RegisterEnum {
  SBASE = 0x1,
  SRSRC = 0x2,
  SOFFSET = 0x4,
  VADDR = 0x8,
  ADDR = 0x10,
  SSAMP = 0x20, // Set for MIMG opcodes whose base opcode uses a sampler.
};

109 | 111 | | |||

110 | class SILoadStoreOptimizer : public MachineFunctionPass { | 112 | class SILoadStoreOptimizer : public MachineFunctionPass { | ||

111 | struct CombineInfo { | 113 | struct CombineInfo { | ||

112 | MachineBasicBlock::iterator I; | 114 | MachineBasicBlock::iterator I; | ||

113 | MachineBasicBlock::iterator Paired; | 115 | MachineBasicBlock::iterator Paired; | ||

114 | unsigned EltSize; | 116 | unsigned EltSize; | ||

115 | unsigned Offset0; | 117 | unsigned Offset0; | ||

116 | unsigned Offset1; | 118 | unsigned Offset1; | ||

117 | unsigned Width0; | 119 | unsigned Width0; | ||

118 | unsigned Width1; | 120 | unsigned Width1; | ||

119 | unsigned BaseOff; | 121 | unsigned BaseOff; | ||

122 | unsigned DMask0; | ||||

123 | unsigned DMask1; | ||||

120 | InstClassEnum InstClass; | 124 | InstClassEnum InstClass; | ||

121 | bool GLC0; | 125 | bool GLC0; | ||

122 | bool GLC1; | 126 | bool GLC1; | ||

123 | bool SLC0; | 127 | bool SLC0; | ||

124 | bool SLC1; | 128 | bool SLC1; | ||

125 | bool DLC0; | 129 | bool DLC0; | ||

126 | bool DLC1; | 130 | bool DLC1; | ||

127 | bool UseST64; | 131 | bool UseST64; | ||

▲ Show 20 Lines • Show All 72 Lines • ▼ Show 20 Line(s) | |||||

200 | private: | 204 | private: | ||

201 | const GCNSubtarget *STM = nullptr; | 205 | const GCNSubtarget *STM = nullptr; | ||

202 | const SIInstrInfo *TII = nullptr; | 206 | const SIInstrInfo *TII = nullptr; | ||

203 | const SIRegisterInfo *TRI = nullptr; | 207 | const SIRegisterInfo *TRI = nullptr; | ||

204 | MachineRegisterInfo *MRI = nullptr; | 208 | MachineRegisterInfo *MRI = nullptr; | ||

205 | AliasAnalysis *AA = nullptr; | 209 | AliasAnalysis *AA = nullptr; | ||

206 | bool OptimizeAgain; | 210 | bool OptimizeAgain; | ||

207 | 211 | | |||

212 | static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII); | ||||

208 | static bool offsetsCanBeCombined(CombineInfo &CI); | 213 | static bool offsetsCanBeCombined(CombineInfo &CI); | ||

209 | static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI); | 214 | static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI); | ||

210 | static unsigned getNewOpcode(const CombineInfo &CI); | 215 | static unsigned getNewOpcode(const CombineInfo &CI); | ||

211 | static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI); | 216 | static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI); | ||

212 | const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI); | 217 | const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI); | ||

213 | 218 | | |||

214 | bool findMatchingInst(CombineInfo &CI); | 219 | bool findMatchingInst(CombineInfo &CI); | ||

215 | 220 | | |||

216 | unsigned read2Opcode(unsigned EltSize) const; | 221 | unsigned read2Opcode(unsigned EltSize) const; | ||

217 | unsigned read2ST64Opcode(unsigned EltSize) const; | 222 | unsigned read2ST64Opcode(unsigned EltSize) const; | ||

218 | MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI); | 223 | MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI); | ||

219 | 224 | | |||

220 | unsigned write2Opcode(unsigned EltSize) const; | 225 | unsigned write2Opcode(unsigned EltSize) const; | ||

221 | unsigned write2ST64Opcode(unsigned EltSize) const; | 226 | unsigned write2ST64Opcode(unsigned EltSize) const; | ||

222 | MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI); | 227 | MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI); | ||

228 | MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI); | ||||

223 | MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI); | 229 | MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI); | ||

224 | MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI); | 230 | MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI); | ||

225 | MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI); | 231 | MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI); | ||

226 | 232 | | |||

227 | void updateBaseAndOffset(MachineInstr &I, unsigned NewBase, | 233 | void updateBaseAndOffset(MachineInstr &I, unsigned NewBase, | ||

228 | int32_t NewOffset) const; | 234 | int32_t NewOffset) const; | ||

229 | unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const; | 235 | unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const; | ||

230 | MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; | 236 | MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; | ||

Show All 37 Lines | |||||

268 | 274 | | |||

269 | static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { | 275 | static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { | ||

270 | const unsigned Opc = MI.getOpcode(); | 276 | const unsigned Opc = MI.getOpcode(); | ||

271 | 277 | | |||

272 | if (TII.isMUBUF(Opc)) { | 278 | if (TII.isMUBUF(Opc)) { | ||

273 | // FIXME: Handle d16 correctly | 279 | // FIXME: Handle d16 correctly | ||

274 | return AMDGPU::getMUBUFElements(Opc); | 280 | return AMDGPU::getMUBUFElements(Opc); | ||

275 | } | 281 | } | ||

282 | if (TII.isMIMG(MI)) { | ||||

283 | uint64_t DMaskImm = | ||||

284 | TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); | ||||

285 | return countPopulation(DMaskImm); | ||||

286 | } | ||||

276 | 287 | | |||

277 | switch (Opc) { | 288 | switch (Opc) { | ||

278 | case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: | 289 | case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: | ||

279 | return 1; | 290 | return 1; | ||

280 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: | 291 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: | ||

281 | return 2; | 292 | return 2; | ||

282 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: | 293 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: | ||

283 | return 4; | 294 | return 4; | ||

Show All 17 Lines | 311 | case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: | |||

301 | return BUFFER_LOAD; | 312 | return BUFFER_LOAD; | ||

302 | case AMDGPU::BUFFER_STORE_DWORD_OFFEN: | 313 | case AMDGPU::BUFFER_STORE_DWORD_OFFEN: | ||

303 | case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: | 314 | case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: | ||

304 | case AMDGPU::BUFFER_STORE_DWORD_OFFSET: | 315 | case AMDGPU::BUFFER_STORE_DWORD_OFFSET: | ||

305 | case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: | 316 | case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: | ||

306 | return BUFFER_STORE; | 317 | return BUFFER_STORE; | ||

307 | } | 318 | } | ||

308 | } | 319 | } | ||

320 | if (TII.isMIMG(Opc)) { | ||||

321 | // Ignore instructions encoded without vaddr. | ||||

322 | if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1) | ||||

323 | return UNKNOWN; | ||||

324 | // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. | ||||

325 | if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || TII.isGather4(Opc)) | ||||

326 | return UNKNOWN; | ||||

327 | return MIMG; | ||||

328 | } | ||||

309 | return UNKNOWN; | 329 | return UNKNOWN; | ||

310 | case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: | 330 | case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: | ||

311 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: | 331 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: | ||

312 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: | 332 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: | ||

313 | return S_BUFFER_LOAD_IMM; | 333 | return S_BUFFER_LOAD_IMM; | ||

314 | case AMDGPU::DS_READ_B32: | 334 | case AMDGPU::DS_READ_B32: | ||

315 | case AMDGPU::DS_READ_B32_gfx9: | 335 | case AMDGPU::DS_READ_B32_gfx9: | ||

316 | case AMDGPU::DS_READ_B64: | 336 | case AMDGPU::DS_READ_B64: | ||

Show All 9 Lines | |||||

326 | 346 | | |||

327 | /// Determines instruction subclass from opcode. Only instructions | 347 | /// Determines instruction subclass from opcode. Only instructions | ||

328 | /// of the same subclass can be merged together. | 348 | /// of the same subclass can be merged together. | ||

329 | static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { | 349 | static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { | ||

330 | switch (Opc) { | 350 | switch (Opc) { | ||

331 | default: | 351 | default: | ||

332 | if (TII.isMUBUF(Opc)) | 352 | if (TII.isMUBUF(Opc)) | ||

333 | return AMDGPU::getMUBUFBaseOpcode(Opc); | 353 | return AMDGPU::getMUBUFBaseOpcode(Opc); | ||

354 | if (TII.isMIMG(Opc)) { | ||||

355 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); | ||||

356 | assert(Info); | ||||

357 | return Info->BaseOpcode; | ||||

358 | } | ||||

334 | return -1; | 359 | return -1; | ||

335 | case AMDGPU::DS_READ_B32: | 360 | case AMDGPU::DS_READ_B32: | ||

336 | case AMDGPU::DS_READ_B32_gfx9: | 361 | case AMDGPU::DS_READ_B32_gfx9: | ||

337 | case AMDGPU::DS_READ_B64: | 362 | case AMDGPU::DS_READ_B64: | ||

338 | case AMDGPU::DS_READ_B64_gfx9: | 363 | case AMDGPU::DS_READ_B64_gfx9: | ||

339 | case AMDGPU::DS_WRITE_B32: | 364 | case AMDGPU::DS_WRITE_B32: | ||

340 | case AMDGPU::DS_WRITE_B32_gfx9: | 365 | case AMDGPU::DS_WRITE_B32_gfx9: | ||

341 | case AMDGPU::DS_WRITE_B64: | 366 | case AMDGPU::DS_WRITE_B64: | ||

Show All 20 Lines | 377 | if (TII.isMUBUF(Opc)) { | |||

362 | 387 | | |||

363 | if (AMDGPU::getMUBUFHasSoffset(Opc)) { | 388 | if (AMDGPU::getMUBUFHasSoffset(Opc)) { | ||

364 | result |= SOFFSET; | 389 | result |= SOFFSET; | ||

365 | } | 390 | } | ||

366 | 391 | | |||

367 | return result; | 392 | return result; | ||

368 | } | 393 | } | ||

369 | 394 | | |||

395 | if (TII.isMIMG(Opc)) { | ||||

396 | unsigned result = VADDR | SRSRC; | ||||

397 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); | ||||

398 | if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) | ||||

399 | result |= SSAMP; | ||||

400 | return result; | ||||

401 | } | ||||

402 | | ||||

370 | switch (Opc) { | 403 | switch (Opc) { | ||

371 | default: | 404 | default: | ||

372 | return 0; | 405 | return 0; | ||

373 | case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: | 406 | case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: | ||

374 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: | 407 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: | ||

375 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: | 408 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: | ||

376 | return SBASE; | 409 | return SBASE; | ||

377 | case AMDGPU::DS_READ_B32: | 410 | case AMDGPU::DS_READ_B32: | ||

Show All 33 Lines | 423 | void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, | |||

411 | case S_BUFFER_LOAD_IMM: | 444 | case S_BUFFER_LOAD_IMM: | ||

412 | EltSize = AMDGPU::getSMRDEncodedOffset(STM, 4); | 445 | EltSize = AMDGPU::getSMRDEncodedOffset(STM, 4); | ||

413 | break; | 446 | break; | ||

414 | default: | 447 | default: | ||

415 | EltSize = 4; | 448 | EltSize = 4; | ||

416 | break; | 449 | break; | ||

417 | } | 450 | } | ||

418 | 451 | | |||

419 | int OffsetIdx = | 452 | if (InstClass == MIMG) { | ||

420 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); | 453 | DMask0 = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); | ||

454 | } else { | ||||

455 | int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); | ||||

421 | Offset0 = I->getOperand(OffsetIdx).getImm(); | 456 | Offset0 = I->getOperand(OffsetIdx).getImm(); | ||

457 | } | ||||

458 | | ||||

422 | Width0 = getOpcodeWidth(*I, TII); | 459 | Width0 = getOpcodeWidth(*I, TII); | ||

423 | 460 | | |||

424 | if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { | 461 | if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { | ||

425 | Offset0 &= 0xffff; | 462 | Offset0 &= 0xffff; | ||

426 | } else { | 463 | } else if (InstClass != MIMG) { | ||

427 | GLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm(); | 464 | GLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm(); | ||

428 | if (InstClass != S_BUFFER_LOAD_IMM) { | 465 | if (InstClass != S_BUFFER_LOAD_IMM) { | ||

429 | SLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm(); | 466 | SLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm(); | ||

430 | } | 467 | } | ||

431 | DLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm(); | 468 | DLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm(); | ||

432 | } | 469 | } | ||

433 | 470 | | |||

434 | unsigned AddrOpName[5] = {0}; | 471 | unsigned AddrOpName[5] = {0}; | ||

Show All 15 Lines | |||||

450 | if (Regs & SOFFSET) { | 487 | if (Regs & SOFFSET) { | ||

451 | AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; | 488 | AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; | ||

452 | } | 489 | } | ||

453 | 490 | | |||

454 | if (Regs & VADDR) { | 491 | if (Regs & VADDR) { | ||

455 | AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; | 492 | AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; | ||

456 | } | 493 | } | ||

457 | 494 | | |||

495 | if (Regs & SSAMP) { | ||||

496 | AddrOpName[NumAddresses++] = AMDGPU::OpName::ssamp; | ||||

497 | } | ||||

498 | | ||||

458 | for (unsigned i = 0; i < NumAddresses; i++) { | 499 | for (unsigned i = 0; i < NumAddresses; i++) { | ||

459 | AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]); | 500 | AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]); | ||

460 | AddrReg[i] = &I->getOperand(AddrIdx[i]); | 501 | AddrReg[i] = &I->getOperand(AddrIdx[i]); | ||

461 | } | 502 | } | ||

462 | 503 | | |||

463 | InstsToMove.clear(); | 504 | InstsToMove.clear(); | ||

464 | } | 505 | } | ||

465 | 506 | | |||

466 | void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI, | 507 | void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI, | ||

467 | const SIInstrInfo &TII) { | 508 | const SIInstrInfo &TII) { | ||

468 | Paired = MI; | 509 | Paired = MI; | ||

469 | assert(InstClass == getInstClass(Paired->getOpcode(), TII)); | 510 | assert(InstClass == getInstClass(Paired->getOpcode(), TII)); | ||

511 | | ||||

512 | if (InstClass == MIMG) { | ||||

513 | DMask1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dmask)->getImm(); | ||||

514 | } else { | ||||

470 | int OffsetIdx = | 515 | int OffsetIdx = | ||

471 | AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::offset); | 516 | AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::offset); | ||

472 | Offset1 = Paired->getOperand(OffsetIdx).getImm(); | 517 | Offset1 = Paired->getOperand(OffsetIdx).getImm(); | ||

518 | } | ||||

519 | | ||||

473 | Width1 = getOpcodeWidth(*Paired, TII); | 520 | Width1 = getOpcodeWidth(*Paired, TII); | ||

474 | if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { | 521 | if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { | ||

475 | Offset1 &= 0xffff; | 522 | Offset1 &= 0xffff; | ||

476 | } else { | 523 | } else if (InstClass != MIMG) { | ||

477 | GLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::glc)->getImm(); | 524 | GLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::glc)->getImm(); | ||

478 | if (InstClass != S_BUFFER_LOAD_IMM) { | 525 | if (InstClass != S_BUFFER_LOAD_IMM) { | ||

479 | SLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::slc)->getImm(); | 526 | SLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::slc)->getImm(); | ||

480 | } | 527 | } | ||

481 | DLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dlc)->getImm(); | 528 | DLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dlc)->getImm(); | ||

482 | } | 529 | } | ||

483 | } | 530 | } | ||

484 | 531 | | |||

▲ Show 20 Lines • Show All 98 Lines • ▼ Show 20 Line(s) | 625 | static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF, | |||

583 | // This function adds the offset parameter to the existing offset for A, | 630 | // This function adds the offset parameter to the existing offset for A, | ||

584 | // so we pass 0 here as the offset and then manually set it to the correct | 631 | // so we pass 0 here as the offset and then manually set it to the correct | ||

585 | // value after the call. | 632 | // value after the call. | ||

586 | MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size); | 633 | MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size); | ||

587 | MMO->setOffset(MinOffset); | 634 | MMO->setOffset(MinOffset); | ||

588 | return MMO; | 635 | return MMO; | ||

589 | } | 636 | } | ||

590 | 637 | | |||

638 | bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII) { | ||||

639 | assert(CI.InstClass == MIMG); | ||||

640 | | ||||

641 | // Ignore instructions with tfe/lwe set. | ||||

642 | const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe); | ||||

643 | const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe); | ||||

644 | | ||||

645 | if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) | ||||

646 | return false; | ||||

647 | | ||||

648 | // Check other optional immediate operands for equality. | ||||

649 | unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc, | ||||

650 | AMDGPU::OpName::d16, AMDGPU::OpName::unorm, | ||||

651 | AMDGPU::OpName::da, AMDGPU::OpName::r128}; | ||||

652 | | ||||

653 | for (auto op : OperandsToMatch) { | ||||

654 | int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); | ||||

655 | if (AMDGPU::getNamedOperandIdx(CI.Paired->getOpcode(), op) != Idx) | ||||

656 | return false; | ||||

657 | if (Idx != -1 && | ||||

658 | CI.I->getOperand(Idx).getImm() != CI.Paired->getOperand(Idx).getImm()) | ||||

659 | return false; | ||||

660 | } | ||||

661 | | ||||

662 | // Check DMask for overlaps. | ||||

663 | unsigned MaxMask = std::max(CI.DMask0, CI.DMask1); | ||||

664 | unsigned MinMask = std::min(CI.DMask0, CI.DMask1); | ||||

665 | | ||||

666 | unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask); | ||||

667 | if ((1u << AllowedBitsForMin) <= MinMask) | ||||

668 | return false; | ||||

669 | | ||||

670 | return true; | ||||

671 | } | ||||

672 | | ||||

591 | bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { | 673 | bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { | ||

674 | assert(CI.InstClass != MIMG); | ||||

675 | | ||||

592 | // XXX - Would the same offset be OK? Is there any reason this would happen or | 676 | // XXX - Would the same offset be OK? Is there any reason this would happen or | ||

593 | // be useful? | 677 | // be useful? | ||

594 | if (CI.Offset0 == CI.Offset1) | 678 | if (CI.Offset0 == CI.Offset1) | ||

595 | return false; | 679 | return false; | ||

596 | 680 | | |||

597 | // This won't be valid if the offset isn't aligned. | 681 | // This won't be valid if the offset isn't aligned. | ||

598 | if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0)) | 682 | if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0)) | ||

599 | return false; | 683 | return false; | ||

▲ Show 20 Lines • Show All 139 Lines • ▼ Show 20 Line(s) | 822 | if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, | |||

739 | CI.InstsToMove)) | 823 | CI.InstsToMove)) | ||

740 | continue; | 824 | continue; | ||

741 | 825 | | |||

742 | bool Match = CI.hasSameBaseAddress(*MBBI); | 826 | bool Match = CI.hasSameBaseAddress(*MBBI); | ||

743 | 827 | | |||

744 | if (Match) { | 828 | if (Match) { | ||

745 | CI.setPaired(MBBI, *TII); | 829 | CI.setPaired(MBBI, *TII); | ||

746 | 830 | | |||

747 | // Check both offsets fit in the reduced range. | 831 | // Check both offsets (or masks for MIMG) can be combined and fit in the | ||

832 | // reduced range. | ||||

833 | bool canBeCombined = | ||||

834 | CI.InstClass == MIMG | ||||

835 | ? dmasksCanBeCombined(CI, *TII) | ||||

836 | : widthsFit(*STM, CI) && offsetsCanBeCombined(CI); | ||||

837 | | ||||

748 | // We also need to go through the list of instructions that we plan to | 838 | // We also need to go through the list of instructions that we plan to | ||

749 | // move and make sure they are all safe to move down past the merged | 839 | // move and make sure they are all safe to move down past the merged | ||

750 | // instruction. | 840 | // instruction. | ||

751 | if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI)) | 841 | if (canBeCombined && canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA)) | ||

752 | if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA)) | | |||

753 | return true; | 842 | return true; | ||

754 | } | 843 | } | ||

755 | 844 | | |||

756 | // We've found a load/store that we couldn't merge for some reason. | 845 | // We've found a load/store that we couldn't merge for some reason. | ||

757 | // We could potentially keep looking, but we'd need to make sure that | 846 | // We could potentially keep looking, but we'd need to make sure that | ||

758 | // it was safe to move I and also all the instruction in InstsToMove | 847 | // it was safe to move I and also all the instruction in InstsToMove | ||

759 | // down past this instruction. | 848 | // down past this instruction. | ||

760 | // check if we can move I across MBBI and if we can move all I's users | 849 | // check if we can move I across MBBI and if we can move all I's users | ||

▲ Show 20 Lines • Show All 180 Lines • ▼ Show 20 Line(s) | 971 | SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) { | |||

941 | CI.I->eraseFromParent(); | 1030 | CI.I->eraseFromParent(); | ||

942 | CI.Paired->eraseFromParent(); | 1031 | CI.Paired->eraseFromParent(); | ||

943 | 1032 | | |||

944 | LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); | 1033 | LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); | ||

945 | return Write2; | 1034 | return Write2; | ||

946 | } | 1035 | } | ||

947 | 1036 | | |||

948 | MachineBasicBlock::iterator | 1037 | MachineBasicBlock::iterator | ||

1038 | SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI) { | ||||

1039 | MachineBasicBlock *MBB = CI.I->getParent(); | ||||

1040 | DebugLoc DL = CI.I->getDebugLoc(); | ||||

1041 | const unsigned Opcode = getNewOpcode(CI); | ||||

1042 | | ||||

1043 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); | ||||

1044 | | ||||

1045 | Register DestReg = MRI->createVirtualRegister(SuperRC); | ||||

1046 | unsigned MergedDMask = CI.DMask0 | CI.DMask1; | ||||

1047 | unsigned DMaskIdx = | ||||

1048 | AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); | ||||

1049 | | ||||

1050 | auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg); | ||||

1051 | for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { | ||||

1052 | if (I == DMaskIdx) | ||||

1053 | MIB.addImm(MergedDMask); | ||||

1054 | else | ||||

1055 | MIB.add((*CI.I).getOperand(I)); | ||||

1056 | } | ||||

1057 | | ||||

1058 | // It shouldn't be possible to get this far if the two instructions | ||||

1059 | // don't have a single memoperand, because MachineInstr::mayAlias() | ||||

1060 | // will return true if this is the case. | ||||

1061 | assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand()); | ||||

1062 | | ||||

1063 | const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); | ||||

1064 | const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); | ||||

1065 | | ||||

1066 | MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); | ||||

1067 | | ||||

1068 | std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI); | ||||

1069 | const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); | ||||

1070 | const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); | ||||

1071 | | ||||

1072 | // Copy to the old destination registers. | ||||

1073 | const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); | ||||

1074 | const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); | ||||

1075 | const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata); | ||||

1076 | | ||||

1077 | BuildMI(*MBB, CI.Paired, DL, CopyDesc) | ||||

1078 | .add(*Dest0) // Copy to same destination including flags and sub reg. | ||||

1079 | .addReg(DestReg, 0, SubRegIdx0); | ||||

1080 | MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc) | ||||

1081 | .add(*Dest1) | ||||

1082 | .addReg(DestReg, RegState::Kill, SubRegIdx1); | ||||

1083 | | ||||

1084 | moveInstsAfter(Copy1, CI.InstsToMove); | ||||

1085 | | ||||

1086 | CI.I->eraseFromParent(); | ||||

1087 | CI.Paired->eraseFromParent(); | ||||

1088 | return New; | ||||

1089 | } | ||||

1090 | | ||||

1091 | MachineBasicBlock::iterator | ||||

949 | SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { | 1092 | SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { | ||

950 | MachineBasicBlock *MBB = CI.I->getParent(); | 1093 | MachineBasicBlock *MBB = CI.I->getParent(); | ||

951 | DebugLoc DL = CI.I->getDebugLoc(); | 1094 | DebugLoc DL = CI.I->getDebugLoc(); | ||

952 | const unsigned Opcode = getNewOpcode(CI); | 1095 | const unsigned Opcode = getNewOpcode(CI); | ||

953 | 1096 | | |||

954 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); | 1097 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); | ||

955 | 1098 | | |||

956 | Register DestReg = MRI->createVirtualRegister(SuperRC); | 1099 | Register DestReg = MRI->createVirtualRegister(SuperRC); | ||

▲ Show 20 Lines • Show All 115 Lines • ▼ Show 20 Line(s) | 1214 | case S_BUFFER_LOAD_IMM: | |||

1072 | switch (Width) { | 1215 | switch (Width) { | ||

1073 | default: | 1216 | default: | ||

1074 | return 0; | 1217 | return 0; | ||

1075 | case 2: | 1218 | case 2: | ||

1076 | return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; | 1219 | return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; | ||

1077 | case 4: | 1220 | case 4: | ||

1078 | return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; | 1221 | return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; | ||

1079 | } | 1222 | } | ||

1223 | case MIMG: | ||||

1224 | assert("No overlaps" && (countPopulation(CI.DMask0 | CI.DMask1) == Width)); | ||||

1225 | return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); | ||||

1080 | } | 1226 | } | ||

1081 | } | 1227 | } | ||

1082 | 1228 | | |||

1083 | std::pair<unsigned, unsigned> | 1229 | std::pair<unsigned, unsigned> | ||

1084 | SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) { | 1230 | SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) { | ||

1085 | 1231 | | |||

1086 | if (CI.Width0 == 0 || CI.Width0 == 0 || CI.Width0 + CI.Width1 > 4) | 1232 | if (CI.Width0 == 0 || CI.Width0 == 0 || CI.Width0 + CI.Width1 > 4) | ||

1087 | return std::make_pair(0, 0); | 1233 | return std::make_pair(0, 0); | ||

1088 | 1234 | | |||

1089 | bool ReverseOrder = CI.Offset0 > CI.Offset1; | 1235 | bool ReverseOrder; | ||

1236 | if (CI.InstClass == MIMG) { | ||||

1237 | assert((countPopulation(CI.DMask0 | CI.DMask1) == CI.Width0 + CI.Width1) && | ||||

1238 | "No overlaps"); | ||||

1239 | ReverseOrder = CI.DMask0 > CI.DMask1; | ||||

1240 | } else | ||||

1241 | ReverseOrder = CI.Offset0 > CI.Offset1; | ||||

1090 | 1242 | | |||

1091 | static const unsigned Idxs[4][4] = { | 1243 | static const unsigned Idxs[4][4] = { | ||

1092 | {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, | 1244 | {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, | ||

1093 | {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0}, | 1245 | {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0}, | ||

1094 | {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0}, | 1246 | {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0}, | ||

1095 | {AMDGPU::sub3, 0, 0, 0}, | 1247 | {AMDGPU::sub3, 0, 0, 0}, | ||

1096 | }; | 1248 | }; | ||

1097 | unsigned Idx0; | 1249 | unsigned Idx0; | ||

▲ Show 20 Lines • Show All 552 Lines • ▼ Show 20 Line(s) | 1801 | case BUFFER_STORE: | |||

1650 | if (findMatchingInst(CI)) { | 1802 | if (findMatchingInst(CI)) { | ||

1651 | Modified = true; | 1803 | Modified = true; | ||

1652 | removeCombinedInst(MergeList, *CI.Paired); | 1804 | removeCombinedInst(MergeList, *CI.Paired); | ||

1653 | MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI); | 1805 | MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI); | ||

1654 | CI.setMI(NewMI, *TII, *STM); | 1806 | CI.setMI(NewMI, *TII, *STM); | ||

1655 | OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; | 1807 | OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; | ||

1656 | } | 1808 | } | ||

1657 | break; | 1809 | break; | ||

1810 | case MIMG: | ||||

1811 | if (findMatchingInst(CI)) { | ||||

1812 | Modified = true; | ||||

1813 | removeCombinedInst(MergeList, *CI.Paired); | ||||

1814 | MachineBasicBlock::iterator NewMI = mergeImagePair(CI); | ||||

1815 | CI.setMI(NewMI, *TII, *STM); | ||||

1816 | OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; | ||||

1817 | } | ||||

1818 | break; | ||||

1658 | } | 1819 | } | ||

1659 | // Clear the InstsToMove after we have finished searching so we don't have | 1820 | // Clear the InstsToMove after we have finished searching so we don't have | ||

1660 | // stale values left over if we search for this CI again in another pass | 1821 | // stale values left over if we search for this CI again in another pass | ||

1661 | // over the block. | 1822 | // over the block. | ||

1662 | CI.InstsToMove.clear(); | 1823 | CI.InstsToMove.clear(); | ||

1663 | } | 1824 | } | ||

1664 | 1825 | | |||

1665 | return Modified; | 1826 | return Modified; | ||

Show All 35 Lines |