Index: lib/Target/AMDGPU/AMDGPUInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -193,6 +193,8 @@
   /// equivalent opcode that writes \p Channels Channels.
   int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const;
 
+  /// Return the equivalent opcode that loads \p Size dwords, or -1 if none.
+  int getResizedLoadOp(uint16_t Opcode, unsigned Size) const;
 };
 
 namespace AMDGPU {
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -323,6 +323,19 @@
   }
 }
 
+int AMDGPUInstrInfo::getResizedLoadOp(uint16_t Opcode, unsigned Size) const {
+  // Translate the plain dword count into the enum value expected by the
+  // TableGen-generated mapping.
+  AMDGPU::Size InSize;
+  switch (Size) {
+  case 1: InSize = AMDGPU::Size_1; break;
+  case 2: InSize = AMDGPU::Size_2; break;
+  case 3: InSize = AMDGPU::Size_3; break;
+  case 4: InSize = AMDGPU::Size_4; break;
+  default: return -1;
+  }
+  return AMDGPU::getResizedLoadOp(Opcode, InSize);
+}
+
 // Wrapper for Tablegen'd function. enum Subtarget is not defined in any
 // header files, so we need to wrap it in a function that takes unsigned
 // instead.
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -43,6 +43,9 @@
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
 
   void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
+  void adjustLoadSize(MachineSDNode *&N, SelectionDAG &DAG) const;
+
+  LaneBitmask findUsedLanes(SDNode *N) const;
 
   SDValue performUCharToFloatCombine(SDNode *N,
                                      DAGCombinerInfo &DCI) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -33,6 +33,8 @@
 #include "llvm/IR/Function.h"
 #include "llvm/ADT/SmallString.h"
 
+#define DEBUG_TYPE "si-isel-lowering"
+
 using namespace llvm;
 
 SITargetLowering::SITargetLowering(TargetMachine &TM,
@@ -2090,6 +2092,89 @@
   }
 }
 
+/// Returns a bitmask of the lanes of the vector result of \p N that are
+/// actually used. An early out is taken as soon as the highest lane is known
+/// to be used; ~0 is returned in that case.
+LaneBitmask SITargetLowering::findUsedLanes(SDNode *N) const {
+  assert(N->getNumValues() == 1);
+  const unsigned NumElements = N->getValueType(0).getVectorNumElements();
+  const LaneBitmask HighestMask = (LaneBitmask)1 << (NumElements - 1);
+  LaneBitmask Mask = 0;
+
+  for (SDNode *Use : N->uses()) {
+    if (!Use->isMachineOpcode()) {
+      DEBUG(dbgs() << "findUsedLanes: non-machine opcode\n");
+      return ~(LaneBitmask)0;
+    }
+
+    if (Use->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) {
+      DEBUG(dbgs() << "findUsedLanes: unsupported opcode "
+                   << Use->getMachineOpcode() << '\n');
+      return ~(LaneBitmask)0;
+    }
+
+    // Operand 1 of EXTRACT_SUBREG is the subregister index; accumulate the
+    // lanes it covers.
+    Mask |= Subtarget->getRegisterInfo()->getSubRegIndexLaneMask(
+        Use->getConstantOperandVal(1));
+    if (Mask & HighestMask)
+      return ~(LaneBitmask)0;
+  }
+
+  return Mask;
+}
+
+/// Reduce the size of a load instruction if only a prefix of the returned
+/// channels is used. Currently only used for BUFFER_LOAD_FORMAT_XYZW.
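+///
+/// For example, if every user of a BUFFER_LOAD_FORMAT_XYZW node extracts only
+/// the first two channels, the node is rewritten to a BUFFER_LOAD_FORMAT_XY
+/// that loads just those two dwords.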
+void SITargetLowering::adjustLoadSize(MachineSDNode *&N,
+                                      SelectionDAG &DAG) const {
+  EVT OriginalType = N->getValueType(0);
+
+  if (!OriginalType.isVector())
+    return;
+
+  const LaneBitmask Mask = findUsedLanes(N);
+  if (Mask == ~(LaneBitmask)0)
+    return;
+  if (!Mask) {
+    DEBUG(dbgs() << "adjustLoadSize: dead load has not been eliminated\n");
+    return;
+  }
+
+  // Only a trailing run of channels can be dropped, so the required size is
+  // determined by the highest used lane, not the number of used lanes.
+  const unsigned RequiredSize = findLastSet(Mask) + 1;
+
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+  const unsigned NewOpcode =
+      TII->getResizedLoadOp(N->getMachineOpcode(), RequiredSize);
+
+  // Make a temporary copy of the operands to avoid problems with in-place
+  // mutation.
+  std::vector<SDValue> Ops;
+  Ops.insert(Ops.end(), N->op_begin(), N->op_end());
+
+  if (RequiredSize > 1) {
+    // We do not adjust the type of the node here, because MachineValueTypes
+    // do not support v3f32 properly. The correct machine register class will
+    // eventually be selected based on the opcode once the MachineInstr is
+    // built.
+    N = static_cast<MachineSDNode *>(
+        DAG.SelectNodeTo(N, NewOpcode, OriginalType, Ops));
+  } else {
+    // Bypass the EXTRACT_SUBREG instructions here, because trying to deal
+    // with v1xx types is a headache.
+    const EVT NewType = OriginalType.getVectorElementType();
+    N = static_cast<MachineSDNode *>(
+        DAG.SelectNodeTo(N, NewOpcode, NewType, Ops));
+
+    for (SDNode *Use : N->uses()) {
+      assert(Use->getMachineOpcode() == TargetOpcode::EXTRACT_SUBREG);
+      DAG.ReplaceAllUsesWith(Use, N);
+    }
+  }
+}
+
 static bool isFrameIndexOp(SDValue Op) {
   if (Op.getOpcode() == ISD::AssertZext)
     Op = Op.getOperand(0);
@@ -2125,11 +2210,15 @@
   const SIInstrInfo *TII =
       static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
 
-  if (TII->isMIMG(Node->getMachineOpcode()))
-    adjustWritemask(Node, DAG);
+  const unsigned Opcode = Node->getMachineOpcode();
 
-  if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG ||
-      Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) {
+  if (TII->isMIMG(Opcode)) {
+    adjustWritemask(Node, DAG);
+  } else if (TII->isMUBUF(Opcode)) {
+    if (TII->getResizedLoadOp(Opcode, 1) >= 0)
+      adjustLoadSize(Node, DAG);
+  } else if (Opcode == AMDGPU::INSERT_SUBREG ||
+             Opcode == AMDGPU::REG_SEQUENCE) {
     legalizeTargetIndependentNode(Node, DAG);
     return Node;
   }
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -89,6 +89,11 @@
   int VI = 1;
 }
 
+class ResizableLoad <string group, int size> {
+  string Group = group;
+  int Size = size;
+}
+
 //===----------------------------------------------------------------------===//
 // SI DAG Nodes
 //===----------------------------------------------------------------------===//
@@ -2778,6 +2783,14 @@
   let ValueCols = [["1"], ["2"], ["3"]];
 }
 
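+// Maps a resizable load opcode to the opcode in the same Group, and with the
+// same addressing mode, that loads the given number of dwords. TableGen's
+// InstrMapping backend emits this as AMDGPU::getResizedLoadOp() together
+// with the AMDGPU::Size enum used by the wrapper in AMDGPUInstrInfo.cpp.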
"buffer_load_format_xy", VReg_64 ->; +>, ResizableLoad <"buffer_load_format", 2>; defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Load_Helper < mubuf<0x02>, "buffer_load_format_xyz", VReg_96 ->; +>, ResizableLoad <"buffer_load_format", 3>; defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper < mubuf<0x03>, "buffer_load_format_xyzw", VReg_128 ->; +>, ResizableLoad <"buffer_load_format", 4>; defm BUFFER_STORE_FORMAT_X : MUBUF_Store_Helper < mubuf<0x04>, "buffer_store_format_x", VGPR_32 >;