Index: llvm/lib/Target/AMDGPU/AMDGPUGISel.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -212,6 +212,9 @@ def gi_as_i16timm : GICustomOperandRenderer<"renderTruncTImm">, GISDNodeXFormEquiv; +def gi_as_i1timm : GICustomOperandRenderer<"renderTruncTImm1">, + GISDNodeXFormEquiv; + def gi_NegateImm : GICustomOperandRenderer<"renderNegateImm">, GISDNodeXFormEquiv; Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -175,6 +175,8 @@ void renderTruncTImm(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; + void renderTruncTImm1(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; void renderNegateImm(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2266,6 +2266,12 @@ MIB.addImm(MI.getOperand(OpIdx).getImm()); } +void AMDGPUInstructionSelector::renderTruncTImm1(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + MIB.addImm(MI.getOperand(OpIdx).getImm()); +} + bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const { return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm()); } Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -2132,6 +2132,15 @@ constrainOpWithReadfirstlane(MI, MRI, 3); // Index return; } + case Intrinsic::amdgcn_permlane16: + case 
Intrinsic::amdgcn_permlanex16: { + // Doing a waterfall loop over these wouldn't make any sense. + substituteSimpleCopyRegs(OpdMapper, 2); + substituteSimpleCopyRegs(OpdMapper, 3); + constrainOpWithReadfirstlane(MI, MRI, 4); + constrainOpWithReadfirstlane(MI, MRI, 5); + return; + } default: break; } @@ -3054,6 +3063,17 @@ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; } + case Intrinsic::amdgcn_permlane16: + case Intrinsic::amdgcn_permlanex16: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + // Map each scalar source from its own register (operands 4 and 5). + OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); + break; + } case Intrinsic::amdgcn_mfma_f32_4x4x1f32: case Intrinsic::amdgcn_mfma_f32_4x4x4f16: case Intrinsic::amdgcn_mfma_i32_4x4x4i8: Index: llvm/lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -665,7 +665,7 @@ defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32, 0>; defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32, 0>; -def as_i1imm : SDNodeXFormgetTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1); }]>; Index: llvm/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstructions.td +++ llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1901,7 +1901,7 @@ timm:$bound_ctrl)), (V_MOV_B64_DPP_PSEUDO $src, $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask), (as_i32imm $bank_mask), - (as_i1imm $bound_ctrl)) + (as_i1timm $bound_ctrl)) >; def : GCNPat < @@ -1909,7 +1909,7 @@ timm:$bank_mask, 
timm:$bound_ctrl)), (V_MOV_B64_DPP_PSEUDO $old, $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask), (as_i32imm $bank_mask), - (as_i1imm $bound_ctrl)) + (as_i1timm $bound_ctrl)) >; //===----------------------------------------------------------------------===// Index: llvm/lib/Target/AMDGPU/SMInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/SMInstructions.td +++ llvm/lib/Target/AMDGPU/SMInstructions.td @@ -769,22 +769,22 @@ // 1. Offset as an immediate def : GCNPat < (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), i1:$glc, i1:$dlc), - (vt (!cast(Instr#"_IMM") $sbase, $offset, (as_i1imm $glc), - (as_i1imm $dlc))) + (vt (!cast(Instr#"_IMM") $sbase, $offset, (as_i1timm $glc), + (as_i1timm $dlc))) >; // 2. 32-bit IMM offset on CI def : GCNPat < (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), i1:$glc, i1:$dlc)), - (!cast(Instr#"_IMM_ci") $sbase, $offset, (as_i1imm $glc), (as_i1imm $dlc))> { + (!cast(Instr#"_IMM_ci") $sbase, $offset, (as_i1timm $glc), (as_i1timm $dlc))> { let OtherPredicates = [isGFX7Only]; } // 3. 
Offset loaded in an 32bit SGPR def : GCNPat < (SIsbuffer_load v4i32:$sbase, i32:$offset, i1:$glc, i1:$dlc), - (vt (!cast(Instr#"_SGPR") $sbase, $offset, (as_i1imm $glc), - (as_i1imm $dlc))) + (vt (!cast(Instr#"_SGPR") $sbase, $offset, (as_i1timm $glc), + (as_i1timm $dlc))) >; } Index: llvm/lib/Target/AMDGPU/VOP1Instructions.td =================================================================== --- llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -839,7 +839,7 @@ timm:$bound_ctrl)), (V_MOV_B32_dpp $src, $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask), (as_i32imm $bank_mask), - (as_i1imm $bound_ctrl)) + (as_i1timm $bound_ctrl)) >; def : GCNPat < @@ -847,7 +847,7 @@ timm:$bank_mask, timm:$bound_ctrl)), (V_MOV_B32_dpp $old, $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask), (as_i32imm $bank_mask), - (as_i1imm $bound_ctrl)) + (as_i1timm $bound_ctrl)) >; } // End OtherPredicates = [isGFX8Plus] Index: llvm/lib/Target/AMDGPU/VOP3Instructions.td =================================================================== --- llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -643,8 +643,8 @@ Instruction inst> : GCNPat< (permlane i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, timm:$fi, timm:$bc), - (inst (as_i1imm $fi), $src0, (as_i1imm $bc), - $src1, 0, $src2, $vdst_in) + (inst (as_i1timm $fi), VGPR_32:$src0, (as_i1timm $bc), + SCSrc_b32:$src1, 0, SCSrc_b32:$src2, VGPR_32:$vdst_in) >; // Permlane intrinsic that has either fetch invalid or bound control @@ -656,13 +656,19 @@ $src1, node:$src2, node:$fi, node:$bc)> { let PredicateCode = [{ return N->getConstantOperandVal(5) != 0 || N->getConstantOperandVal(6) != 0; }]; + let GISelPredicateCode = [{ + return MI.getOperand(6).getImm() != 0 || + MI.getOperand(7).getImm() != 0; + }]; } // Drop the input value if it won't be read. 
class PermlaneDiscardVDstIn : GCNPat< - (permlane srcvalue, i32:$src0, i32:$src1, i32:$src2, timm:$fi, timm:$bc), - (inst (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, + (permlane srcvalue, i32:$src0, i32:$src1, i32:$src2, + timm:$fi, timm:$bc), + (inst (as_i1timm $fi), VGPR_32:$src0, (as_i1timm $bc), + SCSrc_b32:$src1, 0, SCSrc_b32:$src2, (IMPLICIT_DEF)) >; @@ -685,7 +691,6 @@ def : PermlaneDiscardVDstIn< BoundControlOrFetchInvalidPermlane, V_PERMLANEX16_B32>; - } // End SubtargetPredicate = isGFX10Plus //===----------------------------------------------------------------------===// Index: llvm/lib/Target/AMDGPU/VOP3PInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -287,7 +287,7 @@ (dot_op (dot_inst.Pfl.Src0VT (VOP3PMods0 dot_inst.Pfl.Src0VT:$src0, i32:$src0_modifiers)), (dot_inst.Pfl.Src1VT (VOP3PMods dot_inst.Pfl.Src1VT:$src1, i32:$src1_modifiers)), (dot_inst.Pfl.Src2VT (VOP3PMods dot_inst.Pfl.Src2VT:$src2, i32:$src2_modifiers)), i1:$clamp), - (dot_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, (as_i1imm $clamp))>; + (dot_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, (as_i1timm $clamp))>; } defm : DotPats; Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.permlane.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.permlane.ll @@ -0,0 +1 @@ +; RUN: llc -global-isel -amdgpu-load-store-vectorizer=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %S/../llvm.amdgcn.permlane.ll | FileCheck -check-prefixes=GCN,GFX10 %S/../llvm.amdgcn.permlane.ll Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ 
-1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s +; RUN: llc -amdgpu-load-store-vectorizer=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) #1 declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) #1