Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -1437,6 +1437,18 @@ // GFX10 Intrinsics //===----------------------------------------------------------------------===// +// llvm.amdgcn.permlane16 +def int_amdgcn_permlane16 : + Intrinsic<[llvm_i32_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], + [IntrNoMem, IntrConvergent]>; + +// llvm.amdgcn.permlanex16 +def int_amdgcn_permlanex16 : + Intrinsic<[llvm_i32_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], + [IntrNoMem, IntrConvergent]>; + def int_amdgcn_s_get_waveid_in_workgroup : GCCBuiltin<"__builtin_amdgcn_s_get_waveid_in_workgroup">, Intrinsic<[llvm_i32_ty], [], [IntrReadMem]>; Index: lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp =================================================================== --- lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1148,6 +1148,7 @@ bool validateMIMGD16(const MCInst &Inst); bool validateMIMGDim(const MCInst &Inst); bool validateLdsDirect(const MCInst &Inst); + bool validateOpSel(const MCInst &Inst); bool validateVOP3Literal(const MCInst &Inst) const; bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; @@ -3003,6 +3004,19 @@ return NumLiterals <= 1; } +bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) { + const unsigned Opc = Inst.getOpcode(); + if (Opc == AMDGPU::V_PERMLANE16_B32_gfx10 || + Opc == AMDGPU::V_PERMLANEX16_B32_gfx10) { + int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); + unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); + + if (OpSel & ~3) + return false; + } + return true; +} + // VOP3 literal is only allowed in GFX10+ and only one can be used bool AMDGPUAsmParser::validateVOP3Literal(const 
MCInst &Inst) const { unsigned Opcode = Inst.getOpcode(); @@ -3071,6 +3085,11 @@ "integer clamping is not supported on this GPU"); return false; } + if (!validateOpSel(Inst)) { + Error(IDLoc, + "invalid op_sel operand"); + return false; + } // For MUBUF/MTBUF d16 is a part of opcode, so there is nothing to validate. if (!validateMIMGD16(Inst)) { Error(IDLoc, Index: lib/Target/AMDGPU/GCNHazardRecognizer.h =================================================================== --- lib/Target/AMDGPU/GCNHazardRecognizer.h +++ lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -84,8 +84,8 @@ int checkAnyInstHazards(MachineInstr *MI); int checkReadM0Hazards(MachineInstr *SMovRel); int checkNSAtoVMEMHazard(MachineInstr *MI); - void fixHazards(MachineInstr *MI); + bool fixVcmpxPermlaneHazards(MachineInstr *MI); bool fixVMEMtoScalarWriteHazards(MachineInstr *MI); bool fixSMEMtoVectorWriteHazards(MachineInstr *MI); bool fixVcmpxExecWARHazard(MachineInstr *MI); Index: lib/Target/AMDGPU/GCNHazardRecognizer.cpp =================================================================== --- lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -115,6 +115,12 @@ } } +static bool isPermlane(const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + return Opcode == AMDGPU::V_PERMLANE16_B32 || + Opcode == AMDGPU::V_PERMLANEX16_B32; +} + static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { const MachineOperand *RegOp = TII->getNamedOperand(RegInstr, AMDGPU::OpName::simm16); @@ -835,11 +841,49 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { fixVMEMtoScalarWriteHazards(MI); + fixVcmpxPermlaneHazards(MI); fixSMEMtoVectorWriteHazards(MI); fixVcmpxExecWARHazard(MI); fixLdsBranchVmemWARHazard(MI); } +bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { + if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI)) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + auto IsHazardFn = [TII] (MachineInstr 
*MI) { + return TII->isVOPC(*MI); + }; + + auto IsExpiredFn = [] (MachineInstr *MI, int) { + if (!MI) + return false; + unsigned Opc = MI->getOpcode(); + return SIInstrInfo::isVALU(*MI) && + Opc != AMDGPU::V_NOP_e32 && + Opc != AMDGPU::V_NOP_e64 && + Opc != AMDGPU::V_NOP_sdwa; + }; + + if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == + std::numeric_limits<int>::max()) + return false; + + // V_NOP will be discarded by SQ. + // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE* + // which is always a VGPR and available. + auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0); + unsigned Reg = Src0->getReg(); + bool IsUndef = Src0->isUndef(); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::V_MOV_B32_e32)) + .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0)) + .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill); + + return true; +} + bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) { if (!ST.hasVMEMtoScalarWriteHazard()) return false; Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1005,6 +1005,18 @@ void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned, const MCSubtargetInfo &STI, raw_ostream &O) { + unsigned Opc = MI->getOpcode(); + if (Opc == AMDGPU::V_PERMLANE16_B32_gfx10 || + Opc == AMDGPU::V_PERMLANEX16_B32_gfx10) { + auto FIN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); + auto BCN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers); + unsigned FI = !!(MI->getOperand(FIN).getImm() & SISrcMods::OP_SEL_0); + unsigned BC = !!(MI->getOperand(BCN).getImm() & SISrcMods::OP_SEL_0); + if (FI || BC) + O << " op_sel:[" << FI << ',' << BC << ']'; + return; + } + printPackedModifier(MI, " op_sel:[", SISrcMods::OP_SEL_0, O); } Index: 
lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -9708,6 +9708,24 @@ Ops.push_back(ImpDef.getValue(1)); return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); } + case AMDGPU::V_PERMLANE16_B32: + case AMDGPU::V_PERMLANEX16_B32: { + ConstantSDNode *FI = cast<ConstantSDNode>(Node->getOperand(0)); + ConstantSDNode *BC = cast<ConstantSDNode>(Node->getOperand(2)); + if (!FI->getZExtValue() && !BC->getZExtValue()) + break; + SDValue VDstIn = Node->getOperand(6); + if (VDstIn.isMachineOpcode() + && VDstIn.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) + break; + MachineSDNode *ImpDef = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, + SDLoc(Node), MVT::i32); + SmallVector<SDValue, 8> Ops = { SDValue(FI, 0), Node->getOperand(1), + SDValue(BC, 0), Node->getOperand(3), + Node->getOperand(4), Node->getOperand(5), + SDValue(ImpDef, 0), Node->getOperand(7) }; + return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); + } default: break; } Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3790,6 +3790,26 @@ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) }; + if (Opc == AMDGPU::V_PERMLANE16_B32 || + Opc == AMDGPU::V_PERMLANEX16_B32) { + // src1 and src2 must be scalar + MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]); + MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]); + const DebugLoc &DL = MI.getDebugLoc(); + if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) { + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) + .add(Src1); + Src1.ChangeToRegister(Reg, false); + } + if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) { + unsigned Reg = 
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) + .add(Src2); + Src2.ChangeToRegister(Reg, false); + } + } + // Find the one SGPR operand we are allowed to use. int ConstantBusLimit = ST.getConstantBusLimit(Opc); int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; Index: lib/Target/AMDGPU/VOP3Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP3Instructions.td +++ lib/Target/AMDGPU/VOP3Instructions.td @@ -625,9 +625,35 @@ } // End SubtargetPredicate = isGFX9Plus +def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> { + let Src0RC64 = VRegSrc_32; + let Src1RC64 = SCSrc_b32; + let Src2RC64 = SCSrc_b32; + let InsVOP3OpSel = (ins IntOpSelMods:$src0_modifiers, VRegSrc_32:$src0, + IntOpSelMods:$src1_modifiers, SCSrc_b32:$src1, + IntOpSelMods:$src2_modifiers, SCSrc_b32:$src2, + VGPR_32:$vdst_in, op_sel:$op_sel); + let HasClamp = 0; + let HasOMod = 0; +} + let SubtargetPredicate = isGFX10Plus in { def V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32>; + + let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { + def V_PERMLANE16_B32 : VOP3Inst <"v_permlane16_b32", VOP3_PERMLANE_Profile>; + def V_PERMLANEX16_B32 : VOP3Inst <"v_permlanex16_b32", VOP3_PERMLANE_Profile>; + } // End $vdst = $vdst_in, DisableEncoding $vdst_in + + def : GCNPat< + (int_amdgcn_permlane16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, imm:$fi, imm:$bc), + (V_PERMLANE16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in) + >; + def : GCNPat< + (int_amdgcn_permlanex16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, imm:$fi, imm:$bc), + (V_PERMLANEX16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in) + >; } // End SubtargetPredicate = isGFX10Plus //===----------------------------------------------------------------------===// @@ -790,6 +816,8 @@ defm V_MIN_U16 : 
VOP3_Real_gfx10_with_name<0x30b, "V_MIN_U16_e64", "v_min_u16">; defm V_MIN_I16 : VOP3_Real_gfx10_with_name<0x30c, "V_MIN_I16_e64", "v_min_i16">; defm V_LSHLREV_B16 : VOP3_Real_gfx10_with_name<0x314, "V_LSHLREV_B16_e64", "v_lshlrev_b16">; +defm V_PERMLANE16_B32 : VOP3OpSel_Real_gfx10<0x377>; +defm V_PERMLANEX16_B32 : VOP3OpSel_Real_gfx10<0x378>; //===----------------------------------------------------------------------===// // GFX7, GFX10. Index: test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -0,0 +1,311 @@ +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s + +declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) #1 +declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) #1 +declare i32 @llvm.amdgcn.workitem.id.x() +declare i32 @llvm.amdgcn.workitem.id.y() + +; GCN-LABEL: {{^}}v_permlane16_b32_vss: +; GFX10-NOT: v_readfirstlane_b32 +; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}{{$}} +define amdgpu_kernel void @v_permlane16_b32_vss(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 { + %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 0, i1 0) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlane16_b32_vii: +; GFX10-NOT: v_readfirstlane_b32 +; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2{{$}} +define amdgpu_kernel void @v_permlane16_b32_vii(i32 addrspace(1)* %out, i32 %src0) #1 { + %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 1, i32 2, i1 0, i1 0) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlane16_b32_vll: +; FIXME-GFX10: It is allowed to have both immediates as literals +; GFX10-DAG: s_movk_i32 [[SRC1:s[0-9]+]], 0x1234 +; GFX10-DAG: s_mov_b32 [[SRC2:s[0-9]+]], 0xc1d1 +; GFX10-NOT: 
v_readfirstlane_b32 +; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], [[SRC2]]{{$}} +define amdgpu_kernel void @v_permlane16_b32_vll(i32 addrspace(1)* %out, i32 %src0) #1 { + %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 0, i1 0) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlane16_b32_vvv: +; GFX10-DAG: v_readfirstlane_b32 [[SRC1:s[0-9]+]], v0 +; GFX10-DAG: v_readfirstlane_b32 [[SRC2:s[0-9]+]], v1 +; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], [[SRC2]]{{$}} +define amdgpu_kernel void @v_permlane16_b32_vvv(i32 addrspace(1)* %out, i32 %src0) #1 { + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 0, i1 0) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlane16_b32_vvs: +; GFX10-NOT: v_readfirstlane_b32 +; GFX10: v_readfirstlane_b32 [[SRC1:s[0-9]+]], v0 +; GFX10-NOT: v_readfirstlane_b32 +; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], s{{[0-9]+}}{{$}} +define amdgpu_kernel void @v_permlane16_b32_vvs(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #1 { + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 0, i1 0) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlane16_b32_vsv: +; GFX10-NOT: v_readfirstlane_b32 +; GFX10: v_readfirstlane_b32 [[SRC2:s[0-9]+]], v1 +; GFX10-NOT: v_readfirstlane_b32 +; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, [[SRC2]]{{$}} +define amdgpu_kernel void @v_permlane16_b32_vsv(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 { + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 0, i1 0) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: 
{{^}}v_permlane16_b32_vss_fi: +; GFX10-NOT: v_readfirstlane_b32 +; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,0]{{$}} +define amdgpu_kernel void @v_permlane16_b32_vss_fi(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 { + %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 0) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlane16_b32_vss_bc: +; GFX10-NOT: v_readfirstlane_b32 +; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1]{{$}} +define amdgpu_kernel void @v_permlane16_b32_vss_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 { + %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 0, i1 1) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlane16_b32_vss_fi_bc: +; GFX10-NOT: v_readfirstlane_b32 +; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,1]{{$}} +define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 { + %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 1) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlanex16_b32_vss: +; GFX10-NOT: v_readfirstlane_b32 +; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}{{$}} +define amdgpu_kernel void @v_permlanex16_b32_vss(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 { + %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 0, i1 0) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlanex16_b32_vii: +; GFX10-NOT: v_readfirstlane_b32 +; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2{{$}} +define amdgpu_kernel void @v_permlanex16_b32_vii(i32 addrspace(1)* %out, i32 %src0) #1 { + %v = call i32 
@llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 1, i32 2, i1 0, i1 0) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlanex16_b32_vll: +; FIXME-GFX10: It is allowed to have both immediates as literals +; GFX10-DAG: s_movk_i32 [[SRC1:s[0-9]+]], 0x1234 +; GFX10-DAG: s_mov_b32 [[SRC2:s[0-9]+]], 0xc1d1 +; GFX10-NOT: v_readfirstlane_b32 +; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], [[SRC2]]{{$}} +define amdgpu_kernel void @v_permlanex16_b32_vll(i32 addrspace(1)* %out, i32 %src0) #1 { + %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 0, i1 0) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlanex16_b32_vvv: +; GFX10-DAG: v_readfirstlane_b32 [[SRC1:s[0-9]+]], v0 +; GFX10-DAG: v_readfirstlane_b32 [[SRC2:s[0-9]+]], v1 +; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], [[SRC2]]{{$}} +define amdgpu_kernel void @v_permlanex16_b32_vvv(i32 addrspace(1)* %out, i32 %src0) #1 { + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 0, i1 0) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlanex16_b32_vvs: +; GFX10-NOT: v_readfirstlane_b32 +; GFX10: v_readfirstlane_b32 [[SRC1:s[0-9]+]], v0 +; GFX10-NOT: v_readfirstlane_b32 +; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], s{{[0-9]+}}{{$}} +define amdgpu_kernel void @v_permlanex16_b32_vvs(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #1 { + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 0, i1 0) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlanex16_b32_vsv: +; GFX10-NOT: v_readfirstlane_b32 +; GFX10: v_readfirstlane_b32 [[SRC2:s[0-9]+]], v1 +; GFX10-NOT: v_readfirstlane_b32 +; GFX10: v_permlanex16_b32 
v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, [[SRC2]]{{$}} +define amdgpu_kernel void @v_permlanex16_b32_vsv(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 { + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 0, i1 0) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlanex16_b32_vss_fi: +; GFX10-NOT: v_readfirstlane_b32 +; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,0]{{$}} +define amdgpu_kernel void @v_permlanex16_b32_vss_fi(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 { + %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 0) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlanex16_b32_vss_bc: +; GFX10-NOT: v_readfirstlane_b32 +; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1]{{$}} +define amdgpu_kernel void @v_permlanex16_b32_vss_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 { + %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 0, i1 1) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlanex16_b32_vss_fi_bc: +; GFX10-NOT: v_readfirstlane_b32 +; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,1]{{$}} +define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 { + %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 1) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlane16_b32_tid_tid: +; GFX10: v_permlane16_b32 v0, v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}} +define amdgpu_kernel void @v_permlane16_b32_tid_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 { + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 
@llvm.amdgcn.permlane16(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlane16_b32_undef_tid: +; GFX10: v_permlane16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}} +define amdgpu_kernel void @v_permlane16_b32_undef_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 { + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlane16(i32 undef, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlane16_b32_i_tid: +; GFX10: v_mov_b32_e32 [[OLD:v[0-9]+]], 0x3039 +; GFX10: v_permlane16_b32 [[OLD]], v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}} +define amdgpu_kernel void @v_permlane16_b32_i_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 { + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlane16_b32_i_tid_fi: +; GFX10-NOT: 0x3039 +; GFX10: v_permlane16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,0]{{$}} +define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 { + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 1, i1 0) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlane16_b32_i_tid_bc: +; GFX10-NOT: 0x3039 +; GFX10: v_permlane16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1]{{$}} +define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 { + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 1) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: 
{{^}}v_permlane16_b32_i_tid_fi_bc: +; GFX10-NOT: 0x3039 +; GFX10: v_permlane16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,1]{{$}} +define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 { + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 1, i1 1) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlanex16_b32_tid_tid: +; GFX10: v_permlanex16_b32 v0, v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}} +define amdgpu_kernel void @v_permlanex16_b32_tid_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 { + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlanex16(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlanex16_b32_undef_tid: +; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}} +define amdgpu_kernel void @v_permlanex16_b32_undef_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 { + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlanex16(i32 undef, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlanex16_b32_i_tid: +; GFX10: v_mov_b32_e32 [[OLD:v[0-9]+]], 0x3039 +; GFX10: v_permlanex16_b32 [[OLD]], v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}} +define amdgpu_kernel void @v_permlanex16_b32_i_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 { + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlanex16_b32_i_tid_fi: +; GFX10-NOT: 0x3039 +; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,0]{{$}} +define amdgpu_kernel void 
@v_permlanex16_b32_i_tid_fi(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 { + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 1, i1 0) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlanex16_b32_i_tid_bc: +; GFX10-NOT: 0x3039 +; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1]{{$}} +define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 { + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 1) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_permlanex16_b32_i_tid_fi_bc: +; GFX10-NOT: 0x3039 +; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,1]{{$}} +define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 { + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 1, i1 1) + store i32 %v, i32 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind readnone convergent } +attributes #1 = { nounwind } Index: test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir @@ -0,0 +1,145 @@ +# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-skips,post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s + +# GCN-LABEL: name: hazard_vcmpx_permlane16 +# GCN: V_CMPX_LE_F32_nosdst_e32 +# GCN: S_ADD_U32 +# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec +# GCN-NEXT: V_PERMLANE16_B32 +--- +name: hazard_vcmpx_permlane16 +body: | + bb.0: + successors: %bb.1 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + 
SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec + S_BRANCH %bb.1 + + bb.1: + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $sgpr1 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + $vgpr1 = V_PERMLANE16_B32 0, killed $vgpr1, 0, killed $sgpr1, 0, killed $sgpr0, $vgpr1, 0, implicit $exec + S_ENDPGM 0 +... + +# GCN-LABEL: name: hazard_vcmpx_permlanex16 +# GCN: V_CMPX_LE_F32_nosdst_e32 +# GCN: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec +# GCN-NEXT: V_PERMLANEX16_B32 +--- +name: hazard_vcmpx_permlanex16 +body: | + bb.0: + successors: %bb.1 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec + S_BRANCH %bb.1 + + bb.1: + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr1 = V_PERMLANEX16_B32 0, killed $vgpr1, 0, killed $sgpr1, 0, killed $sgpr0, $vgpr1, 0, implicit $exec + S_ENDPGM 0 +... + +# GCN-LABEL: name: hazard_vcmpx_permlane16_v_nop +# GCN: V_CMPX_LE_F32_nosdst_e32 +# GCN: V_NOP +# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec +# GCN-NEXT: V_PERMLANE16_B32 +--- +name: hazard_vcmpx_permlane16_v_nop +body: | + bb.0: + successors: %bb.1 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec + S_BRANCH %bb.1 + + bb.1: + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + V_NOP_e32 implicit $exec + $vgpr1 = V_PERMLANE16_B32 0, killed $vgpr1, 0, killed $sgpr1, 0, killed $sgpr0, $vgpr1, 0, implicit $exec + S_ENDPGM 0 +... 
+ +# GCN-LABEL: name: hazard_vcmpx_permlane16_far +# GCN: V_CMPX_LE_F32_nosdst_e32 +# GCN: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec +# GCN-NEXT: V_PERMLANE16_B32 +--- +name: hazard_vcmpx_permlane16_far +body: | + bb.0: + successors: %bb.1 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec + S_BRANCH %bb.1 + + bb.1: + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + V_NOP_e32 implicit $exec + V_NOP_e32 implicit $exec + V_NOP_e32 implicit $exec + V_NOP_e32 implicit $exec + V_NOP_e32 implicit $exec + V_NOP_e32 implicit $exec + V_NOP_e32 implicit $exec + V_NOP_e32 implicit $exec + V_NOP_e32 implicit $exec + $vgpr1 = V_PERMLANE16_B32 0, killed $vgpr1, 0, killed $sgpr1, 0, killed $sgpr0, $vgpr1, 0, implicit $exec + S_ENDPGM 0 +... + +# GCN-LABEL: name: hazard_vcmpx_permlane16_no_hazard +# GCN: V_CMPX_LE_F32_nosdst_e32 +# GCN: V_ADD_F32 +# GCN-NEXT: V_PERMLANE16_B32 +--- +name: hazard_vcmpx_permlane16_no_hazard +body: | + bb.0: + successors: %bb.1 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec + S_BRANCH %bb.1 + + bb.1: + $vgpr1 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $exec + $vgpr1 = V_PERMLANE16_B32 0, killed $vgpr1, 0, killed $sgpr1, 0, killed $sgpr0, $vgpr1, 0, implicit $exec + S_ENDPGM 0 +... 
+ +# GCN-LABEL: name: hazard_vcmpx_permlane16_undef_src +# GCN: V_CMPX_LE_F32_nosdst_e32 +# GCN: S_ADD_U32 +# GCN-NEXT: dead $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec +# GCN-NEXT: V_PERMLANE16_B32 +--- +name: hazard_vcmpx_permlane16_undef_src +body: | + bb.0: + successors: %bb.1 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec + S_BRANCH %bb.1 + + bb.1: + $vgpr2 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $sgpr1 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + $vgpr1 = V_PERMLANE16_B32 0, undef $vgpr1, 0, killed $sgpr1, 0, killed $sgpr0, undef $vgpr1, 0, implicit $exec + S_ENDPGM 0 +...