Index: llvm/trunk/lib/Target/AMDGPU/AMDGPU.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPU.td +++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.td @@ -238,6 +238,36 @@ "Support SDWA (Sub-DWORD Addressing) extension" >; +def FeatureSDWAOmod : SubtargetFeature<"sdwa-omod", + "HasSDWAOmod", + "true", + "Support OMod with SDWA (Sub-DWORD Addressing) extension" +>; + +def FeatureSDWAScalar : SubtargetFeature<"sdwa-scalar", + "HasSDWAScalar", + "true", + "Support scalar register with SDWA (Sub-DWORD Addressing) extension" +>; + +def FeatureSDWASdst : SubtargetFeature<"sdwa-sdst", + "HasSDWASdst", + "true", + "Support scalar dst for VOPC with SDWA (Sub-DWORD Addressing) extension" +>; + +def FeatureSDWAMac : SubtargetFeature<"sdwa-mav", + "HasSDWAMac", + "true", + "Support v_mac_f32/f16 with SDWA (Sub-DWORD Addressing) extension" +>; + +def FeatureSDWAClampVOPC : SubtargetFeature<"sdwa-clamp-vopc", + "HasSDWAClampVOPC", + "true", + "Support clamp for VOPC with SDWA (Sub-DWORD Addressing) extension" +>; + def FeatureDPP : SubtargetFeature<"dpp", "HasDPP", "true", @@ -421,8 +451,8 @@ FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel, - FeatureScalarStores, FeatureInv2PiInlineImm, FeatureSDWA, - FeatureDPP + FeatureScalarStores, FeatureInv2PiInlineImm, + FeatureSDWA, FeatureSDWAClampVOPC, FeatureSDWAMac, FeatureDPP ] >; @@ -432,7 +462,8 @@ FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, - FeatureFastFMAF32, FeatureSDWA, FeatureDPP, + FeatureFastFMAF32, FeatureDPP, + FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts ] >; Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -149,6 +149,11 @@ bool HasScalarStores; bool HasInv2PiInlineImm; bool HasSDWA; + bool HasSDWAOmod; + bool HasSDWAScalar; + bool HasSDWASdst; + bool HasSDWAMac; + bool HasSDWAClampVOPC; bool HasDPP; bool FlatAddressSpace; bool FlatInstOffsets; @@ -431,6 +436,26 @@ return HasSDWA; } + bool hasSDWAOmod() const { + return HasSDWAOmod; + } + + bool hasSDWAScalar() const { + return HasSDWAScalar; + } + + bool hasSDWASdst() const { + return HasSDWASdst; + } + + bool hasSDWAMac() const { + return HasSDWAMac; + } + + bool hasSDWAClampVOPC() const { + return HasSDWAClampVOPC; + } + /// \brief Returns the offset in bytes from the start of the input buffer /// of the first explicit kernel argument. unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const { Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -124,6 +124,11 @@ HasScalarStores(false), HasInv2PiInlineImm(false), HasSDWA(false), + HasSDWAOmod(false), + HasSDWAScalar(false), + HasSDWASdst(false), + HasSDWAMac(false), + HasSDWAClampVOPC(false), HasDPP(false), FlatAddressSpace(false), FlatInstOffsets(false), Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2454,7 +2454,7 @@ continue; const MachineOperand &MO = MI.getOperand(OpIdx); - if (AMDGPU::isVI(ST)) { + if (!ST.hasSDWAScalar()) { // Only VGPRS on VI if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) { ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI"; @@ -2469,7 +2469,7 @@ } } - if (AMDGPU::isVI(ST)) { + if (!ST.hasSDWAOmod()) { // No omod allowed on VI const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); if (OMod != nullptr && @@ -2481,14 +2481,14 @@ uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); if (isVOPC(BasicOpcode)) { - if (AMDGPU::isVI(ST) && DstIdx != -1) { + if (!ST.hasSDWASdst() && DstIdx != -1) { // Only vcc allowed as dst on VI for VOPC const MachineOperand &Dst = MI.getOperand(DstIdx); if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) { ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; return false; } - } else if (AMDGPU::isGFX9(ST)) { + } else if (!ST.hasSDWAClampVOPC()) { // No clamp allowed on GFX9 for VOPC const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); if (Clamp != nullptr && Index: llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -67,9 +67,9 @@ bool runOnMachineFunction(MachineFunction &MF) override; void matchSDWAOperands(MachineFunction &MF); - bool isConvertibleToSDWA(const MachineInstr &MI) const; + bool isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const; bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); - void legalizeScalarOperands(MachineInstr &MI) const; + void legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const; StringRef getPassName() const override { return "SI Peephole SDWA"; } @@ -607,24 +607,38 @@ } } -bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI) const { +bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI, + const SISubtarget &ST) const { // Check if this instruction has opcode that supports SDWA - unsigned Opc = MI.getOpcode(); - if (AMDGPU::getSDWAOp(Opc) != -1) - return true; - int Opc32 = AMDGPU::getVOPe32(Opc); - if (Opc32 != -1 && AMDGPU::getSDWAOp(Opc32) != -1) { - if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) - return false; + int Opc = MI.getOpcode(); + if (AMDGPU::getSDWAOp(Opc) == -1) + Opc = AMDGPU::getVOPe32(Opc); + + if (Opc == -1 || AMDGPU::getSDWAOp(Opc) == -1) + return false; + + if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) + return false; - if (TII->isVOPC(Opc)) { + if (TII->isVOPC(Opc)) { + if (!ST.hasSDWASdst()) { const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); - return SDst && SDst->getReg() == AMDGPU::VCC; - } else { - return !TII->getNamedOperand(MI, AMDGPU::OpName::sdst); + if (SDst && SDst->getReg() != AMDGPU::VCC) + return false; } + + if (!ST.hasSDWAClampVOPC() && TII->hasModifiersSet(MI, AMDGPU::OpName::clamp)) + return false; + + } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) { + return false; } - return false; + + if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 || + Opc == AMDGPU::V_MAC_F32_e32)) + return false; + + return true; } bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, @@ -690,13 +704,23 @@ SDWAInst.add(*Src2); } - // Initialize clamp. - if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1) + // Copy clamp if present, initialize otherwise + assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1); + MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp); + if (Clamp) { + SDWAInst.add(*Clamp); + } else { SDWAInst.addImm(0); + } - // Initialize omod. - if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) + // Copy omod if present, initialize otherwise if needed + MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod); + if (OMod) { + assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1); + SDWAInst.add(*OMod); + } else if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) { SDWAInst.addImm(0); + } // Initialize dst_sel and dst_unused if present if (Dst) { @@ -750,16 +774,25 @@ } // If an instruction was converted to SDWA it should not have immediates or SGPR -// operands. Copy its scalar operands into VGPRs. -void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI) const { +// operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs. +void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const { const MCInstrDesc &Desc = TII->get(MI.getOpcode()); - for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) { - MachineOperand &Op = MI.getOperand(I); + unsigned ConstantBusCount = 0; + for (MachineOperand &Op: MI.explicit_uses()) { if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg()))) continue; + + unsigned I = MI.getOperandNo(&Op); if (Desc.OpInfo[I].RegClass == -1 || !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass))) continue; + + if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && + TRI->isSGPRReg(*MRI, Op.getReg())) { + ++ConstantBusCount; + continue; + } + unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), VGPR); @@ -775,10 +808,8 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { const SISubtarget &ST = MF.getSubtarget(); - if (!ST.hasSDWA() || - !AMDGPU::isVI(ST)) { // TODO: Add support for SDWA on gfx9 + if (!ST.hasSDWA()) return false; - } MRI = &MF.getRegInfo(); TRI = ST.getRegisterInfo(); @@ -790,7 +821,7 @@ for (const auto &OperandPair : SDWAOperands) { const auto &Operand = OperandPair.second; MachineInstr *PotentialMI = Operand->potentialToConvert(TII); - if (PotentialMI && isConvertibleToSDWA(*PotentialMI)) { + if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) { PotentialMatches[PotentialMI].push_back(Operand.get()); } } @@ -805,7 +836,7 @@ bool Ret = !ConvertedInstructions.empty(); while (!ConvertedInstructions.empty()) - legalizeScalarOperands(*ConvertedInstructions.pop_back_val()); + legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST); return Ret; } Index: llvm/trunk/lib/Target/AMDGPU/VOPInstructions.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/VOPInstructions.td +++ llvm/trunk/lib/Target/AMDGPU/VOPInstructions.td @@ -401,10 +401,6 @@ let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; - // string Mnemonic = ps.Mnemonic; - // string AsmOperands = ps.AsmOperands; - // string AsmOperands9 = ps.AsmOperands9; - // Copy relevant pseudo op flags let SubtargetPredicate = ps.SubtargetPredicate; let AssemblerPredicate = ps.AssemblerPredicate; Index: llvm/trunk/test/CodeGen/AMDGPU/fneg.f16.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/fneg.f16.ll +++ llvm/trunk/test/CodeGen/AMDGPU/fneg.f16.ll @@ -134,11 +134,10 @@ ; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}} ; CI-DAG: v_sub_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} -; GFX9: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VAL]] ; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]] -; GFX9-DAG: v_sub_f16_e32 v{{[0-9]+}}, 2.0, [[ELT1]] -; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000 -; VI-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX89-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000 +; GFX89-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 + define amdgpu_kernel void @v_extract_fneg_fold_v2f16(<2 x half> addrspace(1)* %in) #0 { %val = load <2 x half>, <2 x half> addrspace(1)* %in %fneg = fsub <2 x half> , %val Index: llvm/trunk/test/CodeGen/AMDGPU/fpext.f16.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/fpext.f16.ll +++ llvm/trunk/test/CodeGen/AMDGPU/fpext.f16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI -check-prefix=SIGFX9 %s +; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=SIGFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s ; GCN-LABEL: {{^}}fpext_f16_to_f32 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] @@ -35,11 +35,10 @@ ; GCN-LABEL: {{^}}fpext_v2f16_to_v2f32 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GFX9-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; GCN-DAG: v_cvt_f32_f16_e32 v[[R_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SIGFX9: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]] -; VI: v_cvt_f32_f16_sdwa v[[R_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SI: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]] +; GFX89: v_cvt_f32_f16_sdwa v[[R_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_F32_0]]:[[R_F32_1]]{{\]}} ; GCN: s_endpgm @@ -55,9 +54,9 @@ ; GCN-LABEL: {{^}}fpext_v2f16_to_v2f64 ; GCN: buffer_load_dword -; SIGFX9-DAG: v_lshrrev_b32_e32 -; SIGFX9-DAG: v_cvt_f32_f16_e32 -; VI: v_cvt_f32_f16_sdwa +; SI-DAG: v_lshrrev_b32_e32 +; SI-DAG: v_cvt_f32_f16_e32 +; GFX89: v_cvt_f32_f16_sdwa ; GCN: v_cvt_f32_f16_e32 ; GCN: v_cvt_f64_f32_e32 Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.rint.f16.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.rint.f16.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.rint.f16.ll @@ -42,13 +42,13 @@ ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] ; GFX9: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] -; GFX9: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GFX9: v_rndne_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]] +; GFX9: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] ; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm + define amdgpu_kernel void @rint_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { Index: llvm/trunk/test/CodeGen/AMDGPU/sdwa-gfx9.mir =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/sdwa-gfx9.mir +++ llvm/trunk/test/CodeGen/AMDGPU/sdwa-gfx9.mir @@ -0,0 +1,88 @@ +# RUN: llc -march=amdgcn -mcpu=kaveri -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=CI -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=VI -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s + +# GCN-LABEL: {{^}}name: add_shr_i32 +# GCN: [[SMOV:%[0-9]+]] = S_MOV_B32 123 + +# CI: [[SHIFT:%[0-9]+]] = V_LSHRREV_B32_e64 16, %{{[0-9]+}}, implicit %exec +# CI: %{{[0-9]+}} = V_ADD_I32_e32 [[SMOV]], killed [[SHIFT]], implicit-def %vcc, implicit %exec + +# VI: [[VMOV:%[0-9]+]] = V_MOV_B32_e32 [[SMOV]], implicit %exec +# VI: %{{[0-9]+}} = V_ADD_I32_sdwa 0, [[VMOV]], 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit-def %vcc, implicit %exec + +# GFX9: %{{[0-9]+}} = V_ADD_I32_sdwa 0, [[SMOV]], 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit-def %vcc, implicit %exec + +--- +name: add_shr_i32 +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vreg_64 } + - { id: 2, class: sreg_64 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32 } + - { id: 9, class: vgpr_32 } + - { id: 10, class: vgpr_32 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: sreg_32_xm0 } +body: | + bb.0: + liveins: %vgpr0_vgpr1, %vgpr2_vgpr3, %sgpr30_sgpr31 + + %2 = COPY %sgpr30_sgpr31 + %1 = COPY %vgpr2_vgpr3 + %0 = COPY %vgpr0_vgpr1 + %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4) + %12 = S_MOV_B32 123 + %10 = V_LSHRREV_B32_e64 16, %3, implicit %exec + %11 = V_ADD_I32_e32 %12, killed %10, implicit-def %vcc, implicit %exec + FLAT_STORE_DWORD %0, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4) + %sgpr30_sgpr31 = COPY %2 + S_SETPC_B64_return %sgpr30_sgpr31 + +... + +# GCN-LABEL: {{^}}name: trunc_shr_f32 + +# CI: [[SHIFT:%[0-9]+]] = V_LSHRREV_B32_e64 16, %{{[0-9]+}}, implicit %exec +# CI: %{{[0-9]+}} = V_TRUNC_F32_e64 0, killed [[SHIFT]], 1, 2, implicit-def %vcc, implicit %exec + +# VI: [[SHIFT:%[0-9]+]] = V_LSHRREV_B32_e64 16, %{{[0-9]+}}, implicit %exec +# VI: %{{[0-9]+}} = V_TRUNC_F32_e64 0, killed [[SHIFT]], 1, 2, implicit-def %vcc, implicit %exec + +#GFX9: %{{[0-9]+}} = V_TRUNC_F32_sdwa 0, %{{[0-9]+}}, 1, 2, 6, 0, 5, implicit %exec + +--- +name: trunc_shr_f32 +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vreg_64 } + - { id: 2, class: sreg_64 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32 } + - { id: 9, class: vgpr_32 } + - { id: 10, class: vgpr_32 } + - { id: 11, class: vgpr_32 } +body: | + bb.0: + liveins: %vgpr0_vgpr1, %vgpr2_vgpr3, %sgpr30_sgpr31 + + %2 = COPY %sgpr30_sgpr31 + %1 = COPY %vgpr2_vgpr3 + %0 = COPY %vgpr0_vgpr1 + %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4) + %10 = V_LSHRREV_B32_e64 16, %3, implicit %exec + %11 = V_TRUNC_F32_e64 0, killed %10, 1, 2, implicit-def %vcc, implicit %exec + FLAT_STORE_DWORD %0, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4) + %sgpr30_sgpr31 = COPY %2 + S_SETPC_B64_return %sgpr30_sgpr31 Index: llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SDWA -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s ; GCN-LABEL: {{^}}add_shr_i32: ; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}} @@ -72,9 +73,11 @@ ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD + +; GFX9: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { entry: @@ -93,12 +96,15 @@ ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD + +; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) { entry: @@ -117,18 +123,23 @@ ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL6]], v[[DST_MUL7]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL4]], v[[DST_MUL5]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL6]], v[[DST_MUL7]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL4]], v[[DST_MUL5]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD + +; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) { entry: @@ -162,9 +173,12 @@ ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}} ; NOSDWA-NOT: v_mul_f16_sdwa -; SDWA-DAG: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} -; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]] +; VI-DAG: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]] + +; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + define amdgpu_kernel void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) { entry: %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4 @@ -182,10 +196,13 @@ ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_f16_sdwa -; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) { entry: @@ -204,14 +221,19 @@ ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_f16_sdwa -; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) { entry: @@ -245,7 +267,11 @@ ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; VI: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 + +; GFX9-DAG: v_mul_lo_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-DAG: v_mul_lo_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD define amdgpu_kernel void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) { entry: @@ -264,9 +290,13 @@ ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa + +; GFX9-DAG: v_mul_lo_u16_sdwa +; GFX9-DAG: v_mul_lo_u16_sdwa +; GFX9-DAG: v_mul_lo_u16_sdwa define amdgpu_kernel void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) { entry: @@ -285,12 +315,19 @@ ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa + +; GFX9-DAG: v_mul_lo_u16_sdwa +; GFX9-DAG: v_mul_lo_u16_sdwa +; GFX9-DAG: v_mul_lo_u16_sdwa +; GFX9-DAG: v_mul_lo_u16_sdwa +; GFX9-DAG: v_mul_lo_u16_sdwa +; GFX9-DAG: v_mul_lo_u16_sdwa define amdgpu_kernel void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) { entry: @@ -330,8 +367,11 @@ ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}} ; NOSDWA-NOT: v_mac_f16_sdwa -; SDWA: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]] +; VI: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]] + +; GFX9: v_pk_mul_f16 v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v[[SRC:[0-9]+]] +; GFX9: v_pk_add_f16 v{{[0-9]+}}, v[[DST_MUL]], v[[SRC]] define amdgpu_kernel void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) { entry: @@ -345,10 +385,13 @@ ; GCN-LABEL: {{^}}immediate_mul_v2i16: ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mov_b32_e32 v[[M321:[0-9]+]], 0x141 -; SDWA-DAG: v_mov_b32_e32 v[[M123:[0-9]+]], 0x7b -; SDWA-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M123]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; SDWA-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M321]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-DAG: v_mov_b32_e32 v[[M321:[0-9]+]], 0x141 +; VI-DAG: v_mov_b32_e32 v[[M123:[0-9]+]], 0x7b +; VI-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M123]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M321]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD + +; GFX9: s_mov_b32 s[[IMM:[0-9]+]], 0x141007b +; GFX9: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, s[[IMM]] define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { entry: @@ -367,7 +410,10 @@ ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD + +; GFX9: v_pk_mul_lo_u16 v[[DST1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} +; GFX9: v_pk_mul_lo_u16 v{{[0-9]+}}, v[[DST1]], v{{[0-9]+}} define amdgpu_kernel void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { entry: @@ -382,7 +428,9 @@ ; GCN-LABEL: {{^}}add_bb_v2i16: ; NOSDWA-NOT: v_add_i32_sdwa -; SDWA: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 + +; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @add_bb_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { entry: @@ -408,9 +456,11 @@ ; NOSDWA-NOT: v_and_b32_sdwa ; NOSDWA-NOT: v_or_b32_sdwa -; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} ; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD Index: llvm/trunk/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir +++ llvm/trunk/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir @@ -1,4 +1,5 @@ -# RUN: llc -march=amdgcn -mcpu=fiji -start-before si-peephole-sdwa -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=fiji -start-before si-peephole-sdwa -o - %s | FileCheck -check-prefix=VI -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -start-before si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s # GCN-LABEL: {{^}}sdwa_imm_operand: # GCN: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2 @@ -8,11 +9,17 @@ # GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 # GCN-LABEL: {{^}}sdwa_sgpr_operand: -# GCN: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2 -# GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2 -# GCN: BB1_1: -# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +# VI: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2 +# VI-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2 +# VI: BB1_1: +# VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +# VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 + +# GFX9: s_mov_b32 s[[SHIFT:[0-9]+]], 2 +# GFX9-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2 +# GFX9: BB1_1: +# GFX9: v_lshlrev_b32_sdwa v{{[0-9]+}}, s[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +# GFX9: v_lshlrev_b32_sdwa v{{[0-9]+}}, s[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 --- | ; ModuleID = 'sdwa-scalar-ops.opt.ll'