Index: lib/Target/AMDGPU/AMDGPU.h =================================================================== --- lib/Target/AMDGPU/AMDGPU.h +++ lib/Target/AMDGPU/AMDGPU.h @@ -97,6 +97,9 @@ void initializeSIFixSGPRCopiesPass(PassRegistry &); extern char &SIFixSGPRCopiesID; +void initializeSIFixupVectorISelPass(PassRegistry &); +extern char &SIFixupVectorISelID; + void initializeSIFixVGPRCopiesPass(PassRegistry &); extern char &SIFixVGPRCopiesID; Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -142,6 +142,7 @@ initializeR600VectorRegMergerPass(*PR); initializeSILowerI1CopiesPass(*PR); initializeSIFixSGPRCopiesPass(*PR); + initializeSIFixupVectorISelPass(*PR); initializeSIFixVGPRCopiesPass(*PR); initializeSIFoldOperandsPass(*PR); initializeSIPeepholeSDWAPass(*PR); @@ -753,6 +754,7 @@ AMDGPUPassConfig::addInstSelector(); addPass(createSILowerI1CopiesPass()); addPass(&SIFixSGPRCopiesID); + addPass(&SIFixupVectorISelID); return false; } Index: lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- lib/Target/AMDGPU/CMakeLists.txt +++ lib/Target/AMDGPU/CMakeLists.txt @@ -82,6 +82,7 @@ SIDebuggerInsertNops.cpp SIFixControlFlowLiveIntervals.cpp SIFixSGPRCopies.cpp + SIFixupVectorISel.cpp SIFixVGPRCopies.cpp SIFoldOperands.cpp SIFrameLowering.cpp Index: lib/Target/AMDGPU/SIFixupVectorISel.cpp =================================================================== --- /dev/null +++ lib/Target/AMDGPU/SIFixupVectorISel.cpp @@ -0,0 +1,148 @@ +//===-- SIFixupVectorISel.cpp ---------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Apply simple patterns that should be done in the DAG, but require knowing if +// we have a vector source or not. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-fixup-vector-isel" + +namespace { + +class SIFixupVectorISel : public MachineFunctionPass { +public: + static char ID; + + SIFixupVectorISel() : MachineFunctionPass(ID) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "SI Fixup Vector ISel"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace + +INITIALIZE_PASS_BEGIN(SIFixupVectorISel, DEBUG_TYPE, + "SI Fixup Vector ISel", false, false) +INITIALIZE_PASS_END(SIFixupVectorISel, DEBUG_TYPE, + "SI Fixup Vector ISel", false, false) + + +char SIFixupVectorISel::ID = 0; + +char &llvm::SIFixupVectorISelID = SIFixupVectorISel::ID; + +static bool isVALUShl16(const MachineRegisterInfo &MRI, + MachineInstr &MI, MachineOperand *&Op) { + unsigned Opc = MI.getOpcode(); + if (Opc != AMDGPU::V_LSHLREV_B32_e64 && + Opc != AMDGPU::V_LSHLREV_B32_e32 && + Opc != AMDGPU::V_LSHL_B32_e64 && + Opc != AMDGPU::V_LSHL_B32_e32) + return false; + + MachineOperand *LHS = &MI.getOperand(1); + MachineOperand *RHS = &MI.getOperand(2); + + if (Opc == AMDGPU::V_LSHLREV_B32_e64 || Opc == AMDGPU::V_LSHLREV_B32_e32) + std::swap(LHS, RHS); + + uint64_t ShiftAmt; + if (SIInstrInfo::isImmOrMaterializedImm(MRI, *RHS, ShiftAmt) && + ShiftAmt == 16) { + Op = LHS; + return true; + } + + if (SIInstrInfo::isImmOrMaterializedImm(MRI, *LHS, ShiftAmt) && + ShiftAmt == 16) { + Op = RHS; + return true; + } + + return false; +} + +bool SIFixupVectorISel::runOnMachineFunction(MachineFunction &MF) { + const SISubtarget &ST = MF.getSubtarget(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); + + for (MachineBasicBlock &MBB : MF) { + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ) { + MachineInstr &MI = *I; + ++I; + + switch (MI.getOpcode()) { + case AMDGPU::V_OR_B32_e64: { + // TODO: This is useful in some situations even with SDWA, but many + // tests need updates and the logic probably needs merging with the SDWA + // pass. + if (ST.hasSDWA()) + break; + + MachineOperand &RHS = MI.getOperand(2); + if (!RHS.isReg()) + break; + + MachineOperand &LHS = MI.getOperand(1); + MachineInstr *Def = MRI.getVRegDef(RHS.getReg()); + + MachineOperand *Op; + if (isVALUShl16(MRI, *Def, Op)) { + uint64_t KnownZero = 0, KnownOne = 0; + TII->computeKnownBits(MRI, LHS, KnownZero, KnownOne); + + bool LHSKnownHighZero = (KnownZero & 0xffff0000) == 0xffff0000; + + const DebugLoc &DL = MI.getDebugLoc(); + if (LHSKnownHighZero) { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CVT_PK_U16_U32_e64), + MI.getOperand(0).getReg()) + .add(LHS) + .add(*Op); + MI.eraseFromParent(); + MRI.clearKillFlags(Op->getReg()); + continue; + } + } + + + } + default: + break; + } + + } + } + + return true; +} Index: lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.h +++ lib/Target/AMDGPU/SIInstrInfo.h @@ -825,6 +825,10 @@ const DebugLoc &DL, unsigned DestReg) const; + static bool isImmOrMaterializedImm(const MachineRegisterInfo &MRI, + const MachineOperand &Op, + uint64_t &Imm); + void computeKnownBits(const MachineRegisterInfo &MRI, const MachineOperand &Op, uint64_t &KnownZero, uint64_t &KnownOne, unsigned Depth = 0) const; Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4370,9 +4370,9 @@ .addReg(UnusedCarry, RegState::Define | RegState::Dead); } -static bool isImmOrMaterializedImm(const MachineRegisterInfo &MRI, - const MachineOperand &Op, - uint64_t &Imm) { +bool SIInstrInfo::isImmOrMaterializedImm(const MachineRegisterInfo &MRI, + const MachineOperand &Op, + uint64_t &Imm) { if (Op.isImm()) { Imm = Op.getImm(); return true; @@ -4404,7 +4404,7 @@ uint64_t &KnownOne, unsigned Depth) { uint64_t ShiftAmt; - if (!isImmOrMaterializedImm(MRI, RHS, ShiftAmt)) + if (!SIInstrInfo::isImmOrMaterializedImm(MRI, RHS, ShiftAmt)) return; switch (Opcode) { @@ -4462,7 +4462,7 @@ return; unsigned Reg = Op.getReg(); - const MachineInstr *Def = MRI.getUniqueVRegDef(Reg); + const MachineInstr *Def = MRI.getVRegDef(Reg); if (!Def) return; @@ -4529,6 +4529,10 @@ return; } + case AMDGPU::V_CVT_F16_F32_e64: + case AMDGPU::V_CVT_F16_F32_e32: + case AMDGPU::V_ADD_U16_e64: + case AMDGPU::V_ADD_U16_e32: case AMDGPU::FLAT_LOAD_USHORT: case AMDGPU::BUFFER_LOAD_USHORT_ADDR64: case AMDGPU::BUFFER_LOAD_USHORT_OFFEN: @@ -4555,6 +4559,6 @@ } return; - } + } } } Index: test/CodeGen/AMDGPU/ashr.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/ashr.v2i16.ll +++ test/CodeGen/AMDGPU/ashr.v2i16.ll @@ -14,8 +14,7 @@ ; CI: v_ashrrev_i32_e32 ; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} ; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; CI: v_or_b32_e32 +; CI: v_cvt_pk_u16_u32 define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = ashr <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out @@ -38,9 +37,8 @@ ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} -; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI: v_cvt_pk_u16_u32 define amdgpu_kernel void @v_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 Index: test/CodeGen/AMDGPU/cvt_f32_ubyte.ll =================================================================== --- test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone @@ -12,7 +12,7 @@ ; GCN: buffer_store_dword [[CONV]], define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid + %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid %load = load i8, i8 addrspace(1)* %gep, align 1 %cvt = uitofp i8 %load to float store float %cvt, float addrspace(1)* %out, align 4 @@ -26,7 +26,7 @@ ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid + %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2 %cvt = uitofp <2 x i8> %load to <2 x float> store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16 @@ -42,7 +42,7 @@ ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid + %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4 %cvt = uitofp <3 x i8> %load to <3 x float> store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16 @@ -60,7 +60,7 @@ ; GCN: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid + %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4 %cvt = uitofp <4 x i8> %load to <4 x float> store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 @@ -86,7 +86,7 @@ ; GCN: buffer_store_dwordx4 define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid + %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1 %cvt = uitofp <4 x i8> %load to <4 x float> store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 @@ -105,11 +105,11 @@ ; GCN-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 24 -; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16 ; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 8 ; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, ; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff00, ; SI-DAG: v_add_i32 +; SI: v_cvt_pk_u16_u32_e32 ; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffffff00, ; VI-DAG: v_add_u16_e32 @@ -135,7 +135,7 @@ ; GCN: s_endpgm define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid + %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1 %cvt = uitofp <7 x i8> %load to <7 x float> store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16 @@ -160,7 +160,7 @@ ; GCN: buffer_store_dwordx4 define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid + %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8 %cvt = uitofp <8 x i8> %load to <8 x float> store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16 @@ -174,7 +174,7 @@ ; GCN: buffer_store_dword [[CONV]], define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %load = load i32, i32 addrspace(1)* %gep, align 4 %add = add i32 %load, 2 %inreg = and i32 %add, 255 @@ -186,7 +186,7 @@ ; GCN-LABEL: {{^}}i8_zext_inreg_hi1_to_f32: define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %load = load i32, i32 addrspace(1)* %gep, align 4 %inreg = and i32 %load, 65280 %shr = lshr i32 %inreg, 8 @@ -200,7 +200,7 @@ ; GCN-LABEL: {{^}}i8_zext_i32_to_f32: define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid + %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid %load = load i8, i8 addrspace(1)* %gep, align 1 %ext = zext i8 %load to i32 %cvt = uitofp i32 %ext to float @@ -211,7 +211,7 @@ ; GCN-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32: define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid + %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1 %ext = zext <4 x i8> %load to <4 x i32> %cvt = uitofp <4 x i32> %ext to <4 x float> @@ -226,7 +226,7 @@ ; GCN: buffer_store_dword [[CONV]] define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %val = load i32, i32 addrspace(1)* %gep %and = and i32 %val, 255 %cvt = uitofp i32 %and to float @@ -241,7 +241,7 @@ ; GCN: buffer_store_dword [[CONV]] define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %val = load i32, i32 addrspace(1)* %gep %srl = lshr i32 %val, 8 %and = and i32 %srl, 255 @@ -257,7 +257,7 @@ ; GCN: buffer_store_dword [[CONV]] define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %val = load i32, i32 addrspace(1)* %gep %srl = lshr i32 %val, 16 %and = and i32 %srl, 255 @@ -273,7 +273,7 @@ ; GCN: buffer_store_dword [[CONV]] define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %val = load i32, i32 addrspace(1)* %gep %srl = lshr i32 %val, 24 %and = and i32 %srl, 255 Index: test/CodeGen/AMDGPU/fabs.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fabs.f16.ll +++ test/CodeGen/AMDGPU/fabs.f16.ll @@ -32,9 +32,9 @@ ; GCN-LABEL: {{^}}s_fabs_v2f16: ; CI: s_movk_i32 [[MASK:s[0-9]+]], 0x7fff ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]] -; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]] -; CI: v_or_b32_e32 +; CI-NOT: v_or_b32 +; CI: v_cvt_pk_u16_u32_e32 ; VI: flat_load_ushort [[HI:v[0-9]+]] ; VI: flat_load_ushort [[LO:v[0-9]+]] Index: test/CodeGen/AMDGPU/fadd.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fadd.f16.ll +++ test/CodeGen/AMDGPU/fadd.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}fadd_f16 ; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] @@ -63,19 +63,20 @@ ; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]] ; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] -; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] +; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] + ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SI-NOT: v_or_b32 +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; VI-DAG: v_add_f16_e32 v[[R_F16_LO:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] ; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -107,8 +108,8 @@ ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SI-NOT: v_or_b32 +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; VI-DAG: v_mov_b32_e32 v[[CONST2:[0-9]+]], 0x4000 ; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -138,8 +139,8 @@ ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SI-NOT: v_or_b32 +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00 ; VI-DAG: v_add_f16_sdwa v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD Index: test/CodeGen/AMDGPU/fmul.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fmul.f16.ll +++ test/CodeGen/AMDGPU/fmul.f16.ll @@ -60,7 +60,7 @@ ret void } -; GCN-LABEL: {{^}}fmul_v2f16: +; GCN-LABEL: {{^}}v_fmul_v2f16: ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] @@ -74,8 +74,8 @@ ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SI-NOT: v_or_b32 +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; VI-DAG: v_mul_f16_e32 v[[R_F16_LO:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -83,7 +83,7 @@ ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define amdgpu_kernel void @fmul_v2f16( +define amdgpu_kernel void @v_fmul_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -104,11 +104,15 @@ ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI-DAG: v_mov_b32_e32 v[[CONST4:[0-9]+]], 0x4400 -; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SI-NOT: v_or_b32 +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] + +; VI-DAG: v_mov_b32_e32 v[[CONST4:[0-9]+]], 0x4400 +; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] +; VI-NOT: v_lshl +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fmul_v2f16_imm_a( @@ -130,11 +134,14 @@ ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI-DAG: v_mov_b32_e32 v[[CONST3:[0-9]+]], 0x4200 +; SI-NOT: v_or_b32 +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] + +; VI-DAG: v_mov_b32_e32 v[[CONST3:[0-9]+]], 0x4200 ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fmul_v2f16_imm_b( @@ -146,3 +153,8 @@ store <2 x half> %r.val, <2 x half> addrspace(1)* %r ret void } + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone speculatable } Index: test/CodeGen/AMDGPU/fptosi.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fptosi.f16.ll +++ test/CodeGen/AMDGPU/fptosi.f16.ll @@ -59,8 +59,8 @@ ; SI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]] ; SI: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]] ; SI: v_and_b32_e32 v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]] -; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]] -; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_LO]], v[[R_I16_HI]] +; SI-NOT: v_or_b32 +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_LO]], v[[R_I16_1]] ; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 Index: test/CodeGen/AMDGPU/fptrunc.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fptrunc.f16.ll +++ test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -37,8 +37,8 @@ ; GCN: buffer_load_dwordx2 v{{\[}}[[A_F32_0:[0-9]+]]:[[A_F32_1:[0-9]+]]{{\]}} ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SI-NOT: v_or_b32 +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] @@ -65,10 +65,10 @@ ; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_0:[0-9]+]], v{{\[}}[[A_F64_0]]:{{[0-9]+}}{{\]}} ; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_1:[0-9]+]], v{{\[}}{{[0-9]+}}:[[A_F64_3]]{{\]}} ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]] +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; VI: v_cvt_f16_f32_sdwa v[[R_F16_HI:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD - -; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] ; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] Index: test/CodeGen/AMDGPU/fsub.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fsub.f16.ll +++ test/CodeGen/AMDGPU/fsub.f16.ll @@ -74,8 +74,8 @@ ; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SI-NOT: v_or_b32 +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] ; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -108,8 +108,9 @@ ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SI-NOT: v_lshl +; SI-NOT: v_or +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000 ; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST2]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -142,8 +143,9 @@ ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SI-NOT: v_lshl +; SI-NOT: v_or_b32 +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00 ; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD Index: test/CodeGen/AMDGPU/half.ll =================================================================== --- test/CodeGen/AMDGPU/half.ll +++ test/CodeGen/AMDGPU/half.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; half args should be promoted to float for SI and lower. @@ -16,8 +16,13 @@ ; GCN-LABEL: {{^}}load_v2f16_arg: ; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 ; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46 -; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]] -; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]] +; SI-NOT: v_lshl +; SI-NOT: v_or +; SI: v_cvt_pk_u16_u32_e32 [[PACKED:v[0-9]+]], [[V0]], [[V1]] + +; VI: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]] +; VI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]] + ; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN: s_endpgm define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { @@ -470,8 +475,9 @@ ; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]] ; SI-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]] -; SI-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]] -; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[SHL]] +; SI-NOT: v_lshl +; SI-NOT: v_or +; SI: v_cvt_pk_u16_u32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[CVT1]] ; VI-DAG: v_cvt_f16_f32_sdwa [[CVT1:v[0-9]+]], v[[HI]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[CVT1]] Index: test/CodeGen/AMDGPU/llvm.ceil.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.ceil.f16.ll +++ test/CodeGen/AMDGPU/llvm.ceil.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s declare half @llvm.ceil.f16(half %a) declare <2 x half> @llvm.ceil.v2f16(<2 x half> %a) @@ -31,9 +31,10 @@ ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_ceil_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] + +; SI-NOT: v_lshl ; SI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; VI-DAG: v_ceil_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_ceil_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 Index: test/CodeGen/AMDGPU/llvm.cos.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s declare half @llvm.cos.f16(half %a) declare <2 x half> @llvm.cos.v2f16(<2 x half> %a) @@ -25,12 +25,12 @@ ; GCN-LABEL: {{^}}cos_v2f16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI-DAG: v_mov_b32_e32 v[[HALF_PIE:[0-9]+]], 0x3e22f983{{$}} +; SI-DAG: v_mov_b32_e32 v[[HALF_PI:[0-9]+]], 0x3e22f983{{$}} ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PIE]] -; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PIE]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PI]] +; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PI]] ; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 @@ -47,8 +47,8 @@ ; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[R_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GCN-NOT: and -; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] + ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm Index: test/CodeGen/AMDGPU/llvm.exp2.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.exp2.f16.ll +++ test/CodeGen/AMDGPU/llvm.exp2.f16.ll @@ -31,9 +31,9 @@ ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_exp_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: v_lshl ; SI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; VI-DAG: v_exp_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_exp_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 Index: test/CodeGen/AMDGPU/llvm.floor.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.floor.f16.ll +++ test/CodeGen/AMDGPU/llvm.floor.f16.ll @@ -31,9 +31,10 @@ ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_floor_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: v_lshl ; SI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SI-NOT: or +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; VI-DAG: v_floor_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_floor_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 Index: test/CodeGen/AMDGPU/llvm.fma.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.fma.f16.ll +++ test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s declare half @llvm.fma.f16(half %a, half %b, half %c) declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) @@ -119,16 +119,19 @@ ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-NOT: v_lshl +; SI-NOT: and +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; VI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] ; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]] ; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]] +; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; VI-NOT: and +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] -; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fma_v2f16( @@ -161,13 +164,16 @@ ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-NOT: v_lshl +; SI-NOT: and +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], v[[A_F16]], v[[B_F16_1]] ; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[C_V2_F16]], v[[A_F16]], v[[B_V2_F16]] +; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; VI-NOT: and +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] -; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fma_v2f16_imm_a( @@ -202,15 +208,18 @@ ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32]], v[[C_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-NOT: v_lshl +; SI-NOT: and +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] ; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_F16]], v[[C_V2_F16]] ; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16]], v[[C_F16_1]] +; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; VI-NOT: and +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] -; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fma_v2f16_imm_b( @@ -245,15 +254,18 @@ ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: v_lsh +; SI-NOT: v_and +; SI-NOT: v_or +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]] ; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]] +; VI-NOT: and +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] -; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fma_v2f16_imm_c( Index: test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -1,7 +1,7 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s declare half @llvm.fmuladd.f16(half %a, half %b, half %c) declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) @@ -70,8 +70,8 @@ ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] -; SI: v_mac_f32_e32 v[[C_F32]], 0x40400000, v[[B_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]] +; SI-DAG: v_mac_f32_e32 v[[C_F32]], 0x40400000, v[[A_F32]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]] ; SI: buffer_store_short v[[R_F16]] ; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], 0x4200, v[[A_F16]] @@ -87,8 +87,8 @@ half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %c) { - %a.val = load half, half addrspace(1)* %a - %c.val = load half, half addrspace(1)* %c + %a.val = load volatile half, half addrspace(1)* %a + %c.val = load volatile half, half addrspace(1)* %c %r.val = call half @llvm.fmuladd.f16(half %a.val, half 3.0, half %c.val) store half %r.val, half addrspace(1)* %r ret void @@ -111,11 +111,12 @@ ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] ; SI: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]] -; SI: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]] -; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]] +; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]] +; SI-NOT: v_or +; SI-NOT: v_lshl +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_1]] ; VI-FLUSH: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; VI-FLUSH-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[B_V2_F16]], v[[C_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 Index: test/CodeGen/AMDGPU/llvm.log2.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.log2.f16.ll +++ test/CodeGen/AMDGPU/llvm.log2.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s declare half @llvm.log2.f16(half %a) declare <2 x half> @llvm.log2.v2f16(<2 x half> %a) @@ -24,16 +24,17 @@ ; GCN-LABEL: {{^}}log2_v2f16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI-DAG: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] ; SI-DAG: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: v_lshl ; SI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SI-NOT: or +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; VI-DAG: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 Index: test/CodeGen/AMDGPU/llvm.maxnum.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s declare half @llvm.maxnum.f16(half %a, half %b) declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) @@ -77,9 +77,9 @@ ; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: v_lshl ; SI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] ; VI-DAG: v_max_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -109,13 +109,17 @@ ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-NOT: v_lshl +; SI-NOT: and +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] + ; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400 ; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] +; VI-NOT: v_lshl +; VI-NOT: and +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @maxnum_v2f16_imm_a( @@ -137,13 +141,17 @@ ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200 -; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] +; SI-NOT: v_lshl +; SI-NOT: and +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] + +; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200 +; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] +; VI-NOT: v_lshl +; VI-NOT: and +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @maxnum_v2f16_imm_b( Index: test/CodeGen/AMDGPU/llvm.minnum.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s declare half @llvm.minnum.f16(half %a, half %b) declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) @@ -76,9 +76,9 @@ ; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: v_lshl ; SI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] ; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -109,14 +109,16 @@ ; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-NOT: v_lshl +; SI-NOT: and +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400 ; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] +; VI-NOT: and +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @minnum_v2f16_imm_a( @@ -138,13 +140,17 @@ ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-NOT: v_lshl +; SI-NOT: v_and +; SI-NOT: v_or +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] + ; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200 -; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] +; VI-NOT: and +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @minnum_v2f16_imm_b( Index: test/CodeGen/AMDGPU/llvm.rint.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.rint.f16.ll +++ test/CodeGen/AMDGPU/llvm.rint.f16.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=GFX89 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,VI,GFX89 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s declare half @llvm.rint.f16(half %a) declare <2 x half> @llvm.rint.v2f16(<2 x half> %a) @@ -32,9 +32,10 @@ ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_rndne_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: v_lshl ; SI-NOT: v_and_b32 -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SI-NOT: v_or_b32 +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; VI-DAG: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 Index: test/CodeGen/AMDGPU/llvm.sin.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s declare half @llvm.sin.f16(half %a) declare <2 x half> @llvm.sin.v2f16(<2 x half> %a) @@ -46,8 +46,7 @@ ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[R_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] Index: test/CodeGen/AMDGPU/llvm.sqrt.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.sqrt.f16.ll +++ test/CodeGen/AMDGPU/llvm.sqrt.f16.ll @@ -1,10 +1,10 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s declare half @llvm.sqrt.f16(half %a) declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a) -; GCN-LABEL: {{^}}sqrt_f16 +; GCN-LABEL: {{^}}sqrt_f16: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_sqrt_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]] @@ -22,7 +22,7 @@ ret void } -; GCN-LABEL: {{^}}sqrt_v2f16 +; GCN-LABEL: {{^}}sqrt_v2f16: ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] @@ -31,9 +31,9 @@ ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_sqrt_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: v_lshl ; SI-NOT: v_and_b32 -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; VI-DAG: v_sqrt_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_sqrt_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 Index: test/CodeGen/AMDGPU/llvm.trunc.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.trunc.f16.ll +++ test/CodeGen/AMDGPU/llvm.trunc.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s declare half @llvm.trunc.f16(half %a) declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a) @@ -31,9 +31,7 @@ ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_trunc_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI-NOT: v_and_b32 -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; VI-DAG: v_trunc_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_trunc_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 Index: test/CodeGen/AMDGPU/load-weird-sizes.ll =================================================================== --- test/CodeGen/AMDGPU/load-weird-sizes.ll +++ test/CodeGen/AMDGPU/load-weird-sizes.ll @@ -1,13 +1,19 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI-NOHSA,SI,GCN,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI-HSA,SI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=SI-NOHSA,VI,GCN,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=R600,EG,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefixes=R600,VM,FUNC %s +; FIXME: use cvt_pk on VI ; FUNC-LABEL: {{^}}load_i24: -; SI: {{flat|buffer}}_load_ubyte -; SI: {{flat|buffer}}_load_ushort -; SI: {{flat|buffer}}_store_dword +; GCN-DAG: {{flat|buffer}}_load_ubyte +; GCN-DAG: {{flat|buffer}}_load_ushort +; SI: v_cvt_pk_u16_u32_e32 + +; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16 +; VI: v_or_b32_e32 + +; GCN: {{flat|buffer}}_store_dword define amdgpu_kernel void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 { %1 = load i24, i24 addrspace(1)* %in %2 = zext i24 %1 to i32 @@ -16,8 +22,8 @@ } ; FUNC-LABEL: {{^}}load_i25: -; SI-NOHSA: buffer_load_dword [[VAL:v[0-9]+]] -; SI-NOHSA: buffer_store_dword [[VAL]] +; GCN-NOHSA: buffer_load_dword [[VAL:v[0-9]+]] +; GCN-NOHSA: buffer_store_dword [[VAL]] ; CI-HSA: flat_load_dword [[VAL:v[0-9]+]] ; CI-HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VAL]] Index: test/CodeGen/AMDGPU/lshr.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/lshr.v2i16.ll +++ test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s ; GCN-LABEL: {{^}}s_lshr_v2i16: ; GFX9: s_load_dword [[LHS:s[0-9]+]] @@ -9,10 +9,13 @@ ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]] ; VI: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; CIVI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16 -; CIVI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16 +; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +; CI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16 +; CI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI: v_cvt_pk_u16_u32 +; CI-NOT: v_bfe_u32 define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = lshr <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out Index: test/CodeGen/AMDGPU/scalar_to_vector.ll =================================================================== --- test/CodeGen/AMDGPU/scalar_to_vector.ll +++ test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -1,14 +1,20 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s + +; FIXME: SHould use cvt_pk_u16_u32 on VI, need to form when SDWA isn't +; usable. -; XXX - Why the packing? ; GCN-LABEL: {{^}}scalar_to_vector_v2i32: ; GCN: buffer_load_dword [[VAL:v[0-9]+]], ; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 16, [[VAL]] -; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[SHR]] -; GCN: v_or_b32_e32 v[[OR:[0-9]+]], [[SHR]], [[SHL]] -; GCN: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[OR]] -; GCN: buffer_store_dwordx2 v{{\[}}[[OR]]:[[COPY]]{{\]}} + +; SI: v_cvt_pk_u16_u32_e32 v[[PACK:[0-9]+]], [[SHR]], [[SHR]] + +; VI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[SHR]] +; VI: v_or_b32_e32 v[[PACK:[0-9]+]], [[SHR]], [[SHL]] + +; GCN: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[PACK]] +; GCN: buffer_store_dwordx2 v{{\[}}[[PACK]]:[[COPY]]{{\]}} define amdgpu_kernel void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %tmp1 = load i32, i32 addrspace(1)* %in, align 4 %bc = bitcast i32 %tmp1 to <2 x i16> Index: test/CodeGen/AMDGPU/shl.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/shl.v2i16.ll +++ test/CodeGen/AMDGPU/shl.v2i16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s ; GCN-LABEL: {{^}}s_shl_v2i16: ; GFX9: s_load_dword [[LHS:s[0-9]+]] @@ -15,8 +15,9 @@ ; CI: v_lshlrev_b32_e32 ; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} ; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; CI: v_or_b32_e32 +; CI-NOT: v_lshl +; CI-NOT: v_or_b32 +; CI: v_cvt_pk_u16_u32_e32 define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = shl <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out @@ -24,22 +25,26 @@ } ; GCN-LABEL: {{^}}v_shl_v2i16: -; GCN: {{buffer|flat|global}}_load_dword [[LHS:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[RHS:v[0-9]+]] +; GFX9: global_load_dword [[LHS:v[0-9]+]] +; GFX9: global_load_dword [[RHS:v[0-9]+]] ; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] +; VI: flat_load_dword [[LHS:v[0-9]+]] +; VI: flat_load_dword [[RHS:v[0-9]+]] ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_lshlrev_b16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI-DAG: buffer_load_dword [[LHS:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CI-DAG: buffer_load_dword [[RHS:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}} +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[LHS]] ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CI: v_lshl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} -; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI: v_cvt_pk_u16_u32_e32 define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 Index: test/CodeGen/AMDGPU/sitofp.f16.ll =================================================================== --- test/CodeGen/AMDGPU/sitofp.f16.ll +++ test/CodeGen/AMDGPU/sitofp.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; GCN-LABEL: {{^}}sitofp_i16_to_f16 ; GCN: buffer_load_{{sshort|ushort}} v[[A_I16:[0-9]+]] @@ -42,8 +42,9 @@ ; SI: v_cvt_f32_i32_e32 ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 -; SI-DAG: v_lshlrev_b32_e32 -; SI: v_or_b32_e32 +; SI-NOT: v_lshl +; SI-NOT: v_or_b32 +; SI: v_cvt_pk_u16_u32_e3 ; VI-DAG: v_cvt_f32_i32_sdwa ; VI-DAG: v_cvt_f32_i32_sdwa @@ -71,8 +72,9 @@ ; SI: v_cvt_f32_i32_e32 ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 -; SI-DAG: v_lshlrev_b32_e32 -; SI: v_or_b32_e32 +; SI-NOT: v_lshl +; SI-NOT: v_or_ +; SI: v_cvt_pk_u16_u32_e32 ; VI-DAG: v_cvt_f32_i32_e32 ; VI-DAG: v_cvt_f32_i32_e32 Index: test/CodeGen/AMDGPU/uitofp.f16.ll =================================================================== --- test/CodeGen/AMDGPU/uitofp.f16.ll +++ test/CodeGen/AMDGPU/uitofp.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; GCN-LABEL: {{^}}uitofp_i16_to_f16 ; GCN: buffer_load_ushort v[[A_I16:[0-9]+]] @@ -43,8 +43,9 @@ ; SI: v_cvt_f32_u32_e32 ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 -; SI-DAG: v_lshlrev_b32_e32 -; SI: v_or_b32_e32 +; SI-NOT: v_lshl +; SI-NOT: v_or_b32 +; SI: v_cvt_pk_u16_u32 ; VI-DAG: v_cvt_f16_f32_e32 ; VI-DAG: v_cvt_f32_i32_sdwa @@ -71,8 +72,9 @@ ; SI: v_cvt_f32_u32_e32 ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 -; SI-DAG: v_lshlrev_b32_e32 -; SI: v_or_b32_e32 +; SI-NOT: v_lshl +; SI-NOT: v_or_b32 +; SI: v_cvt_pk_u16_u32_e32 ; VI-DAG: v_cvt_f32_u32_e32 ; VI-DAG: v_cvt_f32_u32_e32 Index: test/CodeGen/AMDGPU/v_mac_f16.ll =================================================================== --- test/CodeGen/AMDGPU/v_mac_f16.ll +++ test/CodeGen/AMDGPU/v_mac_f16.ll @@ -316,9 +316,9 @@ ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]] ; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]] -; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: v_lshl ; VI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]] +; SI: v_cvt_pk_u16_u32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_1]] ; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] ; VI-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1