Index: llvm/lib/CodeGen/GlobalISel/CallLowering.cpp =================================================================== --- llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -244,16 +244,23 @@ return B.buildConcatVectors(DstRegs[0], SrcRegs); } - const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits(); - Register Undef = B.buildUndef(PartLLT).getReg(0); + Register UnmergeSrcReg; + if (LCMTy != PartLLT) { + const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits(); + Register Undef = B.buildUndef(PartLLT).getReg(0); - // Build vector of undefs. - SmallVector WidenedSrcs(NumWide, Undef); + // Build vector of undefs. + SmallVector WidenedSrcs(NumWide, Undef); - // Replace the first sources with the real registers. - std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin()); + // Replace the first sources with the real registers. + std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin()); + UnmergeSrcReg = B.buildConcatVectors(LCMTy, WidenedSrcs).getReg(0); + } else { + // Handle widening a smaller vector to larger only. + assert(SrcRegs.size() == 1); + UnmergeSrcReg = SrcRegs[0]; + } - auto Widened = B.buildConcatVectors(LCMTy, WidenedSrcs); int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits(); SmallVector PadDstRegs(NumDst); @@ -263,7 +270,7 @@ for (int I = DstRegs.size(); I != NumDst; ++I) PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy); - return B.buildUnmerge(PadDstRegs, Widened); + return B.buildUnmerge(PadDstRegs, UnmergeSrcReg); } /// Create a sequence of instructions to combine pieces split into register @@ -274,6 +281,16 @@ ArrayRef Regs, LLT LLTy, LLT PartLLT) { MachineRegisterInfo &MRI = *B.getMRI(); + // We could just insert a regular copy, but this is unreachable at the moment. + assert(LLTy != PartLLT && "identical part types shouldn't reach here"); + + if (PartLLT.isVector() == LLTy.isVector() && + PartLLT.getScalarSizeInBits() > LLTy.getScalarSizeInBits()) { + assert(OrigRegs.size() == 1 && Regs.size() == 1); + B.buildTrunc(OrigRegs[0], Regs[0]); + return; + } + if (!LLTy.isVector() && !PartLLT.isVector()) { assert(OrigRegs.size() == 1); LLT OrigTy = MRI.getType(OrigRegs[0]); @@ -289,9 +306,9 @@ return; } - if (LLTy.isVector() && PartLLT.isVector()) { - assert(OrigRegs.size() == 1); - assert(LLTy.getElementType() == PartLLT.getElementType()); + if (PartLLT.isVector()) { + assert(OrigRegs.size() == 1 && + LLTy.getScalarType() == PartLLT.getElementType()); mergeVectorRegsToResultRegs(B, OrigRegs, Regs); return; } @@ -341,6 +358,65 @@ } } +static void buildCopyFromParts(MachineIRBuilder &B, ArrayRef DstRegs, + Register SrcReg, LLT SrcTy, LLT PartTy) { + // We could just insert a regular copy, but this is unreachable at the moment. + assert(SrcTy != PartTy && "identical part types shouldn't reach here"); + + const unsigned PartSize = PartTy.getSizeInBits(); + + if (PartTy.isVector() == SrcTy.isVector() && + PartTy.getScalarSizeInBits() > SrcTy.getScalarSizeInBits()) { + assert(DstRegs.size() == 1); + B.buildAnyExt(DstRegs[0], SrcReg); + return; + } + + if (SrcTy.isVector() && !PartTy.isVector() && + PartSize > SrcTy.getElementType().getSizeInBits()) { + // Vector was scalarized, and the elements extended. 
+ auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg); + for (int i = 0, e = DstRegs.size(); i != e; ++i) + B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i)); + return; + } + + LLT GCDTy = getGCDType(SrcTy, PartTy); + if (GCDTy == PartTy) { + // If this already evenly divisible, we can create a simple unmerge. + B.buildUnmerge(DstRegs, SrcReg); + return; + } + + MachineRegisterInfo &MRI = *B.getMRI(); + LLT DstTy = MRI.getType(DstRegs[0]); + LLT LCMTy = getLCMType(SrcTy, PartTy); + + const unsigned LCMSize = LCMTy.getSizeInBits(); + const unsigned DstSize = DstTy.getSizeInBits(); + const unsigned SrcSize = SrcTy.getSizeInBits(); + + Register UnmergeSrc = SrcReg; + if (LCMSize != SrcSize) { + // Widen to the common type. + Register Undef = B.buildUndef(SrcTy).getReg(0); + SmallVector MergeParts(1, SrcReg); + for (unsigned Size = SrcSize; Size != LCMSize; Size += SrcSize) + MergeParts.push_back(Undef); + + UnmergeSrc = B.buildMerge(LCMTy, MergeParts).getReg(0); + } + + // Unmerge to the original registers and pad with dead defs. + SmallVector UnmergeResults(DstRegs.begin(), DstRegs.end()); + for (unsigned Size = DstSize * DstRegs.size(); Size != LCMSize; + Size += DstSize) { + UnmergeResults.push_back(MRI.createGenericVirtualRegister(DstTy)); + } + + B.buildUnmerge(UnmergeResults, UnmergeSrc); +} + bool CallLowering::handleAssignments(MachineIRBuilder &MIRBuilder, SmallVectorImpl &Args, ValueHandler &Handler, @@ -362,6 +438,7 @@ ValueHandler &Handler, Register ThisReturnReg) const { MachineFunction &MF = MIRBuilder.getMF(); + MachineRegisterInfo &MRI = MF.getRegInfo(); const Function &F = MF.getFunction(); const DataLayout &DL = F.getParent()->getDataLayout(); @@ -387,10 +464,20 @@ if (Handler.assignArg(i, NewVT, NewVT, CCValAssign::Full, Args[i], Args[i].Flags[0], CCInfo)) return false; + + // If we couldn't directly assign this part, some casting may be + // necessary. Create the new register, but defer inserting the conversion + // instructions. + assert(Args[i].OrigRegs.empty()); + Args[i].OrigRegs.push_back(Args[i].Regs[0]); + assert(Args[i].Regs.size() == 1); + + const LLT VATy(NewVT); + Args[i].Regs[0] = MRI.createGenericVirtualRegister(VATy); continue; } - assert(NumParts > 1); + const LLT NewLLT(NewVT); // For incoming arguments (physregs to vregs), we could have values in // physregs (or memlocs) which we want to extract and copy to vregs. @@ -407,13 +494,11 @@ Args[i].OrigRegs.push_back(Args[i].Regs[0]); Args[i].Regs.clear(); Args[i].Flags.clear(); - LLT NewLLT = getLLTForMVT(NewVT); // For each split register, create and assign a vreg that will store // the incoming component of the larger value. These will later be // merged to form the final vreg. for (unsigned Part = 0; Part < NumParts; ++Part) { - Register Reg = - MIRBuilder.getMRI()->createGenericVirtualRegister(NewLLT); + Register Reg = MRI.createGenericVirtualRegister(NewLLT); ISD::ArgFlagsTy Flags = OrigFlags; if (Part == 0) { Flags.setSplit(); @@ -431,12 +516,13 @@ } } } else { + assert(Args[i].Regs.size() == 1); + // This type is passed via multiple registers in the calling convention. // We need to extract the individual parts. 
- Register LargeReg = Args[i].Regs[0]; - LLT SmallTy = LLT::scalar(NewVT.getSizeInBits()); - auto Unmerge = MIRBuilder.buildUnmerge(SmallTy, LargeReg); - assert(Unmerge->getNumOperands() == NumParts + 1); + assert(Args[i].OrigRegs.empty()); + Args[i].OrigRegs.push_back(Args[i].Regs[0]); + ISD::ArgFlagsTy OrigFlags = Args[i].Flags[0]; // We're going to replace the regs and flags with the split ones. Args[i].Regs.clear(); @@ -459,7 +545,9 @@ Flags.setReturned(false); } - Args[i].Regs.push_back(Unmerge.getReg(PartIdx)); + Register NewReg = MRI.createGenericVirtualRegister(NewLLT); + + Args[i].Regs.push_back(NewReg); Args[i].Flags.push_back(Flags); if (Handler.assignArg(i, NewVT, NewVT, CCValAssign::Full, Args[i], Args[i].Flags[PartIdx], CCInfo)) @@ -483,7 +571,6 @@ continue; } - EVT OrigVT = EVT::getEVT(Args[i].Ty); EVT VAVT = VA.getValVT(); const LLT OrigTy = getLLTForType(*Args[i].Ty, DL); const LLT VATy(VAVT.getSimpleVT()); @@ -491,12 +578,18 @@ // Expected to be multiple regs for a single incoming arg. // There should be Regs.size() ArgLocs per argument. unsigned NumArgRegs = Args[i].Regs.size(); - MachineRegisterInfo &MRI = MF.getRegInfo(); assert((j + (NumArgRegs - 1)) < ArgLocs.size() && "Too many regs for number of args"); + + // Coerce into outgoing value types before register assignment. + if (!Handler.isIncomingArgumentHandler() && OrigTy != VATy) { + assert(Args[i].OrigRegs.size() == 1); + buildCopyFromParts(MIRBuilder, Args[i].Regs, Args[i].OrigRegs[0], OrigTy, + VATy); + } + for (unsigned Part = 0; Part < NumArgRegs; ++Part) { Register ArgReg = Args[i].Regs[Part]; - LLT ArgRegTy = MRI.getType(ArgReg); // There should be Regs.size() ArgLocs per argument. VA = ArgLocs[j + Part]; if (VA.isMemLoc()) { @@ -524,57 +617,16 @@ continue; } - // GlobalISel does not currently work for scalable vectors. - if (OrigVT.getFixedSizeInBits() >= VAVT.getFixedSizeInBits() || - !Handler.isIncomingArgumentHandler()) { - // This is an argument that might have been split. There should be - // Regs.size() ArgLocs per argument. - - // Insert the argument copies. If VAVT < OrigVT, we'll insert the merge - // to the original register after handling all of the parts. - Handler.assignValueToReg(Args[i].Regs[Part], VA.getLocReg(), VA); - continue; - } - - // This ArgLoc covers multiple pieces, so we need to split it. - Register NewReg = MRI.createGenericVirtualRegister(VATy); - Handler.assignValueToReg(NewReg, VA.getLocReg(), VA); - // If it's a vector type, we either need to truncate the elements - // or do an unmerge to get the lower block of elements. - if (VATy.isVector() && - VATy.getNumElements() > OrigVT.getVectorNumElements()) { - // Just handle the case where the VA type is a multiple of original - // type. 
- if (VATy.getNumElements() % OrigVT.getVectorNumElements() != 0) { - LLVM_DEBUG(dbgs() << "Incoming promoted vector arg elts is not a " - "multiple of orig type elt: " - << VATy << " vs " << OrigTy); - return false; - } - SmallVector DstRegs = {ArgReg}; - unsigned NumParts = - VATy.getNumElements() / OrigVT.getVectorNumElements() - 1; - for (unsigned Idx = 0; Idx < NumParts; ++Idx) - DstRegs.push_back( - MIRBuilder.getMRI()->createGenericVirtualRegister(OrigTy)); - MIRBuilder.buildUnmerge(DstRegs, {NewReg}); - } else if (VATy.getScalarSizeInBits() > ArgRegTy.getScalarSizeInBits()) { - MIRBuilder.buildTrunc(ArgReg, {NewReg}).getReg(0); - } else { - MIRBuilder.buildCopy(ArgReg, NewReg); - } + Handler.assignValueToReg(ArgReg, VA.getLocReg(), VA); } - // Now that all pieces have been handled, re-pack any arguments into any - // wider, original registers. - if (Handler.isIncomingArgumentHandler()) { + // Now that all pieces have been assigned, re-pack the register typed values + // into the original value typed registers. + if (Handler.isIncomingArgumentHandler() && OrigTy != VATy) { // Merge the split registers into the expected larger result vregs of // the original call. - - if (OrigTy != VATy && !Args[i].OrigRegs.empty()) { - buildCopyToParts(MIRBuilder, Args[i].OrigRegs, Args[i].Regs, OrigTy, - VATy); - } + buildCopyToParts(MIRBuilder, Args[i].OrigRegs, Args[i].Regs, OrigTy, + VATy); } j += NumArgRegs - 1; Index: llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -459,12 +459,6 @@ assert(VRegs.size() == SplitEVTs.size() && "For each split Type there should be exactly one VReg."); - // We pre-process the return value decomposed into EVTs. - SmallVector PreSplitRetInfos; - - // Further processing is applied to split the arguments from PreSplitRetInfos - // into 32-bit pieces in SplitRetInfos before passing off to - // handleAssignments. 
SmallVector SplitRetInfos; for (unsigned i = 0; i < SplitEVTs.size(); ++i) { @@ -498,18 +492,7 @@ setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F); } - splitToValueTypes(B, RetInfo, PreSplitRetInfos, DL, CC); - - // FIXME: This splitting should mostly be done by handleAssignments - processSplitArgs(B, RetInfo, - PreSplitRetInfos, SplitRetInfos, DL, CC, true, - [&](ArrayRef Regs, Register SrcReg, LLT LLTy, - LLT PartLLT, int VTSplitIdx) { - unpackRegsToOrigType(B, Regs, SrcReg, - PreSplitRetInfos[VTSplitIdx], LLTy, - PartLLT); - }); - PreSplitRetInfos.clear(); + splitToValueTypes(B, RetInfo, SplitRetInfos, DL, CC); } CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg()); Index: llvm/test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll =================================================================== --- llvm/test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll +++ llvm/test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll @@ -156,9 +156,9 @@ ; CHECK: [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[COPY]](p0) :: (load 16 from %ir.ptr) ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp ; CHECK: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](s128) - ; CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](s128) ; CHECK: $x0 = COPY [[UV]](s64) ; CHECK: $x1 = COPY [[UV1]](s64) + ; CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](s128) ; CHECK: $x2 = COPY [[UV2]](s64) ; CHECK: $x3 = COPY [[UV3]](s64) ; CHECK: $x4 = COPY [[COPY]](p0) Index: llvm/test/CodeGen/AArch64/GlobalISel/call-lowering-vectors.ll =================================================================== --- llvm/test/CodeGen/AArch64/GlobalISel/call-lowering-vectors.ll +++ llvm/test/CodeGen/AArch64/GlobalISel/call-lowering-vectors.ll @@ -19,12 +19,9 @@ ; CHECK: bb.1 (%ir-block.0): ; CHECK: liveins: $w0, $w1, $w2 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) - ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $w1 - ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY $w2 - ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY1]](s32), [[COPY3]](s32), [[COPY5]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) ; CHECK: [[TRUNC:%[0-9]+]]:_(<3 x s8>) = G_TRUNC [[BUILD_VECTOR]](<3 x s32>) ; CHECK: [[BITCAST:%[0-9]+]]:_(s24) = G_BITCAST [[TRUNC]](<3 x s8>) ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[BITCAST]](s24) Index: llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-arguments.ll =================================================================== --- llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-arguments.ll +++ llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-arguments.ll @@ -27,9 +27,9 @@ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp ; CHECK: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[MV]](s128) -; CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[MV1]](s128) ; CHECK: $x0 = COPY [[UV]](s64) ; CHECK: $x1 = COPY [[UV1]](s64) +; CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[MV1]](s128) ; CHECK: $x2 = COPY [[UV2]](s64) ; CHECK: $x3 = COPY [[UV3]](s64) ; CHECK: BL 
@use_s128, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $x0, implicit $x1, implicit $x2, implicit $x3 Index: llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-stack-evt-bug47619.ll =================================================================== --- llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-stack-evt-bug47619.ll +++ llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-stack-evt-bug47619.ll @@ -17,8 +17,9 @@ ; CHECK: [[COPY6:%[0-9]+]]:_(s64) = COPY $x6 ; CHECK: [[COPY7:%[0-9]+]]:_(s64) = COPY $x7 ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.0 - ; CHECK: [[LOAD:%[0-9]+]]:_(s3) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load 1 from %fixed-stack.0, align 16) - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s3) + ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load 4 from %fixed-stack.0, align 16) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s3) = G_TRUNC [[LOAD]](s32) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s3) ; CHECK: $w0 = COPY [[ANYEXT]](s32) ; CHECK: RET_ReallyLR implicit $w0 bb: Index: llvm/test/CodeGen/AArch64/GlobalISel/legalize-s128-div.mir =================================================================== --- llvm/test/CodeGen/AArch64/GlobalISel/legalize-s128-div.mir +++ llvm/test/CodeGen/AArch64/GlobalISel/legalize-s128-div.mir @@ -29,9 +29,9 @@ ; CHECK: [[LOAD1:%[0-9]+]]:_(s128) = G_LOAD [[COPY1]](p0) :: (load 16 from %ir.v2ptr) ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp ; CHECK: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](s128) - ; CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD1]](s128) ; CHECK: $x0 = COPY [[UV]](s64) ; CHECK: $x1 = COPY [[UV1]](s64) + ; CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD1]](s128) ; CHECK: $x2 = COPY [[UV2]](s64) ; CHECK: $x3 = COPY [[UV3]](s64) ; CHECK: BL &__udivti3, csr_darwin_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $x0, implicit $x1, implicit $x2, implicit $x3, implicit-def $x0, implicit-def $x1 @@ -70,9 +70,9 @@ ; CHECK: [[LOAD1:%[0-9]+]]:_(s128) = G_LOAD [[COPY1]](p0) :: (load 16 from %ir.v2ptr) ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp ; CHECK: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](s128) - ; CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD1]](s128) ; CHECK: $x0 = COPY [[UV]](s64) ; CHECK: $x1 = COPY [[UV1]](s64) + ; CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD1]](s128) ; CHECK: $x2 = COPY [[UV2]](s64) ; CHECK: $x3 = COPY [[UV3]](s64) ; CHECK: BL &__divti3, csr_darwin_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $x0, implicit $x1, implicit $x2, implicit $x3, implicit-def $x0, implicit-def $x1 Index: llvm/test/CodeGen/AArch64/GlobalISel/ret-1x-vec.ll =================================================================== --- llvm/test/CodeGen/AArch64/GlobalISel/ret-1x-vec.ll +++ llvm/test/CodeGen/AArch64/GlobalISel/ret-1x-vec.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -mtriple=aarch64-linux-gnu -O0 -global-isel -stop-after=irtranslator -o - %s | FileCheck %s -define <1 x float> @foo(<1 x float> %v) { - ; CHECK-LABEL: name: foo +define <1 x float> @ret_v1f32(<1 x float> %v) { + ; CHECK-LABEL: name: ret_v1f32 ; CHECK: bb.1 (%ir-block.0): ; CHECK: liveins: $d0 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0 @@ -13,3 +13,23 @@ ; CHECK: 
RET_ReallyLR implicit $d0 ret <1 x float> %v } + +define <1 x i8*> @ret_v1p0(<1 x i8*> %v) { + ; CHECK-LABEL: name: ret_v1p0 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $d0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $d0 + ; CHECK: $d0 = COPY [[COPY]](p0) + ; CHECK: RET_ReallyLR implicit $d0 + ret <1 x i8*> %v +} + +define <1 x i8 addrspace(1)*> @ret_v1p1(<1 x i8 addrspace(1)*> %v) { + ; CHECK-LABEL: name: ret_v1p1 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $d0 + ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $d0 + ; CHECK: $d0 = COPY [[COPY]](p1) + ; CHECK: RET_ReallyLR implicit $d0 + ret <1 x i8 addrspace(1)*> %v +} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll @@ -363,20 +363,12 @@ } define <2 x i16> @v_andn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) { -; GFX6-LABEL: v_andn2_v2i16: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX6-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_andn2_v2i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX9-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_andn2_v2i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_xor_b32_e32 v1, -1, v1 +; GCN-NEXT: v_and_b32_e32 v0, v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor <2 x i16> %src1, %and = and <2 x i16> %src0, %not.src1 ret <2 x i16> %and Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -473,13 +473,18 @@ ; GFX6-LABEL: v_ashr_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, v1, v0 ; GFX6-NEXT: v_bfe_i32 v1, v2, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, v3, v1 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ashr_v2i16: @@ -504,10 +509,15 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 15, v0 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v1 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 15, v0 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ashr_v2i16_15: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll @@ -467,15 +467,18 @@ ; GFX7-LABEL: v_bswap_v2i16: ; GFX7: ; %bb.0: 
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v3 +; GFX7-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_bfe_u32 v1, v2, 0, 16 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_bswap_v2i16: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll @@ -81,12 +81,17 @@ ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; CHECK: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ADD1]](s32) - ; CHECK: $vgpr0 = COPY [[COPY6]](s32) - ; CHECK: $vgpr1 = COPY [[COPY7]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; CHECK: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) ; CHECK: [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] - ; CHECK: S_SETPC_B64_return [[COPY8]], implicit $vgpr0, implicit $vgpr1 + ; CHECK: S_SETPC_B64_return [[COPY8]], implicit $vgpr0 %add = add <2 x i16> %arg0, %arg0 ret <2 x i16> %add } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -408,34 +408,38 @@ ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v2 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5 ; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6 ; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v4, v5, v7 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v3, v2 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v4, v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 +; 
GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v1, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v6 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v6, v1, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, v6 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_bfe_u32 v1, v2, 0, 16 +; GFX6-IEEE-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_fdiv_v2f16: @@ -456,26 +460,30 @@ ; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v3, v2 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v2, v4, v3, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v5 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v5, v1, v5 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v5 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v1, v0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_bfe_u32 v1, v2, 0, 16 +; GFX6-FLUSH-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f16: @@ -533,14 +541,18 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: 
v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_rcp_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_rcp_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v3, v3 ; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f16_afn: @@ -575,34 +587,38 @@ ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v2 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5 ; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6 ; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v4, v5, v7 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v3, v2 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v4, v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v1, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v6 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v6, v1, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, v6 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_bfe_u32 v1, v2, 0, 16 +; GFX6-IEEE-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_fdiv_v2f16_ulp25: @@ -623,26 +639,30 @@ ; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0 +; 
GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v3, v2 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v2, v4, v3, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v5 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v5, v1, v5 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v5 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v1, v0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_bfe_u32 v1, v2, 0, 16 +; GFX6-FLUSH-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f16_ulp25: @@ -699,6 +719,7 @@ ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1 @@ -708,22 +729,25 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v3, v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v0, v1 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3 -; GFX6-IEEE-NEXT: 
v_mul_f32_e32 v6, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_bfe_u32 v1, v2, 0, 16 +; GFX6-IEEE-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_v2f16: @@ -745,25 +769,29 @@ ; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v4 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v0, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v4 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-FLUSH-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rcp_v2f16: @@ -816,6 +844,7 @@ ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1 @@ -825,22 +854,25 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v3, v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v0, 
v1 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_bfe_u32 v1, v2, 0, 16 +; GFX6-IEEE-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp: @@ -862,25 +894,29 @@ ; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v4 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v0, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v4 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-FLUSH-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rcp_v2f16_arcp: @@ -935,11 +971,15 @@ ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-NEXT: v_rcp_f32_e32 v1, v1 -; GFX6-NEXT: v_rcp_f32_e32 v3, 
v0 -; GFX6-NEXT: v_mul_f32_e32 v0, v2, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, v2, v3 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rcp_v2f16_arcp_afn: @@ -971,6 +1011,7 @@ ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1 @@ -980,22 +1021,25 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v3, v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v0, v1 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_bfe_u32 v1, v2, 0, 16 +; GFX6-IEEE-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25: @@ -1017,25 +1061,29 @@ ; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v4 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; 
GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v0, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v4 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-FLUSH-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rcp_v2f16_ulp25: @@ -1068,14 +1116,18 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_rcp_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_rcp_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v3, v3 ; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25: @@ -1110,34 +1162,38 @@ ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v2 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5 ; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6 ; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v4, v5, v7 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v3, v2 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v4, v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 +; 
GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v1, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v6 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v6, v1, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, v6 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_bfe_u32 v1, v2, 0, 16 +; GFX6-IEEE-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_fdiv_v2f16_arcp_ulp25: @@ -1158,26 +1214,30 @@ ; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v3, v2 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v2, v4, v3, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v5 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v5, v1, v5 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v5 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v1, v0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_bfe_u32 v1, v2, 0, 16 +; GFX6-FLUSH-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f16_arcp_ulp25: @@ -1235,14 +1295,18 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_rcp_f32_e32 v3, v3 ; GFX6-NEXT: 
v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_rcp_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v3, v3 ; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll @@ -93,6 +93,10 @@ ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v2f16: @@ -135,6 +139,10 @@ ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v2f16_fneg_lhs: @@ -179,6 +187,10 @@ ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v2f16_fneg_rhs: @@ -225,6 +237,10 @@ ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v2f16_fneg_lhs_rhs: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll @@ -123,6 +123,10 @@ ; GFX6-NEXT: v_exp_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f16: @@ -186,6 +190,10 @@ ; GFX6-NEXT: v_exp_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f16_fneg_lhs: @@ -252,6 +260,10 @@ ; GFX6-NEXT: v_exp_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f16_fneg_rhs: @@ 
-320,6 +332,10 @@ ; GFX6-NEXT: v_exp_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f16_fneg_lhs_rhs: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll @@ -78,8 +78,9 @@ ; CHECK: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load 4 from %ir.ptr0 + 4, addrspace 1) ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32 - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD1]](s8) - ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16) + ; CHECK: $vgpr0 = COPY [[ANYEXT1]](s32) ; CHECK: $vgpr1 = COPY [[LOAD2]](s32) ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>) @@ -106,8 +107,9 @@ ; CHECK: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load 4 from %ir.ptr0 + 4, addrspace 1) ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD1]](s8) - ; CHECK: $sgpr4 = COPY [[ANYEXT]](s32) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16) + ; CHECK: $sgpr4 = COPY [[ANYEXT1]](s32) ; CHECK: $sgpr5 = COPY [[LOAD2]](s32) ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll @@ -513,8 +513,9 @@ ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s8) - ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[C]](s8) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16) + ; CHECK: $vgpr0 = COPY [[ANYEXT1]](s32) ; CHECK: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) ; CHECK: $sgpr4_sgpr5 = COPY [[COPY10]](p4) @@ -3612,17 +3613,19 @@ ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY21]], [[C3]](s32) ; CHECK: G_STORE [[UV31]](s32), [[PTR_ADD1]](p5) :: (store 4 into stack, align 16, addrspace 5) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD2]](s8) ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY21]], [[C4]](s32) - ; CHECK: G_STORE [[LOAD2]](s8), [[PTR_ADD2]](p5) :: (store 1 into stack + 4, align 4, addrspace 5) + ; CHECK: G_STORE [[ANYEXT]](s16), [[PTR_ADD2]](p5) :: (store 2 into stack + 4, 
align 4, addrspace 5) + ; CHECK: [[COPY22:%[0-9]+]]:_(s16) = COPY [[ANYEXT]](s16) ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; CHECK: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY21]], [[C5]](s32) - ; CHECK: G_STORE [[LOAD2]](s8), [[PTR_ADD3]](p5) :: (store 1 into stack + 8, align 8, addrspace 5) + ; CHECK: G_STORE [[COPY22]](s16), [[PTR_ADD3]](p5) :: (store 2 into stack + 8, align 8, addrspace 5) ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 ; CHECK: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY21]], [[C6]](s32) ; CHECK: G_STORE [[LOAD3]](s16), [[PTR_ADD4]](p5) :: (store 2 into stack + 12, align 4, addrspace 5) - ; CHECK: [[COPY22:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY22]](<4 x s32>) + ; CHECK: [[COPY23:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY23]](<4 x s32>) ; CHECK: $sgpr4_sgpr5 = COPY [[COPY11]](p4) ; CHECK: $sgpr6_sgpr7 = COPY [[COPY12]](p4) ; CHECK: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) @@ -3787,8 +3790,9 @@ ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD1]](s8) - ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16) + ; CHECK: $vgpr0 = COPY [[ANYEXT1]](s32) ; CHECK: $vgpr1 = COPY [[LOAD2]](s32) ; CHECK: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) @@ -3822,8 +3826,9 @@ ; CHECK: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load 4 from %ir.ptr0 + 4, addrspace 1) ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32 - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD1]](s8) - ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16) + ; CHECK: $vgpr0 = COPY [[ANYEXT1]](s32) ; CHECK: $vgpr1 = COPY [[LOAD2]](s32) ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>) @@ -3850,8 +3855,9 @@ ; CHECK: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load 4 from %ir.ptr0 + 4, addrspace 1) ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD1]](s8) - ; CHECK: $sgpr4 = COPY [[ANYEXT]](s32) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16) + ; CHECK: $sgpr4 = COPY [[ANYEXT1]](s32) ; CHECK: $sgpr5 = COPY [[LOAD2]](s32) ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll @@ -1867,7 +1867,8 @@ ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3 ; CHECK: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 1 from 
%fixed-stack.3, align 16, addrspace 5) ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2 - ; CHECK: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load 1 from %fixed-stack.2, align 4, addrspace 5) + ; CHECK: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load 2 from %fixed-stack.2, align 4, addrspace 5) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD1]](s16) ; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 ; CHECK: [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load 2 from %fixed-stack.1, align 8, addrspace 5) ; CHECK: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 @@ -1880,7 +1881,7 @@ ; CHECK: [[COPY36:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store 128 into `<32 x i32> addrspace(1)* undef`, addrspace 1) ; CHECK: G_STORE [[LOAD]](s1), [[COPY33]](p1) :: (volatile store 1 into `i1 addrspace(1)* undef`, addrspace 1) - ; CHECK: G_STORE [[LOAD1]](s8), [[COPY34]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1) + ; CHECK: G_STORE [[TRUNC]](s8), [[COPY34]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1) ; CHECK: G_STORE [[LOAD2]](s16), [[COPY35]](p1) :: (volatile store 2 into `i16 addrspace(1)* undef`, addrspace 1) ; CHECK: G_STORE [[LOAD3]](s16), [[COPY36]](p1) :: (volatile store 2 into `half addrspace(1)* undef`, addrspace 1) ; CHECK: [[COPY37:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -491,6 +491,8 @@ ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v3, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_lshr_v2i16: @@ -515,9 +517,10 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 15, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 15, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_lshr_v2i16_15: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll @@ -363,20 +363,12 @@ } define <2 x i16> @v_orn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) { -; GFX6-LABEL: v_orn2_v2i16: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX6-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_orn2_v2i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_orn2_v2i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_xor_b32_e32 v1, -1, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1> %or = or <2 
x i16> %src0, %not.src1 ret <2 x i16> %or Index: llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll @@ -178,11 +178,15 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-NEXT: v_rndne_f32_e32 v0, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_rndne_f32_e32 v1, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_rndne_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_rndne_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_roundeven_v2f16: @@ -190,11 +194,15 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_rndne_f32_e32 v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_rndne_f32_e32 v1, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_rndne_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_rndne_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX7-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_roundeven_v2f16: @@ -226,11 +234,15 @@ ; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-NEXT: v_rndne_f32_e32 v0, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_rndne_f32_e32 v1, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_rndne_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_rndne_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_roundeven_v2f16_fneg: @@ -239,11 +251,15 @@ ; GFX7-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_rndne_f32_e32 v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_rndne_f32_e32 v1, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_rndne_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_rndne_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX7-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_roundeven_v2f16_fneg: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -2724,8 +2724,13 @@ ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: 
v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_v2i16: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -485,11 +485,16 @@ ; GFX6-LABEL: v_shl_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v3, v2 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_shl_v2i16: @@ -515,7 +520,9 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 15, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 15, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_shl_v2i16_15: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -2704,14 +2704,19 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1 -; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3 +; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v2i16: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -1760,8 +1760,10 @@ ; GFX6-NEXT: v_xor_b32_e32 v3, -1, v1 ; GFX6-NEXT: v_min_u32_e32 v2, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_v2i16: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -1672,8 +1672,10 @@ ; GFX6-NEXT: 
v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_min_u32_e32 v2, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v2i16: Index: llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp =================================================================== --- llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp +++ llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp @@ -1988,9 +1988,9 @@ CHECK: $x1 = COPY [[COPY]] CHECK: BL &__moddi3 CHECK: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[ANYEXT]] - CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[ANYEXT]] CHECK: $x0 = COPY [[UV]] CHECK: $x1 = COPY [[UV1]] + CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[ANYEXT]] CHECK: $x2 = COPY [[UV2]] CHECK: $x3 = COPY [[UV3]] CHECK: BL &__modti3 @@ -2045,9 +2045,9 @@ CHECK: $x1 = COPY [[COPY]] CHECK: BL &__umoddi3 CHECK: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[ANYEXT]] - CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[ANYEXT]] CHECK: $x0 = COPY [[UV]] CHECK: $x1 = COPY [[UV1]] + CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[ANYEXT]] CHECK: $x2 = COPY [[UV2]] CHECK: $x3 = COPY [[UV3]] CHECK: BL &__umodti3