Index: llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h =================================================================== --- llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h +++ llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h @@ -165,6 +165,21 @@ MachinePointerInfo &MPO, ISD::ArgFlagsTy Flags) = 0; + /// Return the in-memory size to write for the argument at \p VA. This may + /// be smaller than the allocated stack slot size. + /// + /// This is overridable primarily for targets to maintain compatibility with + /// hacks around the existing DAG call lowering infrastructure. + virtual uint64_t getStackValueStoreSize(const CCValAssign &VA) const; + + virtual Register getStackAddress(const CCValAssign &VA, + MachinePointerInfo &MPO, + ISD::ArgFlagsTy Flags) { + uint64_t MemSize = + Flags.isByVal() ? Flags.getByValSize() : VA.getLocVT().getStoreSize(); + return getStackAddress(MemSize, VA.getLocMemOffset(), MPO, Flags); + } + /// The specified value has been assigned to a physical register, /// handle the appropriate COPY (either to or from) and mark any /// relevant uses/defines as needed. @@ -212,7 +227,11 @@ Register extendRegister(Register ValReg, CCValAssign &VA, unsigned MaxSizeBits = 0); - virtual bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, + /// Wrap a call to the (typically tablegen-generated) CCAssignFn. This may be + /// overridden to track additional state information as arguments are + /// assigned, or to apply target-specific hacks around the legacy + /// infrastructure. + virtual bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, const ArgInfo &Info, ISD::ArgFlagsTy Flags, CCState &State) { return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State); Index: llvm/lib/CodeGen/GlobalISel/CallLowering.cpp =================================================================== --- llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -306,16 +306,39 @@ /// value registers of type \p LLTy, and \p Regs contains the legalized pieces /// with type \p PartLLT. This is used for incoming values (physregs to vregs). static void buildCopyFromRegs(MachineIRBuilder &B, ArrayRef<Register> OrigRegs, - ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT) { + ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, + /*LLT LocTy, */ const ISD::ArgFlagsTy Flags) { MachineRegisterInfo &MRI = *B.getMRI(); - // We could just insert a regular copy, but this is unreachable at the moment. - assert(LLTy != PartLLT && "identical part types shouldn't reach here"); + if (PartLLT == LLTy) { + // We should have avoided introducing a new virtual register, and just + // directly assigned here.
+ assert(OrigRegs[0] == Regs[0]); + return; + } + + if (PartLLT.getSizeInBits() == LLTy.getSizeInBits() && OrigRegs.size() == 1 && + Regs.size() == 1) { + B.buildBitcast(OrigRegs[0], Regs[0]); + return; + } if (PartLLT.isVector() == LLTy.isVector() && - PartLLT.getScalarSizeInBits() > LLTy.getScalarSizeInBits()) { - assert(OrigRegs.size() == 1 && Regs.size() == 1); - B.buildTrunc(OrigRegs[0], Regs[0]); + PartLLT.getScalarSizeInBits() > LLTy.getScalarSizeInBits() && + OrigRegs.size() == 1 && Regs.size() == 1) { + Register SrcReg = Regs[0]; + + LLT LocTy = MRI.getType(SrcReg); + + if (Flags.isSExt()) { + SrcReg = B.buildAssertSExt(LocTy, SrcReg, + LLTy.getScalarSizeInBits()).getReg(0); + } else if (Flags.isZExt()) { + SrcReg = B.buildAssertZExt(LocTy, SrcReg, + LLTy.getScalarSizeInBits()).getReg(0); + } + + B.buildTrunc(OrigRegs[0], SrcReg); return; } @@ -335,9 +358,23 @@ } if (PartLLT.isVector()) { - assert(OrigRegs.size() == 1 && - LLTy.getScalarType() == PartLLT.getElementType()); - mergeVectorRegsToResultRegs(B, OrigRegs, Regs); + assert(OrigRegs.size() == 1); + + if (LLTy.getScalarType() == PartLLT.getElementType()) { + mergeVectorRegsToResultRegs(B, OrigRegs, Regs); + } else { + SmallVector<Register> CastRegs(Regs.size()); + unsigned I = 0; + LLT GCDTy = getGCDType(LLTy, PartLLT); + + // We are both splitting a vector, and bitcasting its element types. Cast + // the source pieces into the appropriate number of pieces with the result + // element type. + for (Register SrcReg : Regs) + CastRegs[I++] = B.buildBitcast(GCDTy, SrcReg).getReg(0); + mergeVectorRegsToResultRegs(B, OrigRegs, CastRegs); + } + return; } @@ -487,14 +524,9 @@ unsigned NumArgs = Args.size(); for (unsigned i = 0; i != NumArgs; ++i) { EVT CurVT = EVT::getEVT(Args[i].Ty); - if (CurVT.isSimple() && - !Handler.assignArg(i, CurVT.getSimpleVT(), CurVT.getSimpleVT(), - CCValAssign::Full, Args[i], Args[i].Flags[0], - CCInfo)) - continue; MVT NewVT = TLI->getRegisterTypeForCallingConv( - F.getContext(), CCInfo.getCallingConv(), EVT(CurVT)); + F.getContext(), CCInfo.getCallingConv(), CurVT); // If we need to split the type over multiple regs, check it's a scenario // we currently support. @@ -503,7 +535,7 @@ if (NumParts == 1) { // Try to use the register type if we couldn't assign the VT. - if (Handler.assignArg(i, NewVT, NewVT, CCValAssign::Full, Args[i], + if (Handler.assignArg(i, CurVT, NewVT, NewVT, CCValAssign::Full, Args[i], Args[i].Flags[0], CCInfo)) return false; continue; } @@ -544,7 +576,7 @@ } Args[i].Flags.push_back(Flags); - if (Handler.assignArg(i, NewVT, NewVT, CCValAssign::Full, Args[i], + if (Handler.assignArg(i, CurVT, NewVT, NewVT, CCValAssign::Full, Args[i], Args[i].Flags[Part], CCInfo)) { // Still couldn't assign this smaller part type for some reason. return false; } @@ -566,8 +598,15 @@ continue; } - const EVT VAVT = VA.getValVT(); - const LLT NewLLT(VAVT.getSimpleVT()); + const MVT VAVT = VA.getValVT(); + const MVT ValVT = VA.getValVT(); + const MVT LocVT = VA.getLocVT(); + + const LLT LocTy(LocVT); + const LLT VATy(VAVT); + const LLT ValTy(ValVT); + const LLT NewLLT = Handler.isIncomingArgumentHandler() ? LocTy : VATy; + const EVT OrigVT = EVT::getEVT(Args[i].Ty); const LLT OrigTy = getLLTForType(*Args[i].Ty, DL); // Expected to be multiple regs for a single incoming arg.
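To make the new Flags parameter concrete: for a zero-extended incoming value, buildCopyFromRegs now keeps the copy at the location type, records the known extension with G_ASSERT_ZEXT, and only then truncates to the original type. A minimal MIR sketch (illustrative only; it assumes an AArch64-style convention where an i8 argument arrives widened to s32 in $w0):

  %loc:_(s32) = COPY $w0
  %hint:_(s32) = G_ASSERT_ZEXT %loc, 8
  %arg:_(s8) = G_TRUNC %hint(s32)

The sign-extended case is identical with G_ASSERT_SEXT in place of G_ASSERT_ZEXT.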
@@ -590,8 +629,6 @@ Args[i].Regs[Part] = MRI.createGenericVirtualRegister(NewLLT); } - const LLT VATy(VAVT.getSimpleVT()); - assert((j + (NumParts - 1)) < ArgLocs.size() && "Too many regs for number of args"); @@ -612,14 +649,14 @@ // Individual pieces may have been spilled to the stack and others // passed in registers. - // FIXME: Use correct address space for pointer size - EVT LocVT = VA.getValVT(); - unsigned MemSize = LocVT == MVT::iPTR ? DL.getPointerSize() - : LocVT.getStoreSize(); - unsigned Offset = VA.getLocMemOffset(); + // TODO: The memory size may be larger than the value we need to + // store. We may need to adjust the offset for big endian targets. + uint64_t MemSize = Handler.getStackValueStoreSize(VA); + MachinePointerInfo MPO; Register StackAddr = - Handler.getStackAddress(MemSize, Offset, MPO, Flags); + Handler.getStackAddress(MemSize, VA.getLocMemOffset(), MPO, Flags); + Handler.assignValueToAddress(Args[i], Part, StackAddr, MemSize, MPO, VA); continue; @@ -681,11 +718,11 @@ // Now that all pieces have been assigned, re-pack the register typed values // into the original value typed registers. - if (Handler.isIncomingArgumentHandler() && OrigTy != VATy) { + if (Handler.isIncomingArgumentHandler() && OrigVT != LocVT) { // Merge the split registers into the expected larger result vregs of // the original call. buildCopyFromRegs(MIRBuilder, Args[i].OrigRegs, Args[i].Regs, OrigTy, - VATy); + LocTy, Args[i].Flags[0]); } j += NumParts - 1; @@ -969,6 +1006,18 @@ return true; } +uint64_t CallLowering::ValueHandler::getStackValueStoreSize( + const CCValAssign &VA) const { + const EVT ValVT = VA.getValVT(); + if (ValVT != MVT::iPTR) + return ValVT.getStoreSize(); + + const DataLayout &DL = MIRBuilder.getDataLayout(); + + /// FIXME: We need to get the correct pointer address space. + return DL.getPointerSize(); +} + void CallLowering::ValueHandler::copyArgumentMemory( const ArgInfo &Arg, Register DstPtr, Register SrcPtr, const MachinePointerInfo &DstPtrInfo, Align DstAlign, @@ -996,7 +1045,8 @@ CCValAssign &VA, unsigned MaxSizeBits) { LLT LocTy{VA.getLocVT()}; - LLT ValTy = MRI.getType(ValReg); + LLT ValTy{VA.getValVT()}; + if (LocTy.getSizeInBits() == ValTy.getSizeInBits()) return ValReg; @@ -1055,18 +1105,39 @@ } } +/// Check if we can use a basic COPY instruction between the two types. +/// +/// We're currently building on top of the infrastructure using MVT, which loses +/// pointer information in the CCValAssign. We accept copies from physical +/// registers that have been reported as integers if it's to an equivalent sized +/// pointer LLT. 
+static bool isCopyCompatibleType(LLT SrcTy, LLT DstTy) { + if (SrcTy == DstTy) + return true; + + if (SrcTy.getSizeInBits() != DstTy.getSizeInBits()) + return false; + + SrcTy = SrcTy.getScalarType(); + DstTy = DstTy.getScalarType(); + + return (SrcTy.isPointer() && DstTy.isScalar()) || + (DstTy.isPointer() && SrcTy.isScalar()); +} + void CallLowering::IncomingValueHandler::assignValueToReg(Register ValVReg, Register PhysReg, CCValAssign &VA) { - const LLT LocTy(VA.getLocVT()); - const LLT ValTy = MRI.getType(ValVReg); + const MVT LocVT = VA.getLocVT(); + const LLT LocTy(LocVT); + const LLT RegTy = MRI.getType(ValVReg); - if (ValTy.getSizeInBits() == LocTy.getSizeInBits()) { + if (isCopyCompatibleType(RegTy, LocTy)) { MIRBuilder.buildCopy(ValVReg, PhysReg); return; } auto Copy = MIRBuilder.buildCopy(LocTy, PhysReg); - auto Hint = buildExtensionHint(VA, Copy.getReg(0), ValTy); + auto Hint = buildExtensionHint(VA, Copy.getReg(0), RegTy); MIRBuilder.buildTrunc(ValVReg, Hint); } Index: llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -51,6 +51,20 @@ AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI) : CallLowering(&TLI) {} +static void applyStackPassedSmallTypeDAGHack(EVT OrigVT, MVT &ValVT, + MVT &LocVT) { + // If ValVT is i1/i8/i16, we should set LocVT to i8/i8/i16. This is a legacy + // hack because the DAG calls the assignment function with pre-legalized + // register typed values, not the raw type. + // + // This hack is not applied to return values, which are not passed on the + // stack. + if (OrigVT == MVT::i1 || OrigVT == MVT::i8) + ValVT = LocVT = MVT::i8; + else if (OrigVT == MVT::i16) + ValVT = LocVT = MVT::i16; +} + namespace { struct IncomingArgHandler : public CallLowering::IncomingValueHandler { IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, @@ -73,6 +87,14 @@ return AddrReg.getReg(0); } + uint64_t getStackValueStoreSize(const CCValAssign &VA) const override { + // Account for the hack applied for stack-passed i1/i8/i16 values. + const MVT ValVT = VA.getValVT(); + return (ValVT == MVT::i8 || ValVT == MVT::i16) + ? ValVT.getStoreSize() + : VA.getLocVT().getStoreSize(); + } + void assignValueToReg(Register ValVReg, Register PhysReg, CCValAssign &VA) override { markPhysRegUsed(PhysReg); @@ -84,40 +106,36 @@ MachineFunction &MF = MIRBuilder.getMF(); // The reported memory location may be wider than the value. - const LLT RegTy = MRI.getType(ValVReg); - MemSize = std::min(static_cast<uint64_t>(RegTy.getSizeInBytes()), MemSize); + const LLT RealRegTy = MRI.getType(ValVReg); + LLT ValTy(VA.getValVT()); + LLT LocTy(VA.getLocVT()); + + // Fix up the types for the DAG compatibility hack. + if (VA.getValVT() == MVT::i8 || VA.getValVT() == MVT::i16) + std::swap(ValTy, LocTy); + + MemSize = LocTy.getSizeInBytes(); auto MMO = MF.getMachineMemOperand( MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, MemSize, inferAlignFromPtrInfo(MF, MPO)); - const LLT LocVT = LLT{VA.getLocVT()}; - - if (RegTy.getScalarSizeInBits() < LocVT.getScalarSizeInBits()) { - auto LocInfo = VA.getLocInfo(); - if (LocInfo == CCValAssign::LocInfo::ZExt) { - // We know the parameter is zero-extended. Perform a load into LocVT, - // and use G_ASSERT_ZEXT to communicate that this was zero-extended from - // the parameter type. Move down to the parameter type using G_TRUNC.
- MIRBuilder.buildTrunc( - ValVReg, MIRBuilder.buildAssertZExt( - LocVT, MIRBuilder.buildLoad(LocVT, Addr, *MMO), - RegTy.getScalarSizeInBits())); - return; - } - if (LocInfo == CCValAssign::LocInfo::SExt) { - // Same as the ZExt case, but use G_ASSERT_SEXT instead. - MIRBuilder.buildTrunc( - ValVReg, MIRBuilder.buildAssertSExt( - LocVT, MIRBuilder.buildLoad(LocVT, Addr, *MMO), - RegTy.getScalarSizeInBits())); - return; - } + if (RealRegTy.getSizeInBits() == ValTy.getSizeInBits()) { + // No extension information, or no extension necessary. Load into the + // incoming parameter type directly. + MIRBuilder.buildLoad(ValVReg, Addr, *MMO); + } else { + auto Tmp = MIRBuilder.buildLoad(LocTy, Addr, *MMO); + MIRBuilder.buildTrunc(ValVReg, Tmp); } + } - // No extension information, or no extension necessary. Load into the - // incoming parameter type directly. - MIRBuilder.buildLoad(ValVReg, Addr, *MMO); + bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags, + CCState &State) override { + applyStackPassedSmallTypeDAGHack(OrigVT, ValVT, LocVT); + return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State); } /// How the physical register gets marked varies between formal @@ -164,11 +182,11 @@ struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler { OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, MachineInstrBuilder MIB, CCAssignFn *AssignFn, - CCAssignFn *AssignFnVarArg, bool IsTailCall = false, - int FPDiff = 0) + CCAssignFn *AssignFnVarArg, bool IsReturn, + bool IsTailCall = false, int FPDiff = 0) : OutgoingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB), - AssignFnVarArg(AssignFnVarArg), IsTailCall(IsTailCall), FPDiff(FPDiff), - StackSize(0), SPReg(0), + AssignFnVarArg(AssignFnVarArg), IsReturn(IsReturn), + IsTailCall(IsTailCall), FPDiff(FPDiff), StackSize(0), SPReg(0), Subtarget(MIRBuilder.getMF().getSubtarget<AArch64Subtarget>()) {} Register getStackAddress(uint64_t Size, int64_t Offset, @@ -199,6 +217,34 @@ return AddrReg.getReg(0); } + Register getStackAddress(const CCValAssign &VA, MachinePointerInfo &MPO, + ISD::ArgFlagsTy Flags) override { + if (Flags.isByVal()) { + return getStackAddress(Flags.getByValSize(), VA.getLocMemOffset(), MPO, + Flags); + } + + // We abuse the legacy DAG-based CCAssignFns for i1/i8/i16 values, and ValVT + // and LocVT end up having the opposite meanings. + + const MVT ValVT = VA.getValVT(); + uint64_t MemSize = (ValVT == MVT::i8 || ValVT == MVT::i16) + ? ValVT.getStoreSize() + : VA.getLocVT().getStoreSize(); + return getStackAddress(MemSize, VA.getLocMemOffset(), MPO, Flags); + } + + /// We need to fix up the reported store size for certain value types because + /// we invert the interpretation of ValVT and LocVT in certain cases. This is + /// for compatibility with the DAG call lowering implementation, which we're + /// currently building on top of. + uint64_t getStackValueStoreSize(const CCValAssign &VA) const override { + const MVT ValVT = VA.getValVT(); + return (ValVT == MVT::i8 || ValVT == MVT::i16) + ?
ValVT.getStoreSize() + : VA.getLocVT().getStoreSize(); + } + void assignValueToReg(Register ValVReg, Register PhysReg, CCValAssign &VA) override { MIB.addUse(PhysReg, RegState::Implicit); @@ -215,34 +261,48 @@ } void assignValueToAddress(const CallLowering::ArgInfo &Arg, unsigned RegIndex, - Register Addr, uint64_t Size, + Register Addr, uint64_t MemSize, MachinePointerInfo &MPO, CCValAssign &VA) override { - unsigned MaxSize = Size * 8; + unsigned MaxSize = MemSize * 8; // For varargs, we always want to extend them to 8 bytes, in which case // we disable setting a max. if (!Arg.IsFixed) MaxSize = 0; - Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt - ? extendRegister(Arg.Regs[RegIndex], VA, MaxSize) - : Arg.Regs[0]; + Register ValVReg = Arg.Regs[RegIndex]; + if (VA.getLocInfo() != CCValAssign::LocInfo::FPExt) { + MVT LocVT = VA.getLocVT(); + MVT ValVT = VA.getValVT(); + + if (VA.getValVT() == MVT::i8 || VA.getValVT() == MVT::i16) { + std::swap(ValVT, LocVT); + MemSize = VA.getValVT().getStoreSize(); + } + + ValVReg = extendRegister(ValVReg, VA, MaxSize); + const LLT RegTy = MRI.getType(ValVReg); - // If we extended we might need to adjust the MMO's Size. - const LLT RegTy = MRI.getType(ValVReg); - if (RegTy.getSizeInBytes() > Size) - Size = RegTy.getSizeInBytes(); + if (RegTy.getSizeInBits() < LocVT.getSizeInBits()) + ValVReg = MIRBuilder.buildTrunc(RegTy, ValVReg).getReg(0); + } else { + // The store does not cover the full allocated stack slot. + MemSize = VA.getValVT().getStoreSize(); + } - assignValueToAddress(ValVReg, Addr, Size, MPO, VA); + assignValueToAddress(ValVReg, Addr, MemSize, MPO, VA); } - bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, + bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - const CallLowering::ArgInfo &Info, - ISD::ArgFlagsTy Flags, + const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags, CCState &State) override { - bool Res; bool IsCalleeWin = Subtarget.isCallingConvWin64(State.getCallingConv()); bool UseVarArgsCCForFixed = IsCalleeWin && State.isVarArg(); + + if (!State.isVarArg() && !UseVarArgsCCForFixed && !IsReturn) + applyStackPassedSmallTypeDAGHack(OrigVT, ValVT, LocVT); + + bool Res; if (Info.IsFixed && !UseVarArgsCCForFixed) Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State); else @@ -254,6 +314,11 @@ MachineInstrBuilder MIB; CCAssignFn *AssignFnVarArg; + + /// Track if this is used for a return instead of function argument + /// passing. We apply a hack to i1/i8/i16 stack-passed values, but returns + /// are not passed on the stack, so the type adjustment must not be applied. + bool IsReturn; bool IsTailCall; /// For tail calls, the byte offset of the call's argument area from the @@ -381,7 +446,8 @@ splitToValueTypes(CurArgInfo, SplitArgs, DL, CC); } - OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFn, AssignFn); + OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFn, AssignFn, + /*IsReturn*/ true); Success = handleAssignments(MIRBuilder, SplitArgs, Handler, CC, F.isVarArg()); } @@ -885,7 +951,8 @@ // Do the actual argument marshalling. OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed, - AssignFnVarArg, true, FPDiff); + AssignFnVarArg, /*IsReturn*/ false, + /*IsTailCall*/ true, FPDiff); if (!handleAssignments(MIRBuilder, OutArgs, Handler, CalleeCC, Info.IsVarArg)) return false; @@ -997,7 +1064,7 @@ // Do the actual argument marshalling.
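// A sketch of how IsReturn gates the small-type hack (assuming the AArch64
// handlers above): outgoing call arguments use /*IsReturn*/ false, so a fixed,
// non-vararg i1/i8/i16 value goes through applyStackPassedSmallTypeDAGHack and
// is assigned with ValVT = LocVT = i8 (or i16); lowerReturn instead constructs
// its handler with /*IsReturn*/ true, so an i8 return value keeps the
// DAG-legalized types (typically ValVT = i8, LocVT = i32 under the usual
// promotion).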
OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed, - AssignFnVarArg, false); + AssignFnVarArg, /*IsReturn*/ false); if (!handleAssignments(MIRBuilder, OutArgs, Handler, Info.CallConv, Info.IsVarArg)) return false; Index: llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -79,10 +79,9 @@ MIB.addUse(PhysReg, RegState::Implicit); } - bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, + bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - const CallLowering::ArgInfo &Info, - ISD::ArgFlagsTy Flags, + const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags, CCState &State) override { return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State); } Index: llvm/lib/Target/ARM/ARMCallLowering.cpp =================================================================== --- llvm/lib/Target/ARM/ARMCallLowering.cpp +++ llvm/lib/Target/ARM/ARMCallLowering.cpp @@ -169,7 +169,7 @@ return 1; } - bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, + bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags, CCState &State) override { Index: llvm/lib/Target/X86/X86CallLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86CallLowering.cpp +++ llvm/lib/Target/X86/X86CallLowering.cpp @@ -138,7 +138,7 @@ MIRBuilder.buildStore(ExtReg, Addr, *MMO); } - bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, + bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags, CCState &State) override { Index: llvm/test/CodeGen/AArch64/GlobalISel/arm64-callingconv-ios.ll =================================================================== --- llvm/test/CodeGen/AArch64/GlobalISel/arm64-callingconv-ios.ll +++ llvm/test/CodeGen/AArch64/GlobalISel/arm64-callingconv-ios.ll @@ -20,19 +20,21 @@ ; CHECK: $w0 = COPY [[C]](s32) ; CHECK: $d0 = COPY [[C1]](s64) ; CHECK: $x1 = COPY [[C2]](s64) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C3]](s8) ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $sp ; CHECK: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C8]](s64) - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s8) - ; CHECK: G_STORE [[ANYEXT]](s64), [[PTR_ADD]](p0) :: (store 8 into stack, align 1) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[ANYEXT]](s32) + ; CHECK: G_STORE [[ANYEXT1]](s64), [[PTR_ADD]](p0) :: (store 8 into stack, align 1) + ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[C4]](s16) ; CHECK: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C9]](s64) - ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C4]](s16) - ; CHECK: G_STORE [[ANYEXT1]](s64), [[PTR_ADD1]](p0) :: (store 8 into stack + 8, align 1) + ; CHECK: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[ANYEXT2]](s32) + ; CHECK: G_STORE [[ANYEXT3]](s64), [[PTR_ADD1]](p0) :: (store 8 into stack + 8, align 1) ; CHECK: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C10]](s64) - ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C5]](s32) - ; CHECK: G_STORE [[ANYEXT2]](s64), [[PTR_ADD2]](p0) :: (store 8 into stack + 16, align 1) + ; CHECK: [[ANYEXT4:%[0-9]+]]:_(s64) 
= G_ANYEXT [[C5]](s32) + ; CHECK: G_STORE [[ANYEXT4]](s64), [[PTR_ADD2]](p0) :: (store 8 into stack + 16, align 1) ; CHECK: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 24 ; CHECK: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C11]](s64) ; CHECK: G_STORE [[C6]](s32), [[PTR_ADD3]](p0) :: (store 4 into stack + 24, align 1) Index: llvm/test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll =================================================================== --- llvm/test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll +++ llvm/test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll @@ -224,3 +224,63 @@ ret i32 %conv } +define void @arg_v2i64(<2 x i64> %arg) { + ; CHECK-LABEL: name: arg_v2i64 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $q0 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF + ; CHECK: G_STORE [[COPY]](<2 x s64>), [[DEF]](p0) :: (store 16 into `<2 x i64>* undef`) + ; CHECK: RET_ReallyLR + store <2 x i64> %arg, <2 x i64>* undef + ret void +} + +define void @arg_v8i64(<8 x i64> %arg) { + ; CHECK-LABEL: name: arg_v8i64 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $q0, $q1, $q2, $q3 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 + ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 + ; CHECK: [[COPY3:%[0-9]+]]:_(<2 x s64>) = COPY $q3 + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s64>) = G_CONCAT_VECTORS [[COPY]](<2 x s64>), [[COPY1]](<2 x s64>), [[COPY2]](<2 x s64>), [[COPY3]](<2 x s64>) + ; CHECK: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF + ; CHECK: G_STORE [[CONCAT_VECTORS]](<8 x s64>), [[DEF]](p0) :: (store 64 into `<8 x i64>* undef`) + ; CHECK: RET_ReallyLR + store <8 x i64> %arg, <8 x i64>* undef + ret void +} + +define void @arg_v4f32(<4 x float> %arg) { + ; CHECK-LABEL: name: arg_v4f32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $q0 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>) + ; CHECK: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF + ; CHECK: G_STORE [[BITCAST]](<4 x s32>), [[DEF]](p0) :: (store 16 into `<4 x float>* undef`) + ; CHECK: RET_ReallyLR + store <4 x float> %arg, <4 x float>* undef + ret void +} + +define void @ret_arg_v16f32(<16 x float> %arg) { + ; CHECK-LABEL: name: ret_arg_v16f32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $q0, $q1, $q2, $q3 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 + ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 + ; CHECK: [[COPY3:%[0-9]+]]:_(<2 x s64>) = COPY $q3 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x s64>) + ; CHECK: [[BITCAST2:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY2]](<2 x s64>) + ; CHECK: [[BITCAST3:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY3]](<2 x s64>) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[BITCAST]](<4 x s32>), [[BITCAST1]](<4 x s32>), [[BITCAST2]](<4 x s32>), [[BITCAST3]](<4 x s32>) + ; CHECK: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF + ; CHECK: G_STORE [[CONCAT_VECTORS]](<16 x s32>), [[DEF]](p0) :: (store 64 into `<16 x float>* undef`) + ; CHECK: RET_ReallyLR + store <16 x float> %arg, <16 x float>* undef + ret void +} Index: llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll =================================================================== --- llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll +++ 
llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll @@ -2353,8 +2353,9 @@ define void @test_i1_arg_zext(void (i1)* %f) { ; CHECK-LABEL: name: test_i1_arg_zext ; CHECK: [[I1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true -; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[I1]](s1) -; CHECK: $w0 = COPY [[ZEXT]](s32) +; CHECK: [[ZEXT0:%[0-9]+]]:_(s8) = G_ZEXT [[I1]](s1) +; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ZEXT0]](s8) +; CHECK: $w0 = COPY [[ZEXT1]](s32) call void %f(i1 true) ret void } Index: llvm/test/CodeGen/AArch64/GlobalISel/call-translator.ll =================================================================== --- llvm/test/CodeGen/AArch64/GlobalISel/call-translator.ll +++ llvm/test/CodeGen/AArch64/GlobalISel/call-translator.ll @@ -253,7 +253,8 @@ ; CHECK-NEXT: - { id: [[SLOT:[0-9]+]], type: default, offset: 0, size: 1, alignment: 16, stack-id: default, ; CHECK-NEXT: isImmutable: true, ; CHECK: [[ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[SLOT]] -; CHECK: {{%[0-9]+}}:_(s1) = G_LOAD [[ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[SLOT]], align 16) +; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[SLOT]], align 16) +; CHECK-NEXT: {{%[0-9]+}}:_(s1) = G_TRUNC [[LOAD]] define void @test_mem_i1([8 x i64], i1 %in) { ret void } Index: llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-reductions.ll =================================================================== --- llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-reductions.ll +++ llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-reductions.ll @@ -9,8 +9,9 @@ ; CHECK: bb.1 (%ir-block.0): ; CHECK: liveins: $q1, $s0 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 - ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1 - ; CHECK: [[VECREDUCE_SEQ_FADD:%[0-9]+]]:_(s32) = G_VECREDUCE_SEQ_FADD [[COPY]](s32), [[COPY1]](<4 x s32>) + ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x s64>) + ; CHECK: [[VECREDUCE_SEQ_FADD:%[0-9]+]]:_(s32) = G_VECREDUCE_SEQ_FADD [[COPY]](s32), [[BITCAST]](<4 x s32>) ; CHECK: $s0 = COPY [[VECREDUCE_SEQ_FADD]](s32) ; CHECK: RET_ReallyLR implicit $s0 %res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %vec) @@ -22,8 +23,9 @@ ; CHECK: bb.1 (%ir-block.0): ; CHECK: liveins: $q1, $s0 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 - ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1 - ; CHECK: [[VECREDUCE_FADD:%[0-9]+]]:_(s32) = reassoc G_VECREDUCE_FADD [[COPY1]](<4 x s32>) + ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x s64>) + ; CHECK: [[VECREDUCE_FADD:%[0-9]+]]:_(s32) = reassoc G_VECREDUCE_FADD [[BITCAST]](<4 x s32>) ; CHECK: [[FADD:%[0-9]+]]:_(s32) = reassoc G_FADD [[COPY]], [[VECREDUCE_FADD]] ; CHECK: $s0 = COPY [[FADD]](s32) ; CHECK: RET_ReallyLR implicit $s0 @@ -69,8 +71,9 @@ ; CHECK-LABEL: name: fmax ; CHECK: bb.1 (%ir-block.0): ; CHECK: liveins: $q0 - ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 - ; CHECK: [[VECREDUCE_FMAX:%[0-9]+]]:_(s32) = G_VECREDUCE_FMAX [[COPY]](<4 x s32>) + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>) + ; CHECK: [[VECREDUCE_FMAX:%[0-9]+]]:_(s32) = G_VECREDUCE_FMAX [[BITCAST]](<4 x s32>) ; CHECK: $s0 = COPY [[VECREDUCE_FMAX]](s32) ; CHECK: RET_ReallyLR implicit $s0 %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %vec) @@ -81,8 +84,9 @@ ; CHECK-LABEL: name: fmin ; CHECK: bb.1 
(%ir-block.0): ; CHECK: liveins: $q0 - ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 - ; CHECK: [[VECREDUCE_FMIN:%[0-9]+]]:_(s32) = G_VECREDUCE_FMIN [[COPY]](<4 x s32>) + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>) + ; CHECK: [[VECREDUCE_FMIN:%[0-9]+]]:_(s32) = G_VECREDUCE_FMIN [[BITCAST]](<4 x s32>) ; CHECK: $s0 = COPY [[VECREDUCE_FMIN]](s32) ; CHECK: RET_ReallyLR implicit $s0 %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %vec) @@ -93,8 +97,9 @@ ; CHECK-LABEL: name: fmin_nnan ; CHECK: bb.1 (%ir-block.0): ; CHECK: liveins: $q0 - ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 - ; CHECK: [[VECREDUCE_FMIN:%[0-9]+]]:_(s32) = nnan G_VECREDUCE_FMIN [[COPY]](<4 x s32>) + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>) + ; CHECK: [[VECREDUCE_FMIN:%[0-9]+]]:_(s32) = nnan G_VECREDUCE_FMIN [[BITCAST]](<4 x s32>) ; CHECK: $s0 = COPY [[VECREDUCE_FMIN]](s32) ; CHECK: RET_ReallyLR implicit $s0 %res = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> %vec) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll @@ -429,8 +429,15 @@ define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_xor_b32 s0, s3, -1 -; GFX6-NEXT: s_and_b32 s0, s2, s0 +; GFX6-NEXT: s_mov_b32 s1, 0xffff +; GFX6-NEXT: s_and_b32 s2, s2, s1 +; GFX6-NEXT: s_lshl_b32 s0, s3, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_lshl_b32 s2, s5, 16 +; GFX6-NEXT: s_and_b32 s1, s4, s1 +; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_xor_b32 s1, s1, -1 +; GFX6-NEXT: s_and_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_andn2_v2i16: @@ -451,8 +458,15 @@ define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v2i16_commute: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_xor_b32 s0, s3, -1 -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_mov_b32 s1, 0xffff +; GFX6-NEXT: s_and_b32 s2, s2, s1 +; GFX6-NEXT: s_lshl_b32 s0, s3, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_lshl_b32 s2, s5, 16 +; GFX6-NEXT: s_and_b32 s1, s4, s1 +; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_xor_b32 s1, s1, -1 +; GFX6-NEXT: s_and_b32 s0, s1, s0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_andn2_v2i16_commute: @@ -473,8 +487,15 @@ define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v2i16_multi_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_xor_b32 s1, s3, -1 -; GFX6-NEXT: s_and_b32 s0, s2, s1 +; GFX6-NEXT: s_mov_b32 s1, 0xffff +; GFX6-NEXT: s_and_b32 s2, s2, s1 +; GFX6-NEXT: s_lshl_b32 s0, s3, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_lshl_b32 s2, s5, 16 +; GFX6-NEXT: s_and_b32 s1, s4, s1 +; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_xor_b32 s1, s1, -1 +; GFX6-NEXT: s_and_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_andn2_v2i16_multi_use: @@ -501,9 +522,19 @@ define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) { ; GFX6-LABEL: s_andn2_v2i16_multi_foldable_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_xor_b32 s1, s4, -1 -; GFX6-NEXT: s_and_b32 s0, s2, s1 -; 
GFX6-NEXT: s_and_b32 s1, s3, s1 +; GFX6-NEXT: s_mov_b32 s1, 0xffff +; GFX6-NEXT: s_lshl_b32 s0, s3, 16 +; GFX6-NEXT: s_and_b32 s2, s2, s1 +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s3, s4, s1 +; GFX6-NEXT: s_lshl_b32 s2, s5, 16 +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_lshl_b32 s3, s7, 16 +; GFX6-NEXT: s_and_b32 s1, s6, s1 +; GFX6-NEXT: s_or_b32 s1, s3, s1 +; GFX6-NEXT: s_xor_b32 s1, s1, -1 +; GFX6-NEXT: s_and_b32 s0, s0, s1 +; GFX6-NEXT: s_and_b32 s1, s2, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_andn2_v2i16_multi_foldable_use: @@ -529,12 +560,27 @@ } define <2 x i16> @v_andn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) { -; GCN-LABEL: v_andn2_v2i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_xor_b32_e32 v1, -1, v1 -; GCN-NEXT: v_and_b32_e32 v0, v0, v1 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: v_andn2_v2i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_andn2_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_andn2_v2i16: ; GFX10: ; %bb.0: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -723,17 +723,12 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, v1, v0 -; GFX6-NEXT: v_bfe_i32 v1, v2, 0, 16 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, v3, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, v2, v0 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, v2, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ashr_v2i16: @@ -764,16 +759,10 @@ ; GFX6-LABEL: v_ashr_v2i16_15: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 15, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ashr_v2i16_15: @@ -805,13 +794,12 @@ ; GFX6-LABEL: s_ashr_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: s_lshr_b32 s2, s0, 16 -; 
GFX6-NEXT: s_lshr_b32 s3, s1, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s4 +; GFX6-NEXT: s_and_b32 s2, s2, s4 ; GFX6-NEXT: s_sext_i32_i16 s0, s0 -; GFX6-NEXT: s_ashr_i32 s0, s0, s1 -; GFX6-NEXT: s_sext_i32_i16 s1, s2 -; GFX6-NEXT: s_ashr_i32 s1, s1, s3 +; GFX6-NEXT: s_ashr_i32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s2, s3, s4 +; GFX6-NEXT: s_sext_i32_i16 s1, s1 +; GFX6-NEXT: s_ashr_i32 s1, s1, s2 ; GFX6-NEXT: s_and_b32 s1, s1, s4 ; GFX6-NEXT: s_and_b32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 @@ -863,11 +851,10 @@ ; GFX6-LABEL: ashr_v2i16_sv: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s2, 0xffff -; GFX6-NEXT: s_lshr_b32 s1, s0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: v_ashr_i32_e32 v0, s0, v0 +; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 ; GFX6-NEXT: s_sext_i32_i16 s0, s1 ; GFX6-NEXT: v_ashr_i32_e32 v1, s0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 @@ -902,14 +889,13 @@ define amdgpu_ps float @ashr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) { ; GFX6-LABEL: ashr_v2i16_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX6-NEXT: s_lshr_b32 s1, s0, 16 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: s_mov_b32 s2, 0xffff -; GFX6-NEXT: v_ashrrev_i32_e32 v1, s1, v1 ; GFX6-NEXT: s_and_b32 s0, s0, s2 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, s0, v0 +; GFX6-NEXT: s_and_b32 s0, s1, s2 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, s0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 ; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll @@ -459,18 +459,19 @@ define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) { ; GFX7-LABEL: s_bswap_v2i16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_lshr_b32 s1, s0, 16 -; GFX7-NEXT: s_and_b32 s3, s0, 0xffff +; GFX7-NEXT: s_mov_b32 s3, 0xffff ; GFX7-NEXT: s_lshl_b32 s2, s0, 8 -; GFX7-NEXT: s_lshl_b32 s1, s1, 8 -; GFX7-NEXT: s_lshr_b32 s0, s0, 24 -; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_lshr_b32 s3, s3, 8 +; GFX7-NEXT: s_and_b32 s0, s0, s3 +; GFX7-NEXT: s_lshr_b32 s0, s0, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s2 +; GFX7-NEXT: s_lshl_b32 s2, s1, 8 +; GFX7-NEXT: s_and_b32 s1, s1, s3 +; GFX7-NEXT: s_lshr_b32 s1, s1, 8 +; GFX7-NEXT: s_or_b32 s1, s1, s2 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX7-NEXT: s_or_b32 s2, s3, s2 -; GFX7-NEXT: s_bfe_u32 s1, s2, 0x100000 -; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s1, s0 +; GFX7-NEXT: s_lshl_b32 s1, s1, 16 +; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_bswap_v2i16: @@ -578,18 +579,15 @@ ; GFX7-LABEL: v_bswap_v2i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0 +; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v3 -; GFX7-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_bfe_u32 v1, v2, 0, 16 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_bswap_v2i16: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll @@ -67,31 +67,22 @@ define <2 x i16> @halfinsts_add_v2i16(<2 x i16> %arg0) #1 { ; CHECK-LABEL: name: halfinsts_add_v2i16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY2]], [[COPY3]] - ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) - ; CHECK: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY4]], [[COPY5]] - ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) - ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] - ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ADD1]](s32) - ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] - ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) - ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; CHECK: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; CHECK: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) - ; CHECK: [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] - ; CHECK: S_SETPC_B64_return [[COPY8]], implicit $vgpr0 + ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY3]], [[COPY4]] + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY5]], [[COPY6]] + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[ADD1]](s32) + ; CHECK: $vgpr0 = COPY [[COPY7]](s32) + ; CHECK: $vgpr1 = COPY [[COPY8]](s32) + ; CHECK: [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] + ; CHECK: S_SETPC_B64_return [[COPY9]], implicit $vgpr0, implicit $vgpr1 %add = add <2 x i16> %arg0, %arg0 ret <2 x i16> %add } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -499,15 +499,13 @@ ; GFX6-IEEE-LABEL: v_fdiv_v2f16: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 -; 
GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 -; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 +; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5 @@ -515,36 +513,30 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v4, v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v1, v0 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_bfe_u32 v1, v2, 0, 16 -; GFX6-IEEE-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_fdiv_v2f16: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v2 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5 @@ -553,30 +545,26 @@ ; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v2, v4, v3, v2 +; GFX6-FLUSH-NEXT: 
v_div_fixup_f32 v0, v4, v2, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v1, v0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_bfe_u32 v1, v2, 0, 16 -; GFX6-FLUSH-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f16: @@ -652,22 +640,16 @@ ; GFX6-LABEL: v_fdiv_v2f16_afn: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_rcp_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_rcp_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f16_afn: @@ -711,15 +693,13 @@ ; GFX6-IEEE-LABEL: v_fdiv_v2f16_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 -; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 +; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5 ; GFX6-IEEE-NEXT: v_mul_f32_e32 
v7, v6, v5 @@ -727,36 +707,30 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v4, v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v1, v0 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_bfe_u32 v1, v2, 0, 16 -; GFX6-IEEE-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_fdiv_v2f16_ulp25: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v2 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5 @@ -765,30 +739,26 @@ ; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v2, v4, v3, v2 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX6-FLUSH-NEXT: 
v_mul_f32_e32 v6, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v1, v0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_bfe_u32 v1, v2, 0, 16 -; GFX6-FLUSH-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f16_ulp25: @@ -864,13 +834,12 @@ ; GFX6-IEEE-LABEL: v_rcp_v2f16: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 -; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 @@ -878,11 +847,11 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v3, v2, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v0, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 @@ -890,24 +859,19 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_bfe_u32 v1, v2, 0, 16 -; GFX6-IEEE-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_v2f16: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s6 -; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, 
vcc, v1, v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 @@ -918,13 +882,13 @@ ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v4 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v0, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 @@ -934,12 +898,8 @@ ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v4 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-FLUSH-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rcp_v2f16: @@ -1009,13 +969,12 @@ ; GFX6-IEEE-LABEL: v_rcp_v2f16_arcp: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 -; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 @@ -1023,11 +982,11 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v3, v2, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v0, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 @@ -1035,24 +994,19 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, 
v0, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_bfe_u32 v1, v2, 0, 16 -; GFX6-IEEE-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s6 -; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 @@ -1063,13 +1017,13 @@ ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v4 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v0, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 @@ -1079,12 +1033,8 @@ ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v4 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-FLUSH-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rcp_v2f16_arcp: @@ -1154,20 +1104,15 @@ ; GFX6-LABEL: v_rcp_v2f16_arcp_afn: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, 1.0 -; GFX6-NEXT: v_rcp_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_rcp_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; 
GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rcp_v2f16_arcp_afn: @@ -1205,13 +1150,12 @@ ; GFX6-IEEE-LABEL: v_rcp_v2f16_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 -; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 @@ -1219,11 +1163,11 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v3, v2, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v0, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 @@ -1231,24 +1175,19 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_bfe_u32 v1, v2, 0, 16 -; GFX6-IEEE-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s6 -; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 @@ -1259,13 +1198,13 @@ ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-FLUSH-NEXT: 
v_div_scale_f32 v2, s[4:5], v0, v0, v4 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v0, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 @@ -1275,12 +1214,8 @@ ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v4 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-FLUSH-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rcp_v2f16_ulp25: @@ -1318,22 +1253,16 @@ ; GFX6-LABEL: v_fdiv_v2f16_afn_ulp25: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_rcp_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_rcp_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25: @@ -1377,15 +1306,13 @@ ; GFX6-IEEE-LABEL: v_fdiv_v2f16_arcp_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 -; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 +; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5 @@ -1393,36 +1320,30 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v4, v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: 
v_div_fixup_f32 v0, v4, v2, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v1, v0 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_bfe_u32 v1, v2, 0, 16 -; GFX6-IEEE-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_fdiv_v2f16_arcp_ulp25: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v2 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5 @@ -1431,30 +1352,26 @@ ; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v2, v4, v3, v2 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v1, v0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; 
GFX6-FLUSH-NEXT: v_bfe_u32 v1, v2, 0, 16 -; GFX6-FLUSH-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f16_arcp_ulp25: @@ -1530,22 +1447,16 @@ ; GFX6-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_rcp_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_rcp_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll @@ -104,23 +104,16 @@ ; GFX6-LABEL: v_fma_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5 +; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v2f16: @@ -156,24 +149,21 @@ ; GFX6-LABEL: v_fma_v2f16_fneg_lhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 ; 
GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5 +; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v2f16_fneg_lhs: @@ -211,24 +201,21 @@ ; GFX6-LABEL: v_fma_v2f16_fneg_rhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5 +; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v2f16_fneg_rhs: @@ -266,26 +253,28 @@ ; GFX6-LABEL: v_fma_v2f16_fneg_lhs_rhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v6 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v2, v2, v6 ; GFX6-NEXT: s_mov_b32 s4, 0x80008000 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX6-NEXT: v_fma_f32 v0, v0, v1, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5 +; GFX6-NEXT: v_fma_f32 v1, v2, v3, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v2f16_fneg_lhs_rhs: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll @@ -143,24 +143,18 @@ ; GFX6-LABEL: v_pow_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, 
v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_log_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_log_f32_e32 v2, v2 -; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_log_f32_e32 v0, v0 +; GFX6-NEXT: v_log_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v3 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v2, v3 ; GFX6-NEXT: v_exp_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f16: @@ -228,25 +222,23 @@ ; GFX6-LABEL: v_pow_v2f16_fneg_lhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_log_f32_e32 v0, v0 +; GFX6-NEXT: v_log_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_log_f32_e32 v2, v2 -; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v2, v3 +; GFX6-NEXT: v_log_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_exp_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_exp_f32_e32 v2, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f16_fneg_lhs: @@ -318,25 +310,23 @@ ; GFX6-LABEL: v_pow_v2f16_fneg_rhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_log_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_log_f32_e32 v2, v2 -; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_log_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v2, v3 +; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v3 ; GFX6-NEXT: v_exp_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: 
v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f16_fneg_rhs: @@ -408,8 +398,15 @@ ; GFX6-LABEL: v_pow_v2f16_fneg_lhs_rhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_mov_b32 s4, 0x80008000 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 ; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -425,10 +422,6 @@ ; GFX6-NEXT: v_exp_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f16_fneg_lhs_rhs: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -3235,25 +3235,25 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) { ; GFX6-LABEL: s_fshl_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s5, s2, 15 -; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX6-NEXT: s_lshr_b32 s3, s0, 16 -; GFX6-NEXT: s_lshr_b32 s4, s2, 16 -; GFX6-NEXT: s_andn2_b32 s2, 15, s2 -; GFX6-NEXT: s_lshl_b32 s0, s0, s5 -; GFX6-NEXT: s_and_b32 s5, s1, 0xffff -; GFX6-NEXT: s_lshr_b32 s5, s5, 1 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX6-NEXT: s_lshr_b32 s2, s5, s2 +; GFX6-NEXT: s_and_b32 s6, s4, 15 +; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_lshl_b32 s0, s0, s6 +; GFX6-NEXT: s_mov_b32 s6, 0xffff +; GFX6-NEXT: s_andn2_b32 s4, 15, s4 +; GFX6-NEXT: s_and_b32 s2, s2, s6 +; GFX6-NEXT: s_lshr_b32 s2, s2, 1 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_lshr_b32 s2, s2, s4 ; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s2, s4, 15 +; GFX6-NEXT: s_and_b32 s2, s5, 15 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX6-NEXT: s_andn2_b32 s4, 15, s4 -; GFX6-NEXT: s_lshl_b32 s2, s3, s2 -; GFX6-NEXT: s_lshr_b32 s1, s1, 17 +; GFX6-NEXT: s_lshl_b32 s1, s1, s2 +; GFX6-NEXT: s_and_b32 s2, s3, s6 +; GFX6-NEXT: s_andn2_b32 s4, 15, s5 +; GFX6-NEXT: s_lshr_b32 s2, s2, 1 ; GFX6-NEXT: s_bfe_u32 s3, s4, 0x100000 -; GFX6-NEXT: s_lshr_b32 s1, s1, s3 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_lshr_b32 s2, s2, s3 +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 @@ -3349,31 +3349,27 @@ ; GFX6-LABEL: v_fshl_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 15, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX6-NEXT: v_bfe_u32 v5, v5, 0, 16 -; GFX6-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, v5, v0 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 1, v5 -; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v5 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 15, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 15, v4 ; GFX6-NEXT: 
v_xor_b32_e32 v4, -1, v4 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 15, v5 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 17, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 ; GFX6-NEXT: v_bfe_u32 v3, v4, 0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, v3, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_v2i16: @@ -3434,24 +3430,21 @@ ; GFX6-LABEL: v_fshl_v2i16_4_8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; GFX6-NEXT: s_bfe_u32 s4, 4, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX6-NEXT: s_bfe_u32 s4, 11, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, s4, v3 -; GFX6-NEXT: s_bfe_u32 s4, 8, 0x100000 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, s4, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 17, v1 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: s_bfe_u32 s5, 11, 0x100000 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, s5, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: s_bfe_u32 s5, 8, 0x100000 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 ; GFX6-NEXT: s_bfe_u32 s4, 7, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_v2i16_4_8: @@ -3570,23 +3563,23 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX6-NEXT: s_lshr_b32 s2, s0, 16 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 -; GFX6-NEXT: s_and_b32 s0, s1, 0xffff -; GFX6-NEXT: s_lshr_b32 s0, s0, 1 +; GFX6-NEXT: s_mov_b32 s0, 0xffff +; GFX6-NEXT: s_and_b32 s2, s2, s0 +; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX6-NEXT: s_lshr_b32 s2, s2, 1 ; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_lshr_b32_e32 v0, s2, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX6-NEXT: s_and_b32 s0, s3, s0 ; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX6-NEXT: s_lshr_b32 s0, s1, 17 +; GFX6-NEXT: s_lshr_b32 s0, s0, 1 ; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; 
GFX6-NEXT: v_lshl_b32_e32 v2, s2, v2 +; GFX6-NEXT: v_lshl_b32_e32 v2, s1, v2 ; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 @@ -3660,29 +3653,29 @@ define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <2 x i16> inreg %amt) { ; GFX6-LABEL: v_fshl_v2i16_svs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX6-NEXT: s_and_b32 s4, s1, 15 -; GFX6-NEXT: s_lshr_b32 s3, s1, 16 -; GFX6-NEXT: s_andn2_b32 s1, 15, s1 +; GFX6-NEXT: s_and_b32 s4, s2, 15 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX6-NEXT: s_lshr_b32 s2, s0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, s1, v1 ; GFX6-NEXT: s_lshl_b32 s0, s0, s4 -; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: s_andn2_b32 s2, 15, s2 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s3, 15 -; GFX6-NEXT: s_andn2_b32 s1, 15, s3 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 17, v0 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX6-NEXT: s_lshl_b32 s0, s2, s0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s1, v0 -; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: s_andn2_b32 s2, 15, s3 +; GFX6-NEXT: s_lshl_b32 s0, s1, s0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, s1, v1 +; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshl_v2i16_svs: @@ -3746,22 +3739,22 @@ define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) { ; GFX6-LABEL: v_fshl_v2i16_vss: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s3, s1, 15 -; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX6-NEXT: s_lshr_b32 s2, s1, 16 -; GFX6-NEXT: s_andn2_b32 s1, 15, s1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s3, v0 -; GFX6-NEXT: s_and_b32 s3, s0, 0xffff -; GFX6-NEXT: s_lshr_b32 s3, s3, 1 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX6-NEXT: s_lshr_b32 s1, s3, s1 -; GFX6-NEXT: v_or_b32_e32 v0, s1, v0 -; GFX6-NEXT: s_and_b32 s1, s2, 15 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_and_b32 s4, s2, 15 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: s_andn2_b32 s2, 15, s2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, s1, v1 -; GFX6-NEXT: s_lshr_b32 s0, s0, 17 +; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_lshr_b32 s0, s0, 1 +; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_lshr_b32 s0, s0, s2 +; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX6-NEXT: s_and_b32 s0, s3, 15 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 +; GFX6-NEXT: s_and_b32 s0, s1, s4 +; GFX6-NEXT: s_andn2_b32 s2, 15, s3 +; GFX6-NEXT: s_lshr_b32 s0, s0, 1 ; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 ; GFX6-NEXT: s_lshr_b32 s0, s0, s1 ; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 Index: 
llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -3066,41 +3066,43 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) { ; GFX6-LABEL: s_fshr_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s5, 0xffff -; GFX6-NEXT: s_lshr_b32 s3, s0, 16 -; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 -; GFX6-NEXT: s_and_b32 s6, s1, s5 -; GFX6-NEXT: s_lshl_b32 s0, s0, s4 -; GFX6-NEXT: s_lshl_b32 s3, s3, s4 -; GFX6-NEXT: s_bfe_u32 s7, 14, 0x100000 -; GFX6-NEXT: s_lshr_b32 s4, s1, 17 -; GFX6-NEXT: s_lshr_b32 s6, s6, 1 -; GFX6-NEXT: s_lshr_b32 s4, s4, s7 -; GFX6-NEXT: s_lshr_b32 s6, s6, s7 -; GFX6-NEXT: s_or_b32 s3, s3, s4 -; GFX6-NEXT: s_lshr_b32 s4, s1, 16 -; GFX6-NEXT: s_lshl_b32 s1, s1, 1 -; GFX6-NEXT: s_xor_b32 s2, s2, -1 -; GFX6-NEXT: s_and_b32 s7, s2, 15 -; GFX6-NEXT: s_and_b32 s1, s1, s5 -; GFX6-NEXT: s_or_b32 s0, s0, s6 -; GFX6-NEXT: s_lshr_b32 s6, s2, 16 -; GFX6-NEXT: s_andn2_b32 s2, 15, s2 +; GFX6-NEXT: s_mov_b32 s6, 0xffff +; GFX6-NEXT: s_lshl_b32 s5, s5, 16 +; GFX6-NEXT: s_and_b32 s4, s4, s6 +; GFX6-NEXT: s_or_b32 s4, s5, s4 +; GFX6-NEXT: s_bfe_u32 s5, 1, 0x100000 +; GFX6-NEXT: s_and_b32 s7, s2, s6 +; GFX6-NEXT: s_lshl_b32 s0, s0, s5 +; GFX6-NEXT: s_lshl_b32 s1, s1, s5 +; GFX6-NEXT: s_and_b32 s5, s3, s6 +; GFX6-NEXT: s_lshr_b32 s7, s7, 1 +; GFX6-NEXT: s_bfe_u32 s8, 14, 0x100000 +; GFX6-NEXT: s_lshr_b32 s5, s5, 1 +; GFX6-NEXT: s_lshl_b32 s2, s2, 1 +; GFX6-NEXT: s_lshr_b32 s7, s7, s8 +; GFX6-NEXT: s_lshr_b32 s5, s5, s8 +; GFX6-NEXT: s_xor_b32 s4, s4, -1 +; GFX6-NEXT: s_and_b32 s2, s2, s6 +; GFX6-NEXT: s_or_b32 s0, s0, s7 +; GFX6-NEXT: s_and_b32 s7, s4, 15 +; GFX6-NEXT: s_or_b32 s1, s1, s5 +; GFX6-NEXT: s_lshr_b32 s5, s4, 16 +; GFX6-NEXT: s_andn2_b32 s4, 15, s4 ; GFX6-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX6-NEXT: s_lshr_b32 s1, s1, 1 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX6-NEXT: s_lshr_b32 s1, s1, s2 +; GFX6-NEXT: s_lshr_b32 s2, s2, 1 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_lshr_b32 s2, s2, s4 ; GFX6-NEXT: s_lshl_b32 s0, s0, s7 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s6, 15 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX6-NEXT: s_lshl_b32 s4, s4, 1 -; GFX6-NEXT: s_andn2_b32 s2, 15, s6 -; GFX6-NEXT: s_lshl_b32 s1, s3, s1 -; GFX6-NEXT: s_and_b32 s3, s4, s5 -; GFX6-NEXT: s_lshr_b32 s3, s3, 1 +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s2, s5, 15 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX6-NEXT: s_lshr_b32 s2, s3, s2 +; GFX6-NEXT: s_lshl_b32 s3, s3, 1 +; GFX6-NEXT: s_lshl_b32 s1, s1, s2 +; GFX6-NEXT: s_and_b32 s2, s3, s6 +; GFX6-NEXT: s_andn2_b32 s4, 15, s5 +; GFX6-NEXT: s_lshr_b32 s2, s2, 1 +; GFX6-NEXT: s_bfe_u32 s3, s4, 0x100000 +; GFX6-NEXT: s_lshr_b32 s2, s2, s3 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 @@ -3209,48 +3211,46 @@ ; GFX6-LABEL: v_fshr_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: s_mov_b32 s5, 0xffff -; GFX6-NEXT: v_and_b32_e32 v4, s5, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v5, s5, v2 ; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 1, v5 ; GFX6-NEXT: s_bfe_u32 s6, 14, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; 
GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, s6, v4 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 17, v1 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, s4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, s6, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 15, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX6-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, s6, v5 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX6-NEXT: v_and_b32_e32 v5, s5, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 1, v5 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, s6, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 15, v4 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX6-NEXT: v_and_b32_e32 v2, s5, v2 ; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 15, v5 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v5 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 -; GFX6-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, v1, v3 -; GFX6-NEXT: v_and_b32_e32 v3, s5, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 15, v5 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s5, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_bfe_u32 v3, v4, 0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshr_v2i16: @@ -3324,28 +3324,25 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_sub_i32 s4, 0, 4 ; GFX6-NEXT: s_and_b32 s6, s4, 15 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: s_mov_b32 s6, 0xffff ; GFX6-NEXT: s_xor_b32 s4, s4, -1 +; GFX6-NEXT: v_and_b32_e32 v2, s6, v2 ; GFX6-NEXT: s_sub_i32 s5, 0, 8 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, s4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2 ; GFX6-NEXT: s_and_b32 s4, s5, 15 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v2, s6, v3 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_xor_b32 s5, s5, -1 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, s4, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 17, v1 +; GFX6-NEXT: 
v_lshlrev_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 ; GFX6-NEXT: s_bfe_u32 s4, s5, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, s4, v1 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshr_v2i16_4_8: @@ -3468,42 +3465,44 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> %amt) { ; GFX6-LABEL: v_fshr_v2i16_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: s_and_b32 s5, s1, s4 -; GFX6-NEXT: s_lshr_b32 s2, s0, 16 -; GFX6-NEXT: s_bfe_u32 s3, 1, 0x100000 +; GFX6-NEXT: s_mov_b32 s5, 0xffff +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: s_and_b32 s6, s2, s5 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: s_lshl_b32 s0, s0, s3 -; GFX6-NEXT: s_lshl_b32 s2, s2, s3 -; GFX6-NEXT: s_lshr_b32 s5, s5, 1 -; GFX6-NEXT: s_bfe_u32 s6, 14, 0x100000 -; GFX6-NEXT: s_lshr_b32 s3, s1, 17 +; GFX6-NEXT: s_lshr_b32 s6, s6, 1 +; GFX6-NEXT: s_bfe_u32 s7, 14, 0x100000 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX6-NEXT: s_lshr_b32 s5, s5, s6 -; GFX6-NEXT: s_lshr_b32 s3, s3, s6 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_lshr_b32 s3, s1, 16 +; GFX6-NEXT: s_lshl_b32 s0, s0, s4 +; GFX6-NEXT: s_lshr_b32 s6, s6, s7 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: s_or_b32 s0, s0, s5 +; GFX6-NEXT: s_or_b32 s0, s0, s6 ; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX6-NEXT: s_lshl_b32 s1, s1, 1 +; GFX6-NEXT: s_lshl_b32 s2, s2, 1 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 -; GFX6-NEXT: s_and_b32 s0, s1, s4 +; GFX6-NEXT: s_and_b32 s0, s2, s5 ; GFX6-NEXT: s_lshr_b32 s0, s0, 1 ; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 +; GFX6-NEXT: s_lshl_b32 s1, s1, s4 +; GFX6-NEXT: s_and_b32 s4, s3, s5 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 +; GFX6-NEXT: s_lshr_b32 s4, s4, 1 ; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX6-NEXT: s_lshl_b32 s3, s3, 1 ; GFX6-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX6-NEXT: s_and_b32 s0, s3, s4 +; GFX6-NEXT: s_and_b32 s0, s3, s5 +; GFX6-NEXT: s_lshr_b32 s4, s4, s7 +; GFX6-NEXT: s_or_b32 s1, s1, s4 ; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 ; GFX6-NEXT: s_lshr_b32 s0, s0, 1 ; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_lshl_b32_e32 v2, s2, v2 +; GFX6-NEXT: v_lshl_b32_e32 v2, s1, v2 ; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 @@ -3590,41 +3589,43 @@ ; GFX6-LABEL: v_fshr_v2i16_svs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v1, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v0 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_and_b32 s2, s2, s4 +; GFX6-NEXT: v_and_b32_e32 v3, s4, v1 +; GFX6-NEXT: s_or_b32 s2, s3, s2 ; GFX6-NEXT: s_bfe_u32 s3, 1, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 ; GFX6-NEXT: s_bfe_u32 s5, 14, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 17, v0 -; GFX6-NEXT: s_lshr_b32 s2, s0, 16 +; GFX6-NEXT: 
v_lshrrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: s_lshl_b32 s0, s0, s3 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, s5, v1 -; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 -; GFX6-NEXT: s_lshl_b32 s0, s2, s3 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, s5, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_or_b32_e32 v2, s0, v2 -; GFX6-NEXT: s_xor_b32 s0, s1, -1 +; GFX6-NEXT: s_lshl_b32 s0, s1, s3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, s5, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_or_b32_e32 v3, s0, v3 +; GFX6-NEXT: s_xor_b32 s0, s2, -1 ; GFX6-NEXT: s_and_b32 s2, s0, 15 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: s_lshr_b32 s1, s0, 16 ; GFX6-NEXT: s_andn2_b32 s0, 15, s0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, s2, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, s2, v2 ; GFX6-NEXT: s_andn2_b32 s1, 15, s1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, s0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: s_bfe_u32 s0, s1, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, s0, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, s0, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 ; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -3708,40 +3709,42 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) { ; GFX6-LABEL: v_fshr_v2i16_vss: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s3, 0xffff -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX6-NEXT: s_bfe_u32 s2, 1, 0x100000 -; GFX6-NEXT: s_and_b32 s4, s0, s3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s2, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, s2, v1 -; GFX6-NEXT: s_bfe_u32 s5, 14, 0x100000 -; GFX6-NEXT: s_lshr_b32 s2, s0, 17 -; GFX6-NEXT: s_lshr_b32 s4, s4, 1 -; GFX6-NEXT: s_lshr_b32 s2, s2, s5 -; GFX6-NEXT: s_lshr_b32 s4, s4, s5 -; GFX6-NEXT: v_or_b32_e32 v1, s2, v1 -; GFX6-NEXT: s_lshr_b32 s2, s0, 16 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_and_b32 s2, s2, s4 +; GFX6-NEXT: s_or_b32 s2, s3, s2 +; GFX6-NEXT: s_bfe_u32 s3, 1, 0x100000 +; GFX6-NEXT: s_and_b32 s5, s0, s4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s3, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s3, v1 +; GFX6-NEXT: s_and_b32 s3, s1, s4 +; GFX6-NEXT: s_lshr_b32 s5, s5, 1 +; GFX6-NEXT: s_bfe_u32 s6, 14, 0x100000 +; GFX6-NEXT: s_lshr_b32 s3, s3, 1 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: s_xor_b32 s1, s1, -1 -; GFX6-NEXT: s_and_b32 s5, s1, 15 -; GFX6-NEXT: s_and_b32 s0, s0, s3 -; GFX6-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX6-NEXT: s_lshr_b32 s4, s1, 16 -; GFX6-NEXT: s_andn2_b32 s1, 15, s1 +; GFX6-NEXT: s_lshr_b32 s5, s5, s6 +; GFX6-NEXT: s_lshr_b32 s3, s3, s6 +; GFX6-NEXT: s_xor_b32 s2, s2, -1 +; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: v_or_b32_e32 v0, s5, v0 +; GFX6-NEXT: s_and_b32 s5, s2, 15 +; GFX6-NEXT: v_or_b32_e32 v1, s3, v1 +; GFX6-NEXT: s_lshr_b32 s3, s2, 16 +; GFX6-NEXT: s_andn2_b32 s2, 15, s2 ; GFX6-NEXT: 
s_bfe_u32 s5, s5, 0x100000
; GFX6-NEXT: s_lshr_b32 s0, s0, 1
-; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000
-; GFX6-NEXT: s_lshr_b32 s0, s0, s1
+; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000
+; GFX6-NEXT: s_lshr_b32 s0, s0, s2
; GFX6-NEXT: v_lshlrev_b32_e32 v0, s5, v0
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX6-NEXT: s_and_b32 s0, s4, 15
+; GFX6-NEXT: s_and_b32 s0, s3, 15
; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000
-; GFX6-NEXT: s_lshl_b32 s2, s2, 1
-; GFX6-NEXT: s_andn2_b32 s1, 15, s4
; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1
-; GFX6-NEXT: s_and_b32 s0, s2, s3
+; GFX6-NEXT: s_lshl_b32 s1, s1, 1
+; GFX6-NEXT: s_and_b32 s0, s1, s4
+; GFX6-NEXT: s_andn2_b32 s2, 15, s3
; GFX6-NEXT: s_lshr_b32 s0, s0, 1
-; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000
; GFX6-NEXT: s_lshr_b32 s0, s0, s1
; GFX6-NEXT: v_or_b32_e32 v1, s0, v1
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -570,8 +570,9 @@
; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32)
; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
- ; CHECK: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s8)
- ; CHECK: $vgpr0 = COPY [[SEXT]](s32)
+ ; CHECK: [[SEXT:%[0-9]+]]:_(s16) = G_SEXT [[LOAD]](s8)
+ ; CHECK: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[SEXT]](s16)
+ ; CHECK: $vgpr0 = COPY [[SEXT1]](s32)
; CHECK: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
@@ -627,8 +628,9 @@
; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32)
; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
- ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s8)
- ; CHECK: $vgpr0 = COPY [[ZEXT]](s32)
+ ; CHECK: [[ZEXT:%[0-9]+]]:_(s16) = G_ZEXT [[LOAD]](s8)
+ ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ZEXT]](s16)
+ ; CHECK: $vgpr0 = COPY [[ZEXT1]](s32)
; CHECK: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -258,7 +258,8 @@
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK: [[TRUNC:%[0-9]+]]:_(s24) = G_TRUNC [[COPY]](s32)
+ ; CHECK: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY]], 24
+ ; CHECK: [[TRUNC:%[0-9]+]]:_(s24) = G_TRUNC [[ASSERT_ZEXT]](s32)
; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK: G_STORE [[TRUNC]](s24), [[DEF]](p1) :: (store 3 into `i24 addrspace(1)* undef`, align 4, addrspace 1)
@@ -273,7 +274,8 @@
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK: [[TRUNC:%[0-9]+]]:_(s24) = G_TRUNC [[COPY]](s32)
+ ; CHECK: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY]], 24
+ ; CHECK: [[TRUNC:%[0-9]+]]:_(s24) = G_TRUNC [[ASSERT_SEXT]](s32)
; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK: G_STORE [[TRUNC]](s24), [[DEF]](p1) :: (store 3 into `i24 addrspace(1)* undef`, align 4, addrspace 1)
@@ -2105,10 +2107,11 @@
; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31
; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32)
; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
- ; CHECK: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 1 from %fixed-stack.3, align 16, addrspace 5)
+ ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 1 from %fixed-stack.3, align 16, addrspace 5)
+ ; CHECK: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[LOAD]](s32)
; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
; CHECK: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load 2 from %fixed-stack.2, align 4, addrspace 5)
- ; CHECK: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD1]](s16)
+ ; CHECK: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD1]](s16)
; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
; CHECK: [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load 2 from %fixed-stack.1, align 8, addrspace 5)
; CHECK: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
@@ -2120,8 +2123,8 @@
; CHECK: [[COPY35:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
; CHECK: [[COPY36:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
; CHECK: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store 128 into `<32 x i32> addrspace(1)* undef`, addrspace 1)
- ; CHECK: G_STORE [[LOAD]](s1), [[COPY33]](p1) :: (volatile store 1 into `i1 addrspace(1)* undef`, addrspace 1)
- ; CHECK: G_STORE [[TRUNC]](s8), [[COPY34]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1)
+ ; CHECK: G_STORE [[TRUNC]](s1), [[COPY33]](p1) :: (volatile store 1 into `i1 addrspace(1)* undef`, addrspace 1)
+ ; CHECK: G_STORE [[TRUNC1]](s8), [[COPY34]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1)
; CHECK: G_STORE [[LOAD2]](s16), [[COPY35]](p1) :: (volatile store 2 into `i16 addrspace(1)* undef`, addrspace 1)
; CHECK: G_STORE [[LOAD3]](s16), [[COPY36]](p1) :: (volatile store 2 into `half addrspace(1)* undef`, addrspace 1)
; CHECK: [[COPY37:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]]
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -738,14 +738,12 @@
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s4, 0xffff
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, v3, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v0
+; GFX6-NEXT: v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_lshr_v2i16:
@@ -776,11 +774,11 @@
; GFX6-LABEL: v_lshr_v2i16_15:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v0
+; GFX6-NEXT: s_mov_b32 s4, 0xffff
+; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 15, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 15, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_lshr_v2i16_15:
@@ -812,12 +810,12 @@
; GFX6-LABEL: s_lshr_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s4, 0xffff
-; GFX6-NEXT: s_lshr_b32 s2, s0, 16
-; GFX6-NEXT: s_lshr_b32 s3, s1, 16
-; GFX6-NEXT: s_and_b32 s1, s1, s4
+; GFX6-NEXT: s_and_b32 s2, s2, s4
; GFX6-NEXT: s_and_b32 s0, s0, s4
-; GFX6-NEXT: s_lshr_b32 s0, s0, s1
-; GFX6-NEXT: s_lshr_b32 s1, s2, s3
+; GFX6-NEXT: s_lshr_b32 s0, s0, s2
+; GFX6-NEXT: s_and_b32 s2, s3, s4
+; GFX6-NEXT: s_and_b32 s1, s1, s4
+; GFX6-NEXT: s_lshr_b32 s1, s1, s2
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
@@ -867,13 +865,13 @@
define amdgpu_ps float @lshr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) {
; GFX6-LABEL: lshr_v2i16_sv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_mov_b32 s2, 0xffff
-; GFX6-NEXT: v_lshr_b32_e32 v1, s1, v1
; GFX6-NEXT: v_and_b32_e32 v0, s2, v0
; GFX6-NEXT: s_and_b32 s0, s0, s2
; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0
+; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
+; GFX6-NEXT: s_and_b32 s0, s1, s2
+; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: ; return to shader part epilog
@@ -904,13 +902,13 @@
define amdgpu_ps float @lshr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) {
; GFX6-LABEL: lshr_v2i16_vs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_mov_b32 s2, 0xffff
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, s1, v1
; GFX6-NEXT: s_and_b32 s0, s0, s2
; GFX6-NEXT: v_and_b32_e32 v0, s2, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0
+; GFX6-NEXT: s_and_b32 s0, s1, s2
+; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, s0, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: ; return to shader part epilog
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
@@ -429,8 +429,15 @@
define amdgpu_ps i32 @s_orn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_orn2_v2i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_xor_b32 s0, s3, -1
-; GFX6-NEXT: s_or_b32 s0, s2, s0
+; GFX6-NEXT: s_mov_b32 s1, 0xffff
+; GFX6-NEXT: s_and_b32 s2, s2, s1
+; GFX6-NEXT: s_lshl_b32 s0, s3, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s2
+; GFX6-NEXT: s_lshl_b32 s2, s5, 16
+; GFX6-NEXT: s_and_b32 s1, s4, s1
+; GFX6-NEXT: s_or_b32 s1, s2, s1
+; GFX6-NEXT: s_xor_b32 s1, s1, -1
+; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_v2i16:
@@ -451,8 +458,15 @@
define amdgpu_ps i32 @s_orn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_orn2_v2i16_commute:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_xor_b32 s0, s3, -1
+; GFX6-NEXT: s_mov_b32 s1, 0xffff
+; GFX6-NEXT: s_and_b32 s2, s2, s1
+; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_or_b32 s0, s0, s2
+; GFX6-NEXT: s_lshl_b32 s2, s5, 16
+; GFX6-NEXT: s_and_b32 s1, s4, s1
+; GFX6-NEXT: s_or_b32 s1, s2, s1
+; GFX6-NEXT: s_xor_b32 s1, s1, -1
+; GFX6-NEXT: s_or_b32 s0, s1, s0
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_v2i16_commute:
@@ -473,8 +487,15 @@
define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_orn2_v2i16_multi_use:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_xor_b32 s1, s3, -1
-; GFX6-NEXT: s_or_b32 s0, s2, s1
+; GFX6-NEXT: s_mov_b32 s1, 0xffff
+; GFX6-NEXT: s_and_b32 s2, s2, s1
+; GFX6-NEXT: s_lshl_b32 s0, s3, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s2
+; GFX6-NEXT: s_lshl_b32 s2, s5, 16
+; GFX6-NEXT: s_and_b32 s1, s4, s1
+; GFX6-NEXT: s_or_b32 s1, s2, s1
+; GFX6-NEXT: s_xor_b32 s1, s1, -1
+; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_v2i16_multi_use:
@@ -501,9 +522,19 @@
define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) {
; GFX6-LABEL: s_orn2_v2i16_multi_foldable_use:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_xor_b32 s1, s4, -1
-; GFX6-NEXT: s_or_b32 s0, s2, s1
+; GFX6-NEXT: s_mov_b32 s1, 0xffff
+; GFX6-NEXT: s_lshl_b32 s0, s3, 16
+; GFX6-NEXT: s_and_b32 s2, s2, s1
+; GFX6-NEXT: s_or_b32 s0, s0, s2
+; GFX6-NEXT: s_and_b32 s3, s4, s1
+; GFX6-NEXT: s_lshl_b32 s2, s5, 16
+; GFX6-NEXT: s_or_b32 s2, s2, s3
+; GFX6-NEXT: s_lshl_b32 s3, s7, 16
+; GFX6-NEXT: s_and_b32 s1, s6, s1
; GFX6-NEXT: s_or_b32 s1, s3, s1
+; GFX6-NEXT: s_xor_b32 s1, s1, -1
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_v2i16_multi_foldable_use:
@@ -529,12 +560,27 @@
}
define <2 x i16> @v_orn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
-; GCN-LABEL: v_orn2_v2i16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_xor_b32_e32 v1, -1, v1
-; GCN-NEXT: v_or_b32_e32 v0, v0, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX6-LABEL: v_orn2_v2i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_and_b32_e32 v0, v0, v4
+; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v2, v2, v4
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_orn2_v2i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_orn2_v2i16:
; GFX10: ; %bb.0:
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
@@ -218,33 +218,23 @@
; GFX6-LABEL: v_roundeven_v2f16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_rndne_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_rndne_f32_e32 v0, v0
+; GFX6-NEXT: v_rndne_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_roundeven_v2f16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_rndne_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_rndne_f32_e32 v0, v0
+; GFX7-NEXT: v_rndne_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX7-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_roundeven_v2f16:
@@ -282,35 +272,33 @@
; GFX6-LABEL: v_roundeven_v2f16_fneg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_rndne_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_rndne_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-NEXT: v_rndne_f32_e32 v0, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT: v_rndne_f32_e32 v1, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_roundeven_v2f16_fneg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_rndne_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_rndne_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX7-NEXT: v_rndne_f32_e32 v0, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX7-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_rndne_f32_e32 v1, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_roundeven_v2f16_fneg:
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -2702,21 +2702,19 @@
; GFX6-LABEL: v_saddsat_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_brev_b32 s5, 1
; GFX6-NEXT: v_min_i32_e32 v5, 0, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5
; GFX6-NEXT: s_brev_b32 s4, -2
; GFX6-NEXT: v_max_i32_e32 v4, 0, v0
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4
-; GFX6-NEXT: v_max_i32_e32 v1, v5, v1
-; GFX6-NEXT: v_min_i32_e32 v1, v1, v4
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v5, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_min_i32_e32 v2, v2, v4
; GFX6-NEXT: v_min_i32_e32 v4, 0, v1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT: v_max_i32_e32 v3, 0, v1
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4
@@ -2724,13 +2722,8 @@
; GFX6-NEXT: v_max_i32_e32 v2, v4, v2
; GFX6-NEXT: v_min_i32_e32 v2, v2, v3
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
-; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v2i16:
@@ -2775,20 +2768,18 @@
define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: s_saddsat_v2i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshr_b32 s2, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: s_brev_b32 s5, 1
; GFX6-NEXT: s_min_i32 s7, s0, 0
-; GFX6-NEXT: s_lshr_b32 s3, s1, 16
-; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_sub_i32 s7, s5, s7
; GFX6-NEXT: s_brev_b32 s4, -2
; GFX6-NEXT: s_max_i32 s6, s0, 0
; GFX6-NEXT: s_sub_i32 s6, s4, s6
-; GFX6-NEXT: s_max_i32 s1, s7, s1
-; GFX6-NEXT: s_min_i32 s1, s1, s6
-; GFX6-NEXT: s_add_i32 s0, s0, s1
-; GFX6-NEXT: s_lshl_b32 s1, s2, 16
+; GFX6-NEXT: s_max_i32 s2, s7, s2
+; GFX6-NEXT: s_min_i32 s2, s2, s6
+; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_add_i32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s3, 16
; GFX6-NEXT: s_max_i32 s3, s1, 0
; GFX6-NEXT: s_sub_i32 s3, s4, s3
@@ -2863,11 +2854,9 @@
define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: saddsat_v2i16_sv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: s_brev_b32 s3, 1
; GFX6-NEXT: s_min_i32 s5, s0, 0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_sub_i32 s5, s3, s5
; GFX6-NEXT: s_brev_b32 s2, -2
@@ -2938,11 +2927,9 @@
define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: saddsat_v2i16_vs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_brev_b32 s3, 1
; GFX6-NEXT: v_min_i32_e32 v3, 0, v0
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v3
; GFX6-NEXT: s_brev_b32 s2, -2
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
@@ -611,9 +611,13 @@
define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) {
; GFX7-LABEL: s_shl_v2i32_zext_v2i16:
; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, 0xffff
+; GFX7-NEXT: s_lshl_b32 s1, s1, 16
+; GFX7-NEXT: s_and_b32 s0, s0, s2
+; GFX7-NEXT: s_or_b32 s0, s1, s0
; GFX7-NEXT: s_and_b32 s0, s0, 0x3fff3fff
; GFX7-NEXT: s_lshr_b32 s1, s0, 16
-; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX7-NEXT: s_and_b32 s0, s0, s2
; GFX7-NEXT: s_lshl_b32 s0, s0, 2
; GFX7-NEXT: s_lshl_b32 s1, s1, 2
; GFX7-NEXT: ; return to shader part epilog
@@ -661,9 +665,13 @@
; GFX7-LABEL: v_shl_v2i32_zext_v2i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v0, v0, v2
+; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0x3fff3fff, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_and_b32_e32 v0, v0, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -731,15 +731,10 @@
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s4, 0xffff
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, v1, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, v3, v2
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0
+; GFX6-NEXT: v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_v2i16:
@@ -770,11 +765,8 @@
; GFX6-LABEL: v_shl_v2i16_15:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 15, v0
-; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 31, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 15, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_v2i16_15:
@@ -806,11 +798,10 @@
; GFX6-LABEL: s_shl_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s4, 0xffff
-; GFX6-NEXT: s_lshr_b32 s3, s1, 16
-; GFX6-NEXT: s_lshr_b32 s2, s0, 16
-; GFX6-NEXT: s_and_b32 s1, s1, s4
-; GFX6-NEXT: s_lshl_b32 s0, s0, s1
-; GFX6-NEXT: s_lshl_b32 s1, s2, s3
+; GFX6-NEXT: s_and_b32 s2, s2, s4
+; GFX6-NEXT: s_lshl_b32 s0, s0, s2
+; GFX6-NEXT: s_and_b32 s2, s3, s4
+; GFX6-NEXT: s_lshl_b32 s1, s1, s2
; GFX6-NEXT: s_and_b32 s1, s1, s4
; GFX6-NEXT: s_and_b32 s0, s0, s4
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
@@ -856,9 +847,8 @@
define amdgpu_ps float @shl_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) {
; GFX6-LABEL: shl_v2i16_sv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_mov_b32 s2, 0xffff
+; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
; GFX6-NEXT: v_and_b32_e32 v0, s2, v0
; GFX6-NEXT: v_lshl_b32_e32 v1, s1, v1
; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0
@@ -894,12 +884,11 @@
define amdgpu_ps float @shl_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) {
; GFX6-LABEL: shl_v2i16_vs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_mov_b32 s2, 0xffff
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, s1, v1
; GFX6-NEXT: s_and_b32 s0, s0, s2
; GFX6-NEXT: v_lshlrev_b32_e32 v0, s0, v0
+; GFX6-NEXT: s_and_b32 s0, s1, s2
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1
; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
; GFX6-NEXT: v_and_b32_e32 v0, s2, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -2688,35 +2688,28 @@
; GFX6-LABEL: v_ssubsat_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_brev_b32 s4, -2
; GFX6-NEXT: v_max_i32_e32 v4, -1, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4
; GFX6-NEXT: s_brev_b32 s5, 1
; GFX6-NEXT: v_min_i32_e32 v5, -1, v0
; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5
-; GFX6-NEXT: v_max_i32_e32 v1, v4, v1
-; GFX6-NEXT: v_min_i32_e32 v1, v1, v5
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v4, v2
+; GFX6-NEXT: v_min_i32_e32 v2, v2, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT: v_max_i32_e32 v3, -1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3
; GFX6-NEXT: v_min_i32_e32 v4, -1, v1
+; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3
; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v4
; GFX6-NEXT: v_max_i32_e32 v2, v3, v2
; GFX6-NEXT: v_min_i32_e32 v2, v2, v4
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
-; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_v2i16:
@@ -2761,20 +2754,18 @@
define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: s_ssubsat_v2i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshr_b32 s2, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: s_brev_b32 s4, -2
; GFX6-NEXT: s_max_i32 s6, s0, -1
-; GFX6-NEXT: s_lshr_b32 s3, s1, 16
-; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_sub_i32 s6, s6, s4
; GFX6-NEXT: s_brev_b32 s5, 1
; GFX6-NEXT: s_min_i32 s7, s0, -1
; GFX6-NEXT: s_sub_i32 s7, s7, s5
-; GFX6-NEXT: s_max_i32 s1, s6, s1
-; GFX6-NEXT: s_min_i32 s1, s1, s7
-; GFX6-NEXT: s_sub_i32 s0, s0, s1
-; GFX6-NEXT: s_lshl_b32 s1, s2, 16
+; GFX6-NEXT: s_max_i32 s2, s6, s2
+; GFX6-NEXT: s_min_i32 s2, s2, s7
+; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_sub_i32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s3, 16
; GFX6-NEXT: s_max_i32 s3, s1, -1
; GFX6-NEXT: s_sub_i32 s3, s3, s4
@@ -2849,11 +2840,9 @@
define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: ssubsat_v2i16_sv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: s_brev_b32 s2, -2
; GFX6-NEXT: s_max_i32 s4, s0, -1
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_sub_i32 s4, s4, s2
; GFX6-NEXT: s_brev_b32 s3, 1
@@ -2924,11 +2913,9 @@
define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: ssubsat_v2i16_vs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_brev_b32 s2, -2
; GFX6-NEXT: v_max_i32_e32 v2, -1, v0
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v2
; GFX6-NEXT: s_brev_b32 s3, 1
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -1748,22 +1748,18 @@
; GFX6-LABEL: v_uaddsat_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_xor_b32_e32 v4, -1, v0
-; GFX6-NEXT: v_min_u32_e32 v1, v4, v1
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX6-NEXT: v_min_u32_e32 v2, v4, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT: v_xor_b32_e32 v3, -1, v1
; GFX6-NEXT: v_min_u32_e32 v2, v3, v2
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uaddsat_v2i16:
@@ -1795,14 +1791,12 @@
define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: s_uaddsat_v2i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshr_b32 s2, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
-; GFX6-NEXT: s_lshr_b32 s3, s1, 16
-; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_not_b32 s4, s0
-; GFX6-NEXT: s_min_u32 s1, s4, s1
-; GFX6-NEXT: s_add_i32 s0, s0, s1
-; GFX6-NEXT: s_lshl_b32 s1, s2, 16
+; GFX6-NEXT: s_min_u32 s2, s4, s2
+; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_add_i32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s3, 16
; GFX6-NEXT: s_not_b32 s3, s1
; GFX6-NEXT: s_min_u32 s2, s3, s2
@@ -1847,9 +1841,7 @@
define amdgpu_ps float @uaddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: uaddsat_v2i16_sv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_not_b32 s2, s0
; GFX6-NEXT: v_min_u32_e32 v0, s2, v0
@@ -1893,9 +1885,7 @@
define amdgpu_ps float @uaddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: uaddsat_v2i16_vs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0
; GFX6-NEXT: v_min_u32_e32 v2, s0, v2
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -1662,20 +1662,16 @@
; GFX6-LABEL: v_usubsat_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_min_u32_e32 v2, v0, v2
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_min_u32_e32 v1, v0, v1
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT: v_min_u32_e32 v2, v1, v2
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v2i16:
@@ -1707,13 +1703,11 @@
define amdgpu_ps i32 @s_usubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: s_usubsat_v2i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshr_b32 s2, s0, 16
-; GFX6-NEXT: s_lshr_b32 s3, s1, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
+; GFX6-NEXT: s_lshl_b32 s2, s2, 16
+; GFX6-NEXT: s_min_u32 s2, s0, s2
+; GFX6-NEXT: s_sub_i32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
-; GFX6-NEXT: s_min_u32 s1, s0, s1
-; GFX6-NEXT: s_sub_i32 s0, s0, s1
-; GFX6-NEXT: s_lshl_b32 s1, s2, 16
; GFX6-NEXT: s_lshl_b32 s2, s3, 16
; GFX6-NEXT: s_min_u32 s2, s1, s2
; GFX6-NEXT: s_sub_i32 s1, s1, s2
@@ -1757,8 +1751,6 @@
define amdgpu_ps float @usubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: usubsat_v2i16_sv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_min_u32_e32 v0, s0, v0
@@ -1801,8 +1793,6 @@
define amdgpu_ps float @usubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: usubsat_v2i16_vs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: v_min_u32_e32 v2, s0, v0
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
@@ -25,6 +25,13 @@
define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> inreg %b) {
; GFX7-LABEL: scalar_xnor_v2i16_one_use:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 s4, 0xffff
+; GFX7-NEXT: s_lshl_b32 s1, s1, 16
+; GFX7-NEXT: s_and_b32 s0, s0, s4
+; GFX7-NEXT: s_or_b32 s0, s1, s0
+; GFX7-NEXT: s_lshl_b32 s1, s3, 16
+; GFX7-NEXT: s_and_b32 s2, s2, s4
+; GFX7-NEXT: s_or_b32 s1, s1, s2
; GFX7-NEXT: s_xor_b32 s0, s0, s1
; GFX7-NEXT: s_xor_b32 s0, s0, -1
; GFX7-NEXT: ; return to shader part epilog
Index: llvm/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
===================================================================
--- llvm/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
+++ llvm/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
@@ -159,14 +159,15 @@
i8 signext %p4, i16 signext %p5) {
; CHECK-LABEL: name: test_stack_args_signext
; CHECK: fixedStack:
-; CHECK-DAG: id: [[P4:[0-9]]]{{.*}}offset: 0{{.*}}size: 1
-; CHECK-DAG: id: [[P5:[0-9]]]{{.*}}offset: 4{{.*}}size: 2
+; CHECK-DAG: id: [[P4:[0-9]]]{{.*}}offset: 0{{.*}}size: 4
+; CHECK-DAG: id: [[P5:[0-9]]]{{.*}}offset: 4{{.*}}size: 4
; CHECK: liveins: $r0, $r1, $r2, $r3
; CHECK: [[VREGR1:%[0-9]+]]:_(s32) = COPY $r1
; CHECK: [[VREGP1:%[0-9]+]]:_(s16) = G_TRUNC [[VREGR1]]
; CHECK: [[FIP5:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[P5]]
; CHECK: [[VREGP5EXT:%[0-9]+]]:_(s32) = G_LOAD [[FIP5]](p0){{.*}}load 4
-; CHECK: [[VREGP5:%[0-9]+]]:_(s16) = G_TRUNC [[VREGP5EXT]]
+; CHECK: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[VREGP5EXT]], 16
+; CHECK: [[VREGP5:%[0-9]+]]:_(s16) = G_TRUNC [[ASSERT_SEXT]]
; CHECK: [[SUM:%[0-9]+]]:_(s16) = G_ADD [[VREGP1]], [[VREGP5]]
; CHECK: [[SUM_EXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SUM]]
; CHECK: $r0 = COPY [[SUM_EXT]](s32)
@@ -180,14 +181,15 @@
i8 zeroext %p4, i16 zeroext %p5) {
; CHECK-LABEL: name: test_stack_args_zeroext
; CHECK: fixedStack:
-; CHECK-DAG: id: [[P4:[0-9]]]{{.*}}offset: 0{{.*}}size: 1
-; CHECK-DAG: id: [[P5:[0-9]]]{{.*}}offset: 4{{.*}}size: 2
+; CHECK-DAG: id: [[P4:[0-9]]]{{.*}}offset: 0{{.*}}size: 4
+; CHECK-DAG: id: [[P5:[0-9]]]{{.*}}offset: 4{{.*}}size: 4
; CHECK: liveins: $r0, $r1, $r2, $r3
; CHECK: [[VREGR2:%[0-9]+]]:_(s32) = COPY $r2
; CHECK: [[VREGP2:%[0-9]+]]:_(s8) = G_TRUNC [[VREGR2]]
; CHECK: [[FIP4:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[P4]]
; CHECK: [[VREGP4EXT:%[0-9]+]]:_(s32) = G_LOAD [[FIP4]](p0){{.*}}load 4
-; CHECK: [[VREGP4:%[0-9]+]]:_(s8) = G_TRUNC [[VREGP4EXT]]
+; CHECK: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[VREGP4EXT]], 8
+; CHECK: [[VREGP4:%[0-9]+]]:_(s8) = G_TRUNC [[ASSERT_ZEXT]]
; CHECK: [[SUM:%[0-9]+]]:_(s8) = G_ADD [[VREGP2]], [[VREGP4]]
; CHECK: [[SUM_EXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SUM]]
; CHECK: $r0 = COPY [[SUM_EXT]](s32)
@@ -201,14 +203,15 @@
i8 %p4, i16 %p5) {
; CHECK-LABEL: name: test_stack_args_noext
; CHECK: fixedStack:
-; CHECK-DAG: id: [[P4:[0-9]]]{{.*}}offset: 0{{.*}}size: 1
-; CHECK-DAG: id: [[P5:[0-9]]]{{.*}}offset: 4{{.*}}size: 2
+; CHECK-DAG: id: [[P4:[0-9]]]{{.*}}offset: 0, size: 4, alignment: 8,
+; CHECK-DAG: id: [[P5:[0-9]]]{{.*}}offset: 4, size: 4, alignment: 4,
; CHECK: liveins: $r0, $r1, $r2, $r3
; CHECK: [[VREGR2:%[0-9]+]]:_(s32) = COPY $r2
; CHECK: [[VREGP2:%[0-9]+]]:_(s8) = G_TRUNC [[VREGR2]]
; CHECK: [[FIP4:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[P4]]
-; CHECK: [[VREGP4:%[0-9]+]]:_(s8) = G_LOAD [[FIP4]](p0){{.*}}load 1
-; CHECK: [[SUM:%[0-9]+]]:_(s8) = G_ADD [[VREGP2]], [[VREGP4]]
+; CHECK: [[VREGP4:%[0-9]+]]:_(s32) = G_LOAD [[FIP4]](p0){{.*}}load 4
+; CHECK: [[TRUNC_VREGP4:%[0-9]+]]:_(s8) = G_TRUNC [[VREGP4]]
+; CHECK: [[SUM:%[0-9]+]]:_(s8) = G_ADD [[VREGP2]], [[TRUNC_VREGP4]]
; CHECK: [[SUM_EXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SUM]]
; CHECK: $r0 = COPY [[SUM_EXT]](s32)
; CHECK: BX_RET 14 /* CC::al */, $noreg, implicit $r0
@@ -221,12 +224,13 @@
i8 signext %p4, i16 signext %p5) {
; CHECK-LABEL: name: test_stack_args_extend_the_extended
; CHECK: fixedStack:
-; CHECK-DAG: id: [[P4:[0-9]]]{{.*}}offset: 0{{.*}}size: 1
-; CHECK-DAG: id: [[P5:[0-9]]]{{.*}}offset: 4{{.*}}size: 2
+; CHECK-DAG: id: [[P4:[0-9]]]{{.*}}offset: 0{{.*}}size: 4, alignment: 8
+; CHECK-DAG: id: [[P5:[0-9]]]{{.*}}offset: 4{{.*}}size: 4, alignment: 4
; CHECK: liveins: $r0, $r1, $r2, $r3
; CHECK: [[FIP5:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[P5]]
; CHECK: [[VREGP5SEXT:%[0-9]+]]:_(s32) = G_LOAD [[FIP5]](p0){{.*}}load 4
-; CHECK: [[VREGP5:%[0-9]+]]:_(s16) = G_TRUNC [[VREGP5SEXT]]
+; CHECK: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[VREGP5SEXT]], 16
+; CHECK: [[VREGP5:%[0-9]+]]:_(s16) = G_TRUNC [[ASSERT_SEXT]]
; CHECK: [[VREGP5ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[VREGP5]]
; CHECK: $r0 = COPY [[VREGP5ZEXT]]
; CHECK: BX_RET 14 /* CC::al */, $noreg, implicit $r0
Index: llvm/test/CodeGen/ARM/GlobalISel/arm-isel.ll
===================================================================
--- llvm/test/CodeGen/ARM/GlobalISel/arm-isel.ll
+++ llvm/test/CodeGen/ARM/GlobalISel/arm-isel.ll
@@ -252,7 +252,7 @@
define i16 @test_stack_args_mixed(i32 %p0, i16 %p1, i8 %p2, i1 %p3, i8 %p4, i16 %p5) {
; CHECK-LABEL: test_stack_args_mixed:
; CHECK: add [[P5ADDR:r[0-9]+]], sp, #4
-; CHECK: ldrh [[P5:r[0-9]+]], {{.*}}[[P5ADDR]]
+; CHECK: ldr [[P5:r[0-9]+]], {{.*}}[[P5ADDR]]
; CHECK: add r0, r1, [[P5]]
; CHECK: bx lr
entry:
@@ -285,7 +285,7 @@
define i8 @test_stack_args_noext(i32 %p0, i16 %p1, i8 %p2, i1 %p3, i8 %p4) {
; CHECK-LABEL: test_stack_args_noext:
; CHECK: mov [[P4ADDR:r[0-9]+]], sp
-; CHECK: ldrb [[P4:r[0-9]+]], {{.*}}[[P4ADDR]]
+; CHECK: ldr [[P4:r[0-9]+]], {{.*}}[[P4ADDR]]
; CHECK: add r0, r2, [[P4]]
; CHECK: bx lr
entry:
Index: llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-vfp4.mir
===================================================================
--- llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-vfp4.mir
+++ llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-vfp4.mir
@@ -96,10 +96,12 @@
; SOFT-ABI: [[OFF1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; SOFT-ABI: [[FI1:%[0-9]+]]:_(p0) = G_PTR_ADD [[SP1]], [[OFF1]](s32)
+ ; FIXME: Should avoid multiple copies from $sp
; FIXME: This ought to be align 8 but ARM's call lowering hardcodes it to 1
; SOFT-ABI: G_STORE [[Y0]](s32), [[FI1]](p0){{.*}}store 4 into stack, align 1)
+ ; SOFT-ABI: [[SP2:%[0-9]+]]:_(p0) = COPY $sp
; SOFT-ABI: [[OFF2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
- ; SOFT-ABI: [[FI2:%[0-9]+]]:_(p0) = G_PTR_ADD [[FI1]], [[OFF2]](s32)
+ ; SOFT-ABI: [[FI2:%[0-9]+]]:_(p0) = G_PTR_ADD [[SP2]], [[OFF2]](s32)
; SOFT-ABI: G_STORE [[Y1]](s32), [[FI2]](p0){{.*}}store 4 into stack + 4, align 1)
; SOFT-ABI: BL &fma, {{.*}}, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0, implicit-def $r1
; SOFT-ABI-DAG: [[R0:%[0-9]+]]:_(s32) = COPY $r0
Index: llvm/test/CodeGen/ARM/GlobalISel/arm-param-lowering.ll
===================================================================
--- llvm/test/CodeGen/ARM/GlobalISel/arm-param-lowering.ll
+++ llvm/test/CodeGen/ARM/GlobalISel/arm-param-lowering.ll
@@ -73,35 +73,36 @@
; CHECK: $r2 = COPY [[SEXTB]]
; CHECK: [[ZEXTB:%[0-9]+]]:_(s32) = G_ZEXT [[BVREG]](s16)
; CHECK: $r3 = COPY [[ZEXTB]]
+; CHECK: [[SEXTA2:%[0-9]+]]:_(s32) = G_SEXT [[AVREG]]
; CHECK: [[SP1:%[0-9]+]]:_(p0) = COPY $sp
; CHECK: [[OFF1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK: [[FI1:%[0-9]+]]:_(p0) = G_PTR_ADD [[SP1]], [[OFF1]](s32)
-; CHECK: [[SEXTA2:%[0-9]+]]:_(s32) = G_SEXT [[AVREG]]
; CHECK: G_STORE [[SEXTA2]](s32), [[FI1]](p0){{.*}}store 4
+; CHECK: [[ZEXTA2:%[0-9]+]]:_(s32) = G_ZEXT [[AVREG]]
; CHECK: [[SP2:%[0-9]+]]:_(p0) = COPY $sp
; CHECK: [[OFF2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
; CHECK: [[FI2:%[0-9]+]]:_(p0) = G_PTR_ADD [[SP2]], [[OFF2]](s32)
-; CHECK: [[ZEXTA2:%[0-9]+]]:_(s32) = G_ZEXT [[AVREG]]
; CHECK: G_STORE [[ZEXTA2]](s32), [[FI2]](p0){{.*}}store 4
+; CHECK: [[SEXTB2:%[0-9]+]]:_(s32) = G_SEXT [[BVREG]]
; CHECK: [[SP3:%[0-9]+]]:_(p0) = COPY $sp
; CHECK: [[OFF3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
; CHECK: [[FI3:%[0-9]+]]:_(p0) = G_PTR_ADD [[SP3]], [[OFF3]](s32)
-; CHECK: [[SEXTB2:%[0-9]+]]:_(s32) = G_SEXT [[BVREG]]
; CHECK: G_STORE [[SEXTB2]](s32), [[FI3]](p0){{.*}}store 4
+; CHECK: [[ZEXTB2:%[0-9]+]]:_(s32) = G_ZEXT [[BVREG]]
; CHECK: [[SP4:%[0-9]+]]:_(p0) = COPY $sp
; CHECK: [[OFF4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
; CHECK: [[FI4:%[0-9]+]]:_(p0) = G_PTR_ADD [[SP4]], [[OFF4]](s32)
-; CHECK: [[ZEXTB2:%[0-9]+]]:_(s32) = G_ZEXT [[BVREG]]
; CHECK: G_STORE [[ZEXTB2]](s32), [[FI4]](p0){{.*}}store 4
+; CHECK: [[ZEXTC:%[0-9]+]]:_(s32) = G_ZEXT [[CVREG]]
; CHECK: [[SP5:%[0-9]+]]:_(p0) = COPY $sp
; CHECK: [[OFF5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; CHECK: [[FI5:%[0-9]+]]:_(p0) = G_PTR_ADD [[SP5]], [[OFF5]](s32)
-; CHECK: [[ZEXTC:%[0-9]+]]:_(s32) = G_ZEXT [[CVREG]]
; CHECK: G_STORE [[ZEXTC]](s32), [[FI5]](p0){{.*}}store 4
; ARM: BL @ext_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0
; THUMB: tBL 14 /* CC::al */, $noreg, @ext_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0
; CHECK: [[R0VREG:%[0-9]+]]:_(s32) = COPY $r0
-; CHECK: [[RVREG:%[0-9]+]]:_(s16) = G_TRUNC [[R0VREG]]
+; CHECK: [[R0VREG_ASSERT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[R0VREG]], 16
+; CHECK: [[RVREG:%[0-9]+]]:_(s16) = G_TRUNC [[R0VREG_ASSERT]]
; CHECK: ADJCALLSTACKUP 20, 0, 14 /* CC::al */, $noreg, implicit-def $sp, implicit $sp
; CHECK: [[RExtVREG:%[0-9]+]]:_(s32) = G_SEXT [[RVREG]]
; CHECK: $r0 = COPY [[RExtVREG]]
Index: llvm/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll
===================================================================
--- llvm/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll
+++ llvm/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll
@@ -49,7 +49,7 @@
}
define half @test_half(half %a, half %b) {
-; CHECK: remark: {{.*}} unable to lower arguments: half (half, half)* (in function: test_half)
+; CHECK: remark: {{.*}} unable to legalize instruction: %{{[0-9]+}}:_(s16) = G_FADD %{{[0-9]+}}:_, %{{[0-9]+}}:_ (in function: test_half)
; CHECK-LABEL: warning: Instruction selection used fallback path for test_half
%res = fadd half %a, %b
ret half %res
Index: llvm/test/CodeGen/X86/GlobalISel/add-scalar.ll
===================================================================
--- llvm/test/CodeGen/X86/GlobalISel/add-scalar.ll
+++ llvm/test/CodeGen/X86/GlobalISel/add-scalar.ll
@@ -47,8 +47,10 @@
;
; X86-LABEL: test_add_i16:
; X86: # %bb.0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: addw {{[0-9]+}}(%esp), %ax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addw %cx, %ax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
%ret = add i16 %arg1, %arg2
ret i16 %ret
@@ -65,8 +67,10 @@
;
; X86-LABEL: test_add_i8:
; X86: # %bb.0:
-; X86-NEXT: movb {{[0-9]+}}(%esp), %al
-; X86-NEXT: addb {{[0-9]+}}(%esp), %al
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addb %cl, %al
+; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
%ret = add i8 %arg1, %arg2
ret i8 %ret
Index: llvm/test/CodeGen/X86/GlobalISel/callingconv.ll
===================================================================
--- llvm/test/CodeGen/X86/GlobalISel/callingconv.ll
+++ llvm/test/CodeGen/X86/GlobalISel/callingconv.ll
@@ -32,7 +32,8 @@
define i8 @test_arg_i8(i8 %a) {
; X32-LABEL: test_arg_i8:
; X32: # %bb.0:
-; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: # kill: def $al killed $al killed $eax
; X32-NEXT: retl
;
; X64-LABEL: test_arg_i8:
@@ -46,7 +47,8 @@
define i16 @test_arg_i16(i16 %a) {
; X32-LABEL: test_arg_i16:
; X32: # %bb.0:
-; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: # kill: def $ax killed $ax killed $eax
; X32-NEXT: retl
;
; X64-LABEL: test_arg_i16:
Index: llvm/test/CodeGen/X86/GlobalISel/ext.ll
===================================================================
--- llvm/test/CodeGen/X86/GlobalISel/ext.ll
+++ llvm/test/CodeGen/X86/GlobalISel/ext.ll
@@ -1,3 +1,5 @@
+; XFAIL: *
+; FIXME: This test is broken due to https://bugs.llvm.org/show_bug.cgi?id=50035
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64
; RUN: llc -mtriple=i386-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X32
Index: llvm/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll
===================================================================
--- llvm/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll
+++ llvm/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll
@@ -10,28 +10,36 @@
; X86-LABEL: name: test_i8_args_8
; X86: bb.1.entry:
; X86: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.7
- ; X86: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load 1 from %fixed-stack.7, align 16)
+ ; X86: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load 1 from %fixed-stack.7, align 16)
+ ; X86: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD]](s32)
; X86: [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.6
- ; X86: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX1]](p0) :: (invariant load 1 from %fixed-stack.6, align 4)
+ ; X86: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p0) :: (invariant load 1 from %fixed-stack.6, align 4)
+ ; X86: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD1]](s32)
; X86: [[FRAME_INDEX2:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.5
- ; X86: [[LOAD2:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX2]](p0) :: (invariant load 1 from %fixed-stack.5, align 8)
+ ; X86: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p0) :: (invariant load 1 from %fixed-stack.5, align 8)
+ ; X86: [[TRUNC2:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD2]](s32)
; X86: [[FRAME_INDEX3:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.4
- ; X86: [[LOAD3:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX3]](p0) :: (invariant load 1 from %fixed-stack.4, align 4)
+ ; X86: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX3]](p0) :: (invariant load 1 from %fixed-stack.4, align 4)
+ ; X86: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD3]](s32)
; X86: [[FRAME_INDEX4:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.3
- ; X86: [[LOAD4:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX4]](p0) :: (invariant load 1 from %fixed-stack.3, align 16)
+ ; X86: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX4]](p0) :: (invariant load 1 from %fixed-stack.3, align 16)
+ ; X86: [[TRUNC4:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD4]](s32)
; X86: [[FRAME_INDEX5:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.2
- ; X86: [[LOAD5:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX5]](p0) :: (invariant load 1 from %fixed-stack.2, align 4)
+ ; X86: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX5]](p0) :: (invariant load 1 from %fixed-stack.2, align 4)
+ ; X86: [[TRUNC5:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD5]](s32)
; X86: [[FRAME_INDEX6:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.1
- ; X86: [[LOAD6:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX6]](p0) :: (invariant load 1 from %fixed-stack.1, align 8)
+ ; X86: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX6]](p0) :: (invariant load 1 from %fixed-stack.1, align 8)
+ ; X86: [[TRUNC6:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD6]](s32)
; X86: [[FRAME_INDEX7:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.0
- ; X86: [[LOAD7:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX7]](p0) :: (invariant load 1 from %fixed-stack.0, align 4)
+ ; X86: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX7]](p0) :: (invariant load 1 from %fixed-stack.0, align 4)
+ ; X86: [[TRUNC7:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD7]](s32)
; X86: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a1_8bit
; X86: [[GV1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a7_8bit
; X86: [[GV2:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a8_8bit
- ; X86: G_STORE [[LOAD]](s8), [[GV]](p0) :: (store 1 into @a1_8bit)
- ; X86: G_STORE [[LOAD6]](s8), [[GV1]](p0) :: (store 1 into @a7_8bit)
- ; X86: G_STORE [[LOAD7]](s8), [[GV2]](p0) :: (store 1 into @a8_8bit)
- ; X86: $al = COPY [[LOAD]](s8)
+ ; X86: G_STORE [[TRUNC]](s8), [[GV]](p0) :: (store 1 into @a1_8bit)
+ ; X86: G_STORE [[TRUNC6]](s8), [[GV1]](p0) :: (store 1 into @a7_8bit)
+ ; X86: G_STORE [[TRUNC7]](s8), [[GV2]](p0) :: (store 1 into @a8_8bit)
+ ; X86: $al = COPY [[TRUNC]](s8)
; X86: RET 0, implicit $al
; X64-LABEL: name: test_i8_args_8
; X64: bb.1.entry:
@@ -49,15 +57,17 @@
; X64: [[COPY5:%[0-9]+]]:_(s32) = COPY $r9d
; X64: [[TRUNC5:%[0-9]+]]:_(s8) = G_TRUNC [[COPY5]](s32)
; X64: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.1
- ; X64: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load 1 from %fixed-stack.1, align 16)
+ ; X64: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load 1 from %fixed-stack.1, align 16)
+ ; X64: [[TRUNC6:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD]](s32)
; X64: [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.0
- ; X64: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX1]](p0) :: (invariant load 1 from %fixed-stack.0, align 8)
+ ; X64: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p0) :: (invariant load 1 from %fixed-stack.0, align 8)
+ ; X64: [[TRUNC7:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD1]](s32)
; X64: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a1_8bit
; X64: [[GV1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a7_8bit
; X64: [[GV2:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a8_8bit
; X64: G_STORE [[TRUNC]](s8), [[GV]](p0) :: (store 1 into @a1_8bit)
- ; X64: G_STORE [[LOAD]](s8), [[GV1]](p0) :: (store 1 into @a7_8bit)
- ; X64: G_STORE [[LOAD1]](s8), [[GV2]](p0) :: (store 1 into @a8_8bit)
+ ; X64: G_STORE [[TRUNC6]](s8), [[GV1]](p0) :: (store 1 into @a7_8bit)
+ ; X64: G_STORE [[TRUNC7]](s8), [[GV2]](p0) :: (store 1 into @a8_8bit)
; X64: $al = COPY [[TRUNC]](s8)
; X64: RET 0, implicit $al
entry:
Index: llvm/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll
===================================================================
--- llvm/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll
+++ llvm/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp
; RUN: llc -mtriple=i386-linux-gnu -global-isel -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -mtriple=i386-linux-gnu -regbankselect-greedy -global-isel -verify-machineinstrs < %s | FileCheck %s
@@ -47,7 +47,7 @@
define i1 * @test_store_i1(i1 %val, i1 * %p1) {
; CHECK-LABEL: test_store_i1:
; CHECK: # %bb.0:
-; CHECK-NEXT: movb 4(%esp), %cl
+; CHECK-NEXT: movl 4(%esp), %ecx
; CHECK-NEXT: movl 8(%esp), %eax
; CHECK-NEXT: andb $1, %cl
; CHECK-NEXT: movb %cl, (%eax)
@@ -59,7 +59,7 @@
define i8 * @test_store_i8(i8 %val, i8 * %p1) {
; CHECK-LABEL: test_store_i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: movb 4(%esp), %cl
+; CHECK-NEXT: movl 4(%esp), %ecx
; CHECK-NEXT: movl 8(%esp), %eax
; CHECK-NEXT: movb %cl, (%eax)
; CHECK-NEXT: retl
@@ -70,7 +70,7 @@
define i16 * @test_store_i16(i16 %val, i16 * %p1) {
; CHECK-LABEL: test_store_i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: movzwl 4(%esp), %ecx
+; CHECK-NEXT: movl 4(%esp), %ecx
; CHECK-NEXT: movl 8(%esp), %eax
; CHECK-NEXT: movw %cx, (%eax)
; CHECK-NEXT: retl