diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -68,7 +68,6 @@
 FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetMachine *);
 FunctionPass *createAMDGPUUseNativeCallsPass();
 FunctionPass *createAMDGPUCodeGenPreparePass();
-FunctionPass *createAMDGPULateCodeGenPreparePass();
 FunctionPass *createAMDGPUMachineCFGStructurizerPass();
 FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *);
 ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *);
@@ -224,9 +223,6 @@
 void initializeAMDGPUCodeGenPreparePass(PassRegistry&);
 extern char &AMDGPUCodeGenPrepareID;
 
-void initializeAMDGPULateCodeGenPreparePass(PassRegistry &);
-extern char &AMDGPULateCodeGenPrepareID;
-
 void initializeSIAnnotateControlFlowPass(PassRegistry&);
 extern char &SIAnnotateControlFlowPassID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
deleted file mode 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ /dev/null
@@ -1,198 +0,0 @@
-//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
-/// selection.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <cassert>
-#include <iterator>
-
-#define DEBUG_TYPE "amdgpu-late-codegenprepare"
-
-using namespace llvm;
-
-// Scalar load widening needs running after load-store-vectorizer as that pass
-// doesn't handle overlapping cases. In addition, this pass enhances the
-// widening to handle cases where scalar sub-dword loads are naturally aligned
-// only but not dword aligned.
-static cl::opt<bool>
-    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
-               cl::desc("Widen sub-dword constant address space loads in "
-                        "AMDGPULateCodeGenPrepare"),
-               cl::ReallyHidden, cl::init(true));
-
-namespace {
-
-class AMDGPULateCodeGenPrepare
-    : public FunctionPass,
-      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
-  Module *Mod = nullptr;
-  const DataLayout *DL = nullptr;
-
-  AssumptionCache *AC = nullptr;
-  LegacyDivergenceAnalysis *DA = nullptr;
-
-public:
-  static char ID;
-
-  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}
-
-  StringRef getPassName() const override {
-    return "AMDGPU IR late optimizations";
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<AssumptionCacheTracker>();
-    AU.addRequired<LegacyDivergenceAnalysis>();
-    AU.setPreservesAll();
-  }
-
-  bool doInitialization(Module &M) override;
-  bool runOnFunction(Function &F) override;
-
-  bool visitInstruction(Instruction &) { return false; }
-
-  // Check if the specified value is at least DWORD aligned.
-  bool isDWORDAligned(const Value *V) const {
-    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
-    return Known.countMinTrailingZeros() >= 2;
-  }
-
-  bool canWidenScalarExtLoad(LoadInst &LI) const;
-  bool visitLoadInst(LoadInst &LI);
-};
-
-} // end anonymous namespace
-
-bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
-  Mod = &M;
-  DL = &Mod->getDataLayout();
-  return false;
-}
-
-bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
-  if (skipFunction(F))
-    return false;
-
-  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
-  DA = &getAnalysis<LegacyDivergenceAnalysis>();
-
-  bool Changed = false;
-  for (auto &BB : F)
-    for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) {
-      Instruction *I = &*BI++;
-      Changed |= visit(*I);
-    }
-
-  return Changed;
-}
-
-bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
-  unsigned AS = LI.getPointerAddressSpace();
-  // Skip non-constant address space.
-  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
-      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
-    return false;
-  // Skip non-simple loads.
-  if (!LI.isSimple())
-    return false;
-  auto *Ty = LI.getType();
-  // Skip aggregate types.
-  if (Ty->isAggregateType())
-    return false;
-  unsigned TySize = DL->getTypeStoreSize(Ty);
-  // Only handle sub-DWORD loads.
-  if (TySize >= 4)
-    return false;
-  // That load must be at least naturally aligned.
-  if (LI.getAlign() < DL->getABITypeAlign(Ty))
-    return false;
-  // It should be uniform, i.e. a scalar load.
-  return DA->isUniform(&LI);
-}
-
-bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
-  if (!WidenLoads)
-    return false;
-
-  // Skip if that load is already aligned on DWORD at least as it's handled in
-  // SDAG.
-  if (LI.getAlign() >= 4)
-    return false;
-
-  if (!canWidenScalarExtLoad(LI))
-    return false;
-
-  int64_t Offset = 0;
-  auto *Base =
-      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
-  // If that base is not DWORD aligned, it's not safe to perform the following
-  // transforms.
-  if (!isDWORDAligned(Base))
-    return false;
-
-  int64_t Adjust = Offset & 0x3;
-  if (Adjust == 0) {
-    // With a zero adjust, the original alignment could be promoted with a
-    // better one.
-    LI.setAlignment(Align(4));
-    return true;
-  }
-
-  IRBuilder<> IRB(&LI);
-  IRB.SetCurrentDebugLocation(LI.getDebugLoc());
-
-  unsigned AS = LI.getPointerAddressSpace();
-  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
-  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);
-
-  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
-  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
-  auto *NewPtr = IRB.CreateBitCast(
-      IRB.CreateConstGEP1_64(IRB.CreateBitCast(Base, Int8PtrTy),
-                             Offset - Adjust),
-      Int32PtrTy);
-  LoadInst *NewLd = IRB.CreateAlignedLoad(NewPtr, Align(4));
-  NewLd->copyMetadata(LI);
-  NewLd->setMetadata(LLVMContext::MD_range, nullptr);
-
-  unsigned ShAmt = Adjust * 8;
-  auto *NewVal = IRB.CreateBitCast(
-      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
-  LI.replaceAllUsesWith(NewVal);
-  RecursivelyDeleteTriviallyDeadInstructions(&LI);
-
-  return true;
-}
-
-INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
-                      "AMDGPU IR late optimizations", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
-INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
-                    "AMDGPU IR late optimizations", false, false)
-
-char AMDGPULateCodeGenPrepare::ID = 0;
-
-FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
-  return new AMDGPULateCodeGenPrepare();
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -236,7 +236,6 @@
   initializeAMDGPUPromoteAllocaPass(*PR);
   initializeAMDGPUPromoteAllocaToVectorPass(*PR);
   initializeAMDGPUCodeGenPreparePass(*PR);
-  initializeAMDGPULateCodeGenPreparePass(*PR);
   initializeAMDGPUPropagateAttributesEarlyPass(*PR);
   initializeAMDGPUPropagateAttributesLatePass(*PR);
   initializeAMDGPURewriteOutArgumentsPass(*PR);
@@ -885,7 +884,6 @@
 bool GCNPassConfig::addPreISel() {
   AMDGPUPassConfig::addPreISel();
 
-  addPass(createAMDGPULateCodeGenPreparePass());
   if (EnableAtomicOptimizations) {
     addPass(createAMDGPUAtomicOptimizerPass());
   }
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -60,7 +60,6 @@
   AMDGPUISelDAGToDAG.cpp
   AMDGPUISelLowering.cpp
   AMDGPUGlobalISelUtils.cpp
-  AMDGPULateCodeGenPrepare.cpp
   AMDGPULegalizerInfo.cpp
   AMDGPULibCalls.cpp
   AMDGPULibFunc.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -31,6 +31,7 @@
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/DAGCombine.h"
@@ -102,6 +103,11 @@
     cl::desc("Use indirect register addressing for divergent indexes"),
     cl::init(false));
 
+static cl::opt<bool> WidenUnalignedLoads(
+    "amdgpu-widen-unaligned-constant-loads",
+    cl::desc("Widen sub-dword constant address space unaligned loads"),
+    cl::ReallyHidden, cl::init(true));
+
 static bool hasFP32Denormals(const MachineFunction &MF) {
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
   return Info->getMode().allFP32Denormals();
@@ -7933,9 +7939,11 @@
   llvm_unreachable("invalid ext type");
 }
 
-SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
+SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
+                                    DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
-  if (Ld->getAlignment() < 4 || Ld->isDivergent())
+
+  if (Ld->isDivergent())
     return SDValue();
 
   // FIXME: Constant loads should all be marked invariant.
@@ -7960,14 +7968,46 @@
 
   // TODO: Drop only high part of range.
   SDValue Ptr = Ld->getBasePtr();
-  SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
-                                MVT::i32, SL, Ld->getChain(), Ptr,
-                                Ld->getOffset(),
-                                Ld->getPointerInfo(), MVT::i32,
-                                Ld->getAlignment(),
-                                Ld->getMemOperand()->getFlags(),
-                                Ld->getAAInfo(),
-                                nullptr); // Drop ranges
+  Align Alignment = Ld->getAlign();
+  MachinePointerInfo PtrInfo = Ld->getPointerInfo();
+
+  int Adjust = 0;
+  if (Alignment < 4) {
+    if (!WidenUnalignedLoads)
+      return SDValue();
+    // For non-DWORD aligned loads, check whether their pointers are derived
+    // from a DWORD-aligned base pointer plus a constant offset.
+    const Value *P = PtrInfo.V.dyn_cast<const Value *>();
+    // Skip pseudo source value.
+    if (!P)
+      return SDValue();
+    int64_t Offset = 0;
+    const Value *Base =
+        GetPointerBaseWithConstantOffset(P, Offset, DAG.getDataLayout());
+    KnownBits Known = computeKnownBits(Base, DAG.getDataLayout());
+    // Skip if that base pointer is not DWORD-aligned.
+    if (Known.countMinTrailingZeros() < 2)
+      return SDValue();
+    // Count the offset from the original pointer info. That's usually
+    // contributed from the legalizer.
+    Offset += PtrInfo.Offset;
+    // Adjust the pointer to the DWORD containing the original sub-DWORD value.
+    Adjust = Offset & 0x3;
+    // Prepare the new pointer info.
+    PtrInfo.V = Base;
+    PtrInfo.Offset = Offset - Adjust;
+    // Calculate the new pointer.
+    Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(-Adjust), SL);
+    DCI.AddToWorklist(Ptr.getNode());
+    // Now, the original load is widened into a DWORD-aligned one.
+    Alignment = Align(4);
+  }
+
+  SDValue NewLoad =
+      DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL,
+                  Ld->getChain(), Ptr, Ld->getOffset(), PtrInfo, MVT::i32,
+                  Alignment, Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
+                  nullptr); // Drop ranges
 
   EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
   if (MemVT.isFloatingPoint()) {
@@ -7977,12 +8017,19 @@
   }
 
   SDValue Cvt = NewLoad;
+  if (Adjust) {
+    // If Ptr is adjusted for alignment, the loaded value needs shifting right.
+    unsigned Opcode =
+        (Ld->getExtensionType() == ISD::SEXTLOAD) ? ISD::SRA : ISD::SRL;
+    Cvt = DAG.getNode(Opcode, SL, MVT::i32, Cvt,
+                      DAG.getConstant(Adjust * 8, SL, MVT::i32));
+  }
   if (Ld->getExtensionType() == ISD::SEXTLOAD) {
-    Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
+    Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, Cvt,
                       DAG.getValueType(TruncVT));
   } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
              Ld->getExtensionType() == ISD::NON_EXTLOAD) {
-    Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
+    Cvt = DAG.getZeroExtendInReg(Cvt, SL, TruncVT);
   } else {
     assert(Ld->getExtensionType() == ISD::EXTLOAD);
   }
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -903,12 +903,16 @@
   ret void
 }
 
-; FIXME: Why not all scalar loads?
 ; GCN-LABEL: {{^}}array_3xi16:
-; HSA-GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; HSA-GFX9: global_load_ushort v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:2
-; HSA-GFX9: global_load_ushort v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:4
-; HSA-GFX9: global_load_ushort v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:6
+; HSA-GFX9: s_load_dword [[S0:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x0
+; HSA-GFX9: s_load_dword [[S1:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x4
+; HSA-GFX9: v_mov_b32_e32 [[V0:v[0-9]+]], [[S0]]
+; HSA-GFX9: v_mov_b32_e32 [[V1:v[0-9]+]], [[S1]]
+; HSA-GFX9-DAG: global_store_byte v[{{[0-9]+:[0-9]+}}], [[V0]], off
+; HSA-GFX9-DAG: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], [[V0]], off
+; HSA-GFX9-DAG: global_store_short v[{{[0-9]+:[0-9]+}}], [[V1]], off
+; HSA-GFX9-DAG: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], [[V1]], off
+; HSA-GFX9: s_endpgm
 define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
   store volatile i8 %arg0, i8 addrspace(1)* undef
   store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
@@ -916,8 +920,9 @@
 }
 
 ; GCN-LABEL: {{^}}small_array_round_down_offset:
-; HSA-GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; HSA-GFX9: global_load_ubyte v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:1
+; HSA-GFX9: s_load_dword [[V0:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x0
+; HSA-GFX9: s_lshr_b32 [[V1:s[0-9]+]], [[V0]], 8
+; HSA-GFX9: v_mov_b32_e32 v{{[0-9]+}}, [[V1]]
 define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) {
   %val = extractvalue [1 x i8] %arg, 0
   store volatile i8 %val, i8 addrspace(1)* undef
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -30,67 +30,58 @@
 define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 {
 ; HAWAII-LABEL: local_store_i55:
 ; HAWAII: ; %bb.0:
-; HAWAII-NEXT: s_or_b32 s0, s4, 14
-; HAWAII-NEXT: v_mov_b32_e32 v0, s0
-; HAWAII-NEXT: v_mov_b32_e32 v1, s5
-; HAWAII-NEXT: flat_load_ubyte v0, v[0:1]
-; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3
 ; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0
 ; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2
+; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3
 ; HAWAII-NEXT: s_mov_b32 m0, -1
 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
-; HAWAII-NEXT: v_mov_b32_e32 v1, s0
-; HAWAII-NEXT: v_mov_b32_e32 v3, s2
-; HAWAII-NEXT: v_mov_b32_e32 v2, s1
-; HAWAII-NEXT: ds_write_b16 v1, v3 offset:4
-; HAWAII-NEXT: s_waitcnt vmcnt(0)
-; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0
-; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6
-; HAWAII-NEXT: ds_write_b32 v1, v2
+; HAWAII-NEXT: v_mov_b32_e32 v0, s0
+; HAWAII-NEXT: s_bfe_u32 s3, s2, 0x70010
+; HAWAII-NEXT: v_mov_b32_e32 v1, s3
+; HAWAII-NEXT: ds_write_b8 v0, v1 offset:6
+; HAWAII-NEXT: v_mov_b32_e32 v1, s2
+; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4
+; HAWAII-NEXT: v_mov_b32_e32 v1, s1
+; HAWAII-NEXT: ds_write_b32 v0, v1
 ; HAWAII-NEXT: s_endpgm
 ;
 ; FIJI-LABEL: local_store_i55:
 ; FIJI: ; %bb.0:
-; FIJI-NEXT: s_or_b32 s0, s4, 14
-; FIJI-NEXT: v_mov_b32_e32 v0, s0
-; FIJI-NEXT: v_mov_b32_e32 v1, s5
-; FIJI-NEXT: flat_load_ubyte v0, v[0:1]
 ; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0
 ; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8
 ; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc
 ; FIJI-NEXT: s_mov_b32 m0, -1
 ; FIJI-NEXT: s_waitcnt lgkmcnt(0)
-; FIJI-NEXT: v_mov_b32_e32 v1, s0
-; FIJI-NEXT: v_mov_b32_e32 v3, s1
+; FIJI-NEXT: v_mov_b32_e32 v0, s0
 ; FIJI-NEXT: s_and_b32 s3, s2, 0xffff
-; FIJI-NEXT: v_mov_b32_e32 v2, s2
-; FIJI-NEXT: ds_write_b16 v1, v2 offset:4
-; FIJI-NEXT: s_waitcnt vmcnt(0)
-; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; FIJI-NEXT: v_or_b32_e32 v0, s3, v0
-; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7
-; FIJI-NEXT: ds_write_b8 v1, v0 offset:6
-; FIJI-NEXT: ds_write_b32 v1, v3
+; FIJI-NEXT: s_and_b32 s4, s2, 0xffff0000
+; FIJI-NEXT: s_or_b32 s0, s4, s3
+; FIJI-NEXT: s_bfe_u32 s0, s0, 0x70010
+; FIJI-NEXT: v_mov_b32_e32 v1, s0
+; FIJI-NEXT: ds_write_b8 v0, v1 offset:6
+; FIJI-NEXT: v_mov_b32_e32 v1, s2
+; FIJI-NEXT: ds_write_b16 v0, v1 offset:4
+; FIJI-NEXT: v_mov_b32_e32 v1, s1
+; FIJI-NEXT: ds_write_b32 v0, v1
 ; FIJI-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: local_store_i55:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: global_load_ubyte_d16_hi v0, v0, s[4:5] offset:14
-; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc
 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-NEXT: s_and_b32 s3, s2, 0xffff
+; GFX9-NEXT: s_and_b32 s4, s2, 0xffff0000
+; GFX9-NEXT: s_or_b32 s0, s4, s3
+; GFX9-NEXT: s_and_b32 s0, s0, 0x7fffff
 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: ds_write_b16 v1, v2 offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_e32 v0, s3, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0x7fffff, v0
-; GFX9-NEXT: ds_write_b8_d16_hi v1, v0 offset:6
-; GFX9-NEXT: ds_write_b32 v1, v3
+; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: ds_write_b16 v0, v1 offset:4
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: ds_write_b32 v0, v1
 ; GFX9-NEXT: s_endpgm
   store i55 %arg, i55 addrspace(3)* %ptr, align 8
   ret void
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -140,7 +140,6 @@
     "AMDGPUInstCombineIntrinsic.cpp",
     "AMDGPUInstrInfo.cpp",
    "AMDGPUInstructionSelector.cpp",
-    "AMDGPULateCodeGenPrepare.cpp",
     "AMDGPULegalizerInfo.cpp",
     "AMDGPULibCalls.cpp",
     "AMDGPULibFunc.cpp",