Use fixed-stack pointer info for the stack temporary that the unaligned
load/store expansion goes through.

Every load and store that touches the slot returned by
DAG.CreateStackTemporary now carries
MachinePointerInfo::getFixedStack(MF, FrameIndex, Offset) instead of a
default-constructed MachinePointerInfo(). The new AMDGPU test checks that
align-8 <16 x double> accesses through addrspace(5) emit buffer_load_dword /
buffer_store_dword and never flat_load_dword / flat_store_dword.

Index: llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3481,6 +3481,7 @@
   EVT VT = LD->getValueType(0);
   EVT LoadedVT = LD->getMemoryVT();
   SDLoc dl(LD);
+  auto &MF = DAG.getMachineFunction();
   if (VT.isFloatingPoint() || VT.isVector()) {
     EVT intVT = EVT::getIntegerVT(*DAG.getContext(), LoadedVT.getSizeInBits());
     if (isTypeLegal(intVT) && isTypeLegal(LoadedVT)) {
@@ -3511,7 +3512,7 @@
 
     // Make sure the stack slot is also aligned for the register type.
     SDValue StackBase = DAG.CreateStackTemporary(LoadedVT, RegVT);
-
+    auto FrameIndex = cast<FrameIndexSDNode>(StackBase.getNode())->getIndex();
     SmallVector<SDValue, 8> Stores;
     SDValue StackPtr = StackBase;
     unsigned Offset = 0;
@@ -3530,8 +3531,9 @@
           MinAlign(LD->getAlignment(), Offset), LD->getMemOperand()->getFlags(),
           LD->getAAInfo());
       // Follow the load with a store to the stack slot. Remember the store.
-      Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, StackPtr,
-                                    MachinePointerInfo()));
+      Stores.push_back(DAG.getStore(
+          Load.getValue(1), dl, Load, StackPtr,
+          MachinePointerInfo::getFixedStack(MF, FrameIndex, Offset)));
       // Increment the pointers.
       Offset += RegBytes;
       Ptr = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr, PtrIncrement);
@@ -3550,15 +3552,17 @@
     // Follow the load with a store to the stack slot. Remember the store.
     // On big-endian machines this requires a truncating store to ensure
     // that the bits end up in the right place.
-    Stores.push_back(DAG.getTruncStore(Load.getValue(1), dl, Load, StackPtr,
-                                       MachinePointerInfo(), MemVT));
+    Stores.push_back(DAG.getTruncStore(
+        Load.getValue(1), dl, Load, StackPtr,
+        MachinePointerInfo::getFixedStack(MF, FrameIndex, Offset), MemVT));
 
     // The order of the stores doesn't matter - say it with a TokenFactor.
     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
 
     // Finally, perform the original load only redirected to the stack slot.
     Load = DAG.getExtLoad(LD->getExtensionType(), dl, VT, TF, StackBase,
-                          MachinePointerInfo(), LoadedVT);
+                          MachinePointerInfo::getFixedStack(MF, FrameIndex, 0),
+                          LoadedVT);
 
     // Callers expect a MERGE_VALUES node.
     return std::make_pair(Load, TF);
@@ -3628,6 +3632,7 @@
   SDValue Val = ST->getValue();
   EVT VT = Val.getValueType();
   int Alignment = ST->getAlignment();
+  auto &MF = DAG.getMachineFunction();
 
   SDLoc dl(ST);
   if (ST->getMemoryVT().isFloatingPoint() ||
@@ -3662,10 +3667,12 @@
 
     // Make sure the stack slot is also aligned for the register type.
     SDValue StackPtr = DAG.CreateStackTemporary(StoredVT, RegVT);
+    auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
 
     // Perform the original store, only redirected to the stack slot.
-    SDValue Store = DAG.getTruncStore(Chain, dl, Val, StackPtr,
-                                      MachinePointerInfo(), StoredVT);
+    SDValue Store = DAG.getTruncStore(
+        Chain, dl, Val, StackPtr,
+        MachinePointerInfo::getFixedStack(MF, FrameIndex, 0), StoredVT);
 
     EVT StackPtrVT = StackPtr.getValueType();
 
@@ -3677,8 +3684,9 @@
     // Do all but one copies using the full register width.
     for (unsigned i = 1; i < NumRegs; i++) {
       // Load one integer register's worth from the stack slot.
-      SDValue Load =
-          DAG.getLoad(RegVT, dl, Store, StackPtr, MachinePointerInfo());
+      SDValue Load = DAG.getLoad(
+          RegVT, dl, Store, StackPtr,
+          MachinePointerInfo::getFixedStack(MF, FrameIndex, Offset));
       // Store it to the final location. Remember the store.
       Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, Ptr,
                                     ST->getPointerInfo().getWithOffset(Offset),
@@ -3698,8 +3706,9 @@
                                   8 * (StoredBytes - Offset));
 
     // Load from the stack slot.
-    SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Store, StackPtr,
-                                  MachinePointerInfo(), MemVT);
+    SDValue Load = DAG.getExtLoad(
+        ISD::EXTLOAD, dl, RegVT, Store, StackPtr,
+        MachinePointerInfo::getFixedStack(MF, FrameIndex, Offset), MemVT);
 
     Stores.push_back(
         DAG.getTruncStore(Load.getValue(1), dl, Load, Ptr,
Index: llvm/trunk/test/CodeGen/AMDGPU/load-private-double16-amdgiz.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/load-private-double16-amdgiz.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/load-private-double16-amdgiz.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa-amdgiz -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"
+
+; GCN-LABEL: @test_unaligned_load
+; GCN: buffer_load_dword
+; GCN-NOT: flat_load_dword
+define amdgpu_kernel void @test_unaligned_load(<16 x double> addrspace(1)* %results, i32 %i) {
+entry:
+  %a = inttoptr i32 %i to <16 x double> addrspace(5)*
+  %v = load <16 x double>, <16 x double> addrspace(5)* %a, align 8
+  store <16 x double> %v, <16 x double> addrspace(1)* %results, align 128
+  ret void
+}
+
+; GCN-LABEL: @test_unaligned_store
+; GCN: buffer_store_dword
+; GCN-NOT: flat_store_dword
+define amdgpu_kernel void @test_unaligned_store(<16 x double> %v, i32 %i) {
+entry:
+  %a = inttoptr i32 %i to <16 x double> addrspace(5)*
+  store <16 x double> %v, <16 x double> addrspace(5)* %a, align 8
+  ret void
+}