Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -782,9 +782,10 @@
   /// copy/move/set is converted to a sequence of store operations. Its use
   /// helps to ensure that such replacements don't generate code that causes an
   /// alignment error (trap) on the target machine.
-  virtual bool allowsUnalignedMemoryAccesses(EVT,
-                                             unsigned AddrSpace = 0,
-                                             bool * /*Fast*/ = nullptr) const {
+  virtual bool allowsMisalignedMemoryAccesses(EVT,
+                                              unsigned AddrSpace = 0,
+                                              unsigned Align = 1,
+                                              bool * /*Fast*/ = nullptr) const {
     return false;
   }
Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -724,10 +724,11 @@
       // If this is an unaligned store and the target doesn't support it,
       // expand it.
       unsigned AS = ST->getAddressSpace();
-      if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT(), AS)) {
+      unsigned Align = ST->getAlignment();
+      if (!TLI.allowsMisalignedMemoryAccesses(ST->getMemoryVT(), AS, Align)) {
         Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext());
         unsigned ABIAlignment= TLI.getDataLayout()->getABITypeAlignment(Ty);
-        if (ST->getAlignment() < ABIAlignment)
+        if (Align < ABIAlignment)
           ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this);
       }
@@ -835,12 +836,13 @@
     default: llvm_unreachable("This action is not supported yet!");
     case TargetLowering::Legal: {
       unsigned AS = ST->getAddressSpace();
+      unsigned Align = ST->getAlignment();
       // If this is an unaligned store and the target doesn't support it,
       // expand it.
-      if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT(), AS)) {
+      if (!TLI.allowsMisalignedMemoryAccesses(ST->getMemoryVT(), AS, Align)) {
         Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext());
         unsigned ABIAlignment= TLI.getDataLayout()->getABITypeAlignment(Ty);
-        if (ST->getAlignment() < ABIAlignment)
+        if (Align < ABIAlignment)
           ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this);
       }
       break;
@@ -886,13 +888,14 @@
   default: llvm_unreachable("This action is not supported yet!");
   case TargetLowering::Legal: {
     unsigned AS = LD->getAddressSpace();
+    unsigned Align = LD->getAlignment();
     // If this is an unaligned load and the target doesn't support it,
     // expand it.
-    if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT(), AS)) {
+    if (!TLI.allowsMisalignedMemoryAccesses(LD->getMemoryVT(), AS, Align)) {
       Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
       unsigned ABIAlignment = TLI.getDataLayout()->getABITypeAlignment(Ty);
-      if (LD->getAlignment() < ABIAlignment){
+      if (Align < ABIAlignment){
        ExpandUnalignedLoad(cast<LoadSDNode>(Node), DAG, TLI, RVal, RChain);
      }
    }
@@ -1077,12 +1080,13 @@
        // If this is an unaligned load and the target doesn't support
        // it, expand it.
        EVT MemVT = LD->getMemoryVT();
        unsigned AS = LD->getAddressSpace();
-       if (!TLI.allowsUnalignedMemoryAccesses(MemVT, AS)) {
+       unsigned Align = LD->getAlignment();
+       if (!TLI.allowsMisalignedMemoryAccesses(MemVT, AS, Align)) {
          Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
          unsigned ABIAlignment = TLI.getDataLayout()->getABITypeAlignment(Ty);
-         if (LD->getAlignment() < ABIAlignment){
+         if (Align < ABIAlignment){
            ExpandUnalignedLoad(cast<LoadSDNode>(Node), DAG, TLI, Value, Chain);
          }
Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3810,7 +3810,7 @@
   if (VT == MVT::Other) {
     unsigned AS = 0;
     if (DstAlign >= TLI.getDataLayout()->getPointerPrefAlignment(AS) ||
-        TLI.allowsUnalignedMemoryAccesses(VT, AS)) {
+        TLI.allowsMisalignedMemoryAccesses(VT, AS, DstAlign)) {
       VT = TLI.getPointerTy();
     } else {
       switch (DstAlign & 7) {
@@ -3870,7 +3870,7 @@
       unsigned AS = 0;
       if (NumMemOps && AllowOverlap &&
           VTSize >= 8 && NewVTSize < Size &&
-          TLI.allowsUnalignedMemoryAccesses(VT, AS, &Fast) && Fast)
+          TLI.allowsMisalignedMemoryAccesses(VT, AS, DstAlign, &Fast) && Fast)
         VTSize = Size;
       else {
         VT = NewVT;
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5725,9 +5725,10 @@
     unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
     // TODO: Handle 5 byte compare as 4-byte + 1 byte.
     // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
+    // TODO: Check alignment of src and dest ptrs.
     if (!TLI->isTypeLegal(LoadVT) ||
-        !TLI->allowsUnalignedMemoryAccesses(LoadVT, SrcAS) ||
-        !TLI->allowsUnalignedMemoryAccesses(LoadVT, DstAS))
+        !TLI->allowsMisalignedMemoryAccesses(LoadVT, SrcAS) ||
+        !TLI->allowsMisalignedMemoryAccesses(LoadVT, DstAS))
       ActuallyDoIt = false;
   }
Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -212,10 +212,11 @@
   MVT getScalarShiftAmountTy(EVT LHSTy) const override;

-  /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+  /// allowsMisalignedMemoryAccesses - Returns true if the target allows
   /// unaligned memory accesses of the specified type.
-  bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0,
-                                     bool *Fast = nullptr) const override {
+  bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0,
+                                      unsigned Align = 1,
+                                      bool *Fast = nullptr) const override {
     if (RequireStrictAlign)
       return false;
     // FIXME: True for Cyclone, but not necessarily others.
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6227,7 +6227,7 @@
       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                        Attribute::NoImplicitFloat) &&
       (memOpAlign(SrcAlign, DstAlign, 16) ||
-       (allowsUnalignedMemoryAccesses(MVT::f128, 0, &Fast) && Fast)))
+       (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast)))
     return MVT::f128;

   return Size >= 8 ? MVT::i64 : MVT::i32;
Index: lib/Target/ARM/ARMISelLowering.h
===================================================================
--- lib/Target/ARM/ARMISelLowering.h
+++ lib/Target/ARM/ARMISelLowering.h
@@ -266,11 +266,12 @@
     bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override;

-    /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+    /// allowsMisalignedMemoryAccesses - Returns true if the target allows
     /// unaligned memory accesses of the specified type. Returns whether it
     /// is "fast" by reference in the second argument.
-    bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
-                                       bool *Fast) const override;
+    bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
+                                        unsigned Align,
+                                        bool *Fast) const override;

     EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -9696,8 +9696,10 @@
   return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
 }

-bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, unsigned,
-                                                      bool *Fast) const {
+bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                       unsigned,
+                                                       unsigned,
+                                                       bool *Fast) const {
   // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
   bool AllowsUnaligned = Subtarget->allowsUnalignedMem();

@@ -9751,11 +9753,12 @@
     bool Fast;
     if (Size >= 16 &&
         (memOpAlign(SrcAlign, DstAlign, 16) ||
-         (allowsUnalignedMemoryAccesses(MVT::v2f64, 0, &Fast) && Fast))) {
+         (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, &Fast) && Fast))) {
       return MVT::v2f64;
     } else if (Size >= 8 &&
                (memOpAlign(SrcAlign, DstAlign, 8) ||
-                (allowsUnalignedMemoryAccesses(MVT::f64, 0, &Fast) && Fast))) {
+                (allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) &&
+                 Fast))) {
       return MVT::f64;
     }
   }
Index: lib/Target/ARM/ARMSubtarget.h
===================================================================
--- lib/Target/ARM/ARMSubtarget.h
+++ lib/Target/ARM/ARMSubtarget.h
@@ -188,7 +188,7 @@
   /// AllowsUnalignedMem - If true, the subtarget allows unaligned memory
   /// accesses for some types. For details, see
-  /// ARMTargetLowering::allowsUnalignedMemoryAccesses().
+  /// ARMTargetLowering::allowsMisalignedMemoryAccesses().
   bool AllowsUnalignedMem;

   /// RestrictIT - If true, the subtarget disallows generation of deprecated IT
Index: lib/Target/Mips/Mips16ISelLowering.h
===================================================================
--- lib/Target/Mips/Mips16ISelLowering.h
+++ lib/Target/Mips/Mips16ISelLowering.h
@@ -22,8 +22,9 @@
     explicit Mips16TargetLowering(MipsTargetMachine &TM,
                                   const MipsSubtarget &STI);

-    bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
-                                       bool *Fast) const override;
+    bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
+                                        unsigned Align,
+                                        bool *Fast) const override;

     MachineBasicBlock *
     EmitInstrWithCustomInserter(MachineInstr *MI,
Index: lib/Target/Mips/Mips16ISelLowering.cpp
===================================================================
--- lib/Target/Mips/Mips16ISelLowering.cpp
+++ lib/Target/Mips/Mips16ISelLowering.cpp
@@ -157,9 +157,10 @@
 }

 bool
-Mips16TargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
-                                                    unsigned,
-                                                    bool *Fast) const {
+Mips16TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                     unsigned,
+                                                     unsigned,
+                                                     bool *Fast) const {
   return false;
 }
Index: lib/Target/Mips/MipsSEISelLowering.h
===================================================================
--- lib/Target/Mips/MipsSEISelLowering.h
+++ lib/Target/Mips/MipsSEISelLowering.h
@@ -31,8 +31,9 @@
     void addMSAFloatType(MVT::SimpleValueType Ty,
                          const TargetRegisterClass *RC);

-    bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS = 0,
-                                       bool *Fast = nullptr) const override;
+    bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS = 0,
+                                        unsigned Align = 1,
+                                        bool *Fast = nullptr) const override;

     SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
Index: lib/Target/Mips/MipsSEISelLowering.cpp
===================================================================
--- lib/Target/Mips/MipsSEISelLowering.cpp
+++ lib/Target/Mips/MipsSEISelLowering.cpp
@@ -329,9 +329,10 @@
 }

 bool
-MipsSETargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
-                                                    unsigned,
-                                                    bool *Fast) const {
+MipsSETargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                     unsigned,
+                                                     unsigned,
+                                                     bool *Fast) const {
   MVT::SimpleValueType SVT = VT.getSimpleVT().SimpleTy;

   if (Subtarget.systemSupportsUnalignedAccess()) {
Index: lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.h
+++ lib/Target/PowerPC/PPCISelLowering.h
@@ -494,9 +494,10 @@
     /// Is unaligned memory access allowed for the given type, and is it fast
     /// relative to software emulation.
-    bool allowsUnalignedMemoryAccesses(EVT VT,
-                                       unsigned AddrSpace,
-                                       bool *Fast = nullptr) const override;
+    bool allowsMisalignedMemoryAccesses(EVT VT,
+                                        unsigned AddrSpace,
+                                        unsigned Align = 1,
+                                        bool *Fast = nullptr) const override;

     /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
     /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -9214,9 +9214,10 @@
   return isInt<16>(Imm) || isUInt<16>(Imm);
 }

-bool PPCTargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
-                                                      unsigned,
-                                                      bool *Fast) const {
+bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                       unsigned,
+                                                       unsigned,
+                                                       bool *Fast) const {
   if (DisablePPCUnaligned)
     return false;
Index: lib/Target/R600/SIISelLowering.h
===================================================================
--- lib/Target/R600/SIISelLowering.h
+++ lib/Target/R600/SIISelLowering.h
@@ -59,8 +59,9 @@
 public:
   SITargetLowering(TargetMachine &tm);

-  bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS,
-                                     bool *IsFast) const override;
+  bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
+                                      unsigned Align,
+                                      bool *IsFast) const override;

   EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
                           unsigned SrcAlign, bool IsMemset,
Index: lib/Target/R600/SIISelLowering.cpp
===================================================================
--- lib/Target/R600/SIISelLowering.cpp
+++ lib/Target/R600/SIISelLowering.cpp
@@ -240,15 +240,13 @@
 // TargetLowering queries
 //===----------------------------------------------------------------------===//

-bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
-                                                     unsigned AddrSpace,
-                                                     bool *IsFast) const {
+bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                      unsigned AddrSpace,
+                                                      unsigned Align,
+                                                      bool *IsFast) const {
   if (IsFast)
     *IsFast = false;

-  // XXX: This depends on the address space and also we may want to revist
-  // the alignment values we specify in the DataLayout.
-
   // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
   // which isn't a simple VT.
   if (!VT.isSimple() || VT == MVT::Other)
@@ -261,8 +259,12 @@
   // XXX - The only mention I see of this in the ISA manual is for LDS direct
   // reads the "byte address and must be dword aligned". Is it also true for the
   // normal loads and stores?
-  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS)
-    return false;
+  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) {
+    // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
+    // aligned, 8 byte access in a single operation using ds_read2/write2_b32
+    // with adjacent offsets.
+    return Align % 4 == 0;
+  }

   // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
   // byte-address are ignored, thus forcing Dword alignment.
Index: lib/Target/SystemZ/SystemZISelLowering.h
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.h
+++ lib/Target/SystemZ/SystemZISelLowering.h
@@ -208,8 +208,9 @@
   bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
   bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
   bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
-  bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS,
-                                     bool *Fast) const override;
+  bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
+                                      unsigned Align,
+                                      bool *Fast) const override;
   bool isTruncateFree(Type *, Type *) const override;
   bool isTruncateFree(EVT, EVT) const override;
   const char *getTargetNodeName(unsigned Opcode) const override;
Index: lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.cpp
+++ lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -339,9 +339,10 @@
   return Imm.isZero() || Imm.isNegZero();
 }

-bool SystemZTargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
-                                                          unsigned,
-                                                          bool *Fast) const {
+bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                           unsigned,
+                                                           unsigned,
+                                                           bool *Fast) const {
   // Unaligned accesses should never be slower than the expanded version.
   // We check specifically for aligned accesses in the few cases where
   // they are required.
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -565,10 +565,10 @@
     /// legal as the hook is used before type legalization.
     bool isSafeMemOpType(MVT VT) const override;

-    /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+    /// allowsMisalignedMemoryAccesses - Returns true if the target allows
     /// unaligned memory accesses of the specified type. Returns whether it
     /// is "fast" by reference in the second argument.
-    bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS,
+    bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align,
                                        bool *Fast) const override;

     /// LowerOperation - Provide custom lowering hooks for some operations.
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1775,9 +1775,10 @@
 }

 bool
-X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
-                                                 unsigned,
-                                                 bool *Fast) const {
+X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                  unsigned,
+                                                  unsigned,
+                                                  bool *Fast) const {
   if (Fast)
     *Fast = Subtarget->isUnalignedMemAccessFast();
   return true;
Index: lib/Target/XCore/XCoreISelLowering.cpp
===================================================================
--- lib/Target/XCore/XCoreISelLowering.cpp
+++ lib/Target/XCore/XCoreISelLowering.cpp
@@ -426,7 +426,9 @@
   assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
          "Unexpected extension type");
   assert(LD->getMemoryVT() == MVT::i32 && "Unexpected load EVT");
-  if (allowsUnalignedMemoryAccesses(LD->getMemoryVT()))
+  if (allowsMisalignedMemoryAccesses(LD->getMemoryVT(),
+                                     LD->getAddressSpace(),
+                                     LD->getAlignment()))
     return SDValue();

   unsigned ABIAlignment = getDataLayout()->
@@ -504,7 +506,9 @@
   StoreSDNode *ST = cast<StoreSDNode>(Op);
   assert(!ST->isTruncatingStore() && "Unexpected store type");
   assert(ST->getMemoryVT() == MVT::i32 && "Unexpected store EVT");
-  if (allowsUnalignedMemoryAccesses(ST->getMemoryVT())) {
+  if (allowsMisalignedMemoryAccesses(ST->getMemoryVT(),
+                                     ST->getAddressSpace(),
+                                     ST->getAlignment())) {
     return SDValue();
   }

   unsigned ABIAlignment = getDataLayout()->
@@ -1803,7 +1807,9 @@
     // Replace unaligned store of unaligned load with memmove.
     StoreSDNode *ST = cast<StoreSDNode>(N);
     if (!DCI.isBeforeLegalize() ||
-        allowsUnalignedMemoryAccesses(ST->getMemoryVT()) ||
+        allowsMisalignedMemoryAccesses(ST->getMemoryVT(),
+                                       ST->getAddressSpace(),
+                                       ST->getAlignment()) ||
         ST->isVolatile() || ST->isIndexed()) {
       break;
     }
Index: test/CodeGen/R600/unaligned-load-store.ll
===================================================================
--- test/CodeGen/R600/unaligned-load-store.ll
+++ test/CodeGen/R600/unaligned-load-store.ll
@@ -31,3 +31,20 @@
   store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1
   ret void
 }
+
+; FIXME: This should use ds_read2_b32
+; SI-LABEL: @load_lds_i64_align_4
+; SI: DS_READ_B64
+; SI: S_ENDPGM
+define void @load_lds_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+  %val = load i64 addrspace(3)* %in, align 4
+  store i64 %val, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FIXME: Need to fix this case.
+; define void @load_lds_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+;   %val = load i64 addrspace(3)* %in, align 1
+;   store i64 %val, i64 addrspace(1)* %out, align 8
+;   ret void
+; }
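---

For an out-of-tree target picking up this interface change, a minimal override of the renamed hook might look like the sketch below. MyTargetLowering and its 4-byte-alignment policy are hypothetical illustrations, not part of this patch; only the signature (value type, address space, alignment, optional fast-path flag) comes from the change above. The policy mirrors the SI local-memory case, which accepts any access aligned to at least 4 bytes.

// Hypothetical target, shown only to illustrate the new hook signature.
// A real target would substitute its own rules per value type and
// address space.
bool MyTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                      unsigned AddrSpace,
                                                      unsigned Align,
                                                      bool *Fast) const {
  // Only simple, concrete types can be checked meaningfully here.
  if (!VT.isSimple() || VT == MVT::Other)
    return false;

  // Assumed policy: 4-byte (or better) aligned accesses are supported and
  // fast; anything less aligned is left for legalization to expand.
  if (Align % 4 == 0) {
    if (Fast)
      *Fast = true;
    return true;
  }
  return false;
}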