Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -66,7 +66,7 @@
   SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;

 protected:
-  bool shouldCombineMemoryType(const MemSDNode *M) const;
+  bool shouldCombineMemoryType(EVT VT) const;
   SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2182,14 +2182,11 @@
   return false;
 }

-bool AMDGPUTargetLowering::shouldCombineMemoryType(const MemSDNode *M) const {
-  EVT VT = M->getMemoryVT();
-
+bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
   // i32 vectors are the canonical memory type.
   if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
     return false;
-
   if (!VT.isByteSized())
     return false;

   unsigned Size = VT.getStoreSize();
@@ -2201,15 +2198,6 @@
   if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
     return false;

-  unsigned Align = M->getAlignment();
-  if (Align < Size) {
-    bool IsFast;
-    if (!allowsMisalignedMemoryAccesses(VT, M->getAddressSpace(), Align, &IsFast) ||
-        !IsFast) {
-      return false;
-    }
-  }
-
   return true;
 }

@@ -2224,12 +2212,32 @@
   if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
     return SDValue();

-  if (!shouldCombineMemoryType(LN))
-    return SDValue();
-
   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;
   EVT VT = LN->getMemoryVT();
+
+  unsigned Size = VT.getStoreSize();
+  unsigned Align = LN->getAlignment();
+  if (Align < Size && isTypeLegal(VT)) {
+    bool IsFast;
+    unsigned AS = LN->getAddressSpace();
+
+    // Expand unaligned loads earlier than legalization. Due to visitation order
+    // problems during legalization, the emitted instructions to pack and unpack
+    // the bytes again are not eliminated in the case of an unaligned copy.
+    if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
+      SDValue Ops[2];
+      std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
+      return DAG.getMergeValues(Ops, SDLoc(N));
+    }
+
+    if (!IsFast)
+      return SDValue();
+  }
+
+  if (!shouldCombineMemoryType(VT))
+    return SDValue();
+
   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);

   SDValue NewLoad
@@ -2252,15 +2260,35 @@
   if (SN->isVolatile() || !ISD::isNormalStore(SN))
     return SDValue();

-  if (!shouldCombineMemoryType(SN))
-    return SDValue();
-
-  SDValue Val = SN->getValue();
   EVT VT = SN->getMemoryVT();
+  unsigned Size = VT.getStoreSize();

   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;
+
+  unsigned Align = SN->getAlignment();
+  if (Align < Size && isTypeLegal(VT)) {
+    bool IsFast;
+    unsigned AS = SN->getAddressSpace();
+
+    // Expand unaligned stores earlier than legalization. Due to visitation
+    // order problems during legalization, the emitted instructions to pack and
+    // unpack the bytes again are not eliminated in the case of an unaligned
+    // copy.
+    if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast))
+      return expandUnalignedStore(SN, DAG);
+
+    if (!IsFast)
+      return SDValue();
+  }
+
+  if (!shouldCombineMemoryType(VT))
+    return SDValue();
+
   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
+  SDValue Val = SN->getValue();
+
+  //DCI.AddToWorklist(Val.getNode());

   bool OtherUses = !Val.hasOneUse();
   SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
Index: test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
===================================================================
--- test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -59,20 +59,20 @@

 ; This should not be adding instructions to shift into the correct
 ; position in the word for the component.
+; FIXME: Packing bytes
 ; SI-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
 ; SI: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
 ; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
 ; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
 ; SI: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
-; SI-NOT: v_lshlrev_b32
-; SI-NOT: v_or_b32
+; SI-DAG: v_lshlrev_b32
+; SI-DAG: v_or_b32
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]],
+; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}},
+; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}},
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]]

-; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG0]]
-; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG1]]
-; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG2]]
-; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]], [[LOADREG3]]
-
-; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+; SI: buffer_store_dwordx4
 define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
   %cvt = uitofp <4 x i8> %load to <4 x float>
Index: test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s
+;
+; EG-LABEL: {{^}}sext_in_reg_v2i1_in_v2i32_other_amount:
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG-NOT: BFE
+; EG: ADD_INT
+; EG: LSHL
+; EG: ASHR [[RES]]
+; EG: LSHL
+; EG: ASHR [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+
+; Works if the align 2 is removed from the store.
+define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+  %c = add <2 x i32> %a, %b
+  %x = shl <2 x i32> %c, <i32 6, i32 6>
+  %y = ashr <2 x i32> %x, <i32 7, i32 7>
+  store <2 x i32> %y, <2 x i32> addrspace(1)* %out, align 2
+  ret void
+}
Index: test/CodeGen/AMDGPU/sext-in-reg.ll
===================================================================
--- test/CodeGen/AMDGPU/sext-in-reg.ll
+++ test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -268,7 +268,7 @@
   %c = add <2 x i32> %a, %b
   %x = shl <2 x i32> %c, <i32 6, i32 6>
   %y = ashr <2 x i32> %x, <i32 7, i32 7>
-  store <2 x i32> %y, <2 x i32> addrspace(1)* %out, align 2
+  store <2 x i32> %y, <2 x i32> addrspace(1)* %out
   ret void
 }
Index: test/CodeGen/AMDGPU/unaligned-load-store.ll
===================================================================
--- test/CodeGen/AMDGPU/unaligned-load-store.ll
+++ test/CodeGen/AMDGPU/unaligned-load-store.ll
@@ -15,7 +15,7 @@
   ret void
 }

-; FUNC-LABEL: {{^}}unaligned_load_store_i16_global:
+; FUNC-LABEL: {{^}}global_unaligned_load_store_i16:
 ; GCN-NOHSA: buffer_load_ubyte
 ; GCN-NOHSA: buffer_load_ubyte
 ; GCN-NOHSA: buffer_store_byte
@@ -25,22 +25,25 @@
 ; GCN-HSA: flat_load_ubyte
 ; GCN-HSA: flat_store_byte
 ; GCN-HSA: flat_store_byte
-define void @unaligned_load_store_i16_global(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
+define void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
   %v = load i16, i16 addrspace(1)* %p, align 1
   store i16 %v, i16 addrspace(1)* %r, align 1
   ret void
 }

 ; FUNC-LABEL: {{^}}local_unaligned_load_store_i32:
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_write_b8
-; GCN: ds_write_b8
-; GCN: ds_write_b8
-; GCN: ds_write_b8
-; GCN: s_endpgm
+
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI-NOT: v_or
+; SI-NOT: v_lshl
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: s_endpgm
 define void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
   %v = load i32, i32 addrspace(3)* %p, align 1
   store i32 %v, i32 addrspace(3)* %r, align 1
@@ -98,141 +101,149 @@
   ret void
 }

-; FIXME: Unnecessary packing and unpacking of bytes.
 ; FUNC-LABEL: {{^}}local_unaligned_load_store_i64:
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-; GCN: ds_write_b8
-; GCN: s_endpgm
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+; SI: ds_write_b8
+; SI: s_endpgm
 define void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(3)* %r) {
   %v = load i64, i64 addrspace(3)* %p, align 1
   store i64 %v, i64 addrspace(3)* %r, align 1
   ret void
 }

-; FUNC-LABEL: {{^}}local_unaligned_load_store_v2i32:
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-; GCN: ds_write_b8
-; GCN: s_endpgm
+; SI-LABEL: {{^}}local_unaligned_load_store_v2i32:
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+; SI: ds_write_b8
+; SI: s_endpgm
 define void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) {
   %v = load <2 x i32>, <2 x i32> addrspace(3)* %p, align 1
   store <2 x i32> %v, <2 x i32> addrspace(3)* %r, align 1
   ret void
 }

-; FUNC-LABEL: {{^}}unaligned_load_store_i64_global:
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
+; SI-LABEL: {{^}}global_align2_load_store_i64:
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort

-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
+; SI-NOT: v_or_
+; SI-NOT: v_lshl

-; XGCN-NOT: v_or_
-; XGCN-NOT: v_lshl
+; SI: buffer_load_ushort

-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
+; SI-NOT: v_or_
+; SI-NOT: v_lshl

-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
+; SI: buffer_load_ushort
+
+; SI-NOT: v_or_
+; SI-NOT: v_lshl
+
+; SI: buffer_store_short
+; SI: buffer_store_short
+; SI: buffer_store_short
+; SI: buffer_store_short
+define void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) {
+  %v = load i64, i64 addrspace(1)* %p, align 2
+  store i64 %v, i64 addrspace(1)* %r, align 2
+  ret void
+}
+
+; SI-LABEL: {{^}}unaligned_load_store_i64_global:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; SI-NOT: v_or_
+; SI-NOT: v_lshl
+
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) {
   %v = load i64, i64 addrspace(1)* %p, align 1
   store i64 %v, i64 addrspace(1)* %r, align 1
   ret void
@@ -285,76 +296,41 @@
   ret void
 }

-; FUNC-LABEL: {{^}}global_unaligned_load_store_v4i32:
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-
-
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-define void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 {
+; SI-LABEL: {{^}}global_unaligned_load_store_v4i32:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+define void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) nounwind {
   %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1
   store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1
   ret void
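
For reference, a condensed sketch of the control flow both combines follow after this change. This is illustrative only and not part of the patch: MemOp stands for the LoadSDNode/StoreSDNode, and expandEarly is a placeholder for the early-expansion call used above; everything else uses the names from the patch. The under-alignment check has to run before the shouldCombineMemoryType check, because shouldCombineMemoryType rejects legal types while the early expansion only applies when the type is legal.

  // Sketch only (not applied code): condensed from performLoadCombine /
  // performStoreCombine above. MemOp and expandEarly are placeholders.
  EVT VT = MemOp->getMemoryVT();
  unsigned Size = VT.getStoreSize();
  unsigned Align = MemOp->getAlignment();

  if (Align < Size && isTypeLegal(VT)) {
    bool IsFast;
    unsigned AS = MemOp->getAddressSpace();

    // The target reports this under-aligned access as unsupported: split it
    // now, while later combines can still clean up the byte pack/unpack code,
    // instead of leaving it to the legalizer.
    // Store side: expandUnalignedStore; load side: expandUnalignedLoad
    // followed by DAG.getMergeValues on the returned value/chain pair.
    if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast))
      return expandEarly(MemOp, DAG);

    // Supported but slow: keep the original access rather than widening it.
    if (!IsFast)
      return SDValue();
  }

  // Only then consider rewriting the access to the equivalent i32 memory type.
  if (!shouldCombineMemoryType(VT))
    return SDValue();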