Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2187,7 +2187,7 @@
     return SDValue();
 
   EVT VT = LN->getValueType(0);
-  if (isTypeLegal(VT) || !VT.isByteSized() || VT.getScalarType() == MVT::i32)
+  if (!VT.isByteSized())
     return SDValue();
 
   // TODO: Is Size == 2 also preferable?
@@ -2195,17 +2195,29 @@
   if (Size < 4 || (Size > 4 && Size % 4 != 0))
     return SDValue();
 
+  SelectionDAG &DAG = DCI.DAG;
   unsigned Align = LN->getAlignment();
-  if (Align < Size) {
+  if (Align < Size && isTypeLegal(VT)) {
     bool IsFast;
-    if (!allowsMisalignedMemoryAccesses(VT, LN->getAddressSpace(), Align, &IsFast) ||
-        !IsFast) {
-      return SDValue();
+    unsigned AS = LN->getAddressSpace();
+
+    // Expand unaligned loads earlier than legalization. Due to visitation order
+    // problems during legalization, the emitted instructions to pack and unpack
+    // the bytes again are not eliminated in the case of an unaligned copy.
+    if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
+      SDValue Ops[2];
+      std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
+      return DAG.getMergeValues(Ops, SDLoc(N));
     }
+
+    if (!IsFast)
+      return SDValue();
   }
 
+  if (isTypeLegal(VT) || VT.getScalarType() == MVT::i32)
+    return SDValue();
+
   SDLoc SL(N);
-  SelectionDAG &DAG = DCI.DAG;
 
   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
   SDValue NewLoad
@@ -2230,26 +2242,36 @@
   SDValue Val = SN->getValue();
   EVT VT = Val.getValueType();
 
-  if (isTypeLegal(VT) || !VT.isByteSized())
+  if (!VT.isByteSized())
     return SDValue();
 
   unsigned Size = VT.getStoreSize();
   if (Size < 4 || (Size > 4 && Size % 4 != 0))
     return SDValue();
 
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
   unsigned Align = SN->getAlignment();
-  if (Align < Size) {
+  if (Align < Size && isTypeLegal(VT)) {
     bool IsFast;
-    if (!allowsMisalignedMemoryAccesses(VT, SN->getAddressSpace(), Align, &IsFast) ||
-        !IsFast) {
+    unsigned AS = SN->getAddressSpace();
+
+    // Expand unaligned stores earlier than legalization. Due to visitation
+    // order problems during legalization, the emitted instructions to pack and
+    // unpack the bytes again are not eliminated in the case of an unaligned
+    // copy.
+    if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast))
+      return expandUnalignedStore(SN, DAG);
+
+    if (!IsFast)
       return SDValue();
-    }
   }
 
-  SDLoc SL(N);
-  SelectionDAG &DAG = DCI.DAG;
-  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
+  if (isTypeLegal(VT) || VT.getScalarType() == MVT::i32)
+    return SDValue();
+
+  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
+  //DCI.AddToWorklist(Val.getNode());
 
   bool OtherUses = !Val.hasOneUse();
   SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
   if (OtherUses) {
Index: test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
===================================================================
--- test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -61,20 +61,20 @@
 
 ; This should not be adding instructions to shift into the correct
 ; position in the word for the component.
+; FIXME: Packing bytes
 ; SI-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
 ; SI: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
 ; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
 ; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
 ; SI: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
-; SI-NOT: v_lshlrev_b32
-; SI-NOT: v_or_b32
+; SI-DAG: v_lshlrev_b32
+; SI-DAG: v_or_b32
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]],
+; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}},
+; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}},
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]]
-; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG0]]
-; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG1]]
-; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG2]]
-; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]], [[LOADREG3]]
-
-; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+; SI: buffer_store_dwordx4
 define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
   %cvt = uitofp <4 x i8> %load to <4 x float>
Index: test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll
@@ -0,0 +1,28 @@
+; XFAIL: *
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s
+;
+; FUNC-LABEL: {{^}}sext_in_reg_v2i1_in_v2i32_other_amount:
+; SI-NOT: s_lshl
+; SI-NOT: s_ashr
+; SI-DAG: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001
+; SI-DAG: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001
+; SI: s_endpgm
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG-NOT: BFE
+; EG: ADD_INT
+; EG: LSHL
+; EG: ASHR [[RES]]
+; EG: LSHL
+; EG: ASHR [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+
+; Works with the align 2 removed
+define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+  %c = add <2 x i32> %a, %b
+  %x = shl <2 x i32> %c, <i32 6, i32 6>
+  %y = ashr <2 x i32> %x, <i32 7, i32 7>
+  store <2 x i32> %y, <2 x i32> addrspace(1)* %out, align 2
+  ret void
+}
+
Index: test/CodeGen/AMDGPU/sext-in-reg.ll
===================================================================
--- test/CodeGen/AMDGPU/sext-in-reg.ll
+++ test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -268,7 +268,7 @@
   %c = add <2 x i32> %a, %b
   %x = shl <2 x i32> %c, <i32 6, i32 6>
   %y = ashr <2 x i32> %x, <i32 7, i32 7>
-  store <2 x i32> %y, <2 x i32> addrspace(1)* %out, align 2
+  store <2 x i32> %y, <2 x i32> addrspace(1)* %out
   ret void
 }
 
Index: test/CodeGen/AMDGPU/unaligned-load-store.ll
===================================================================
--- test/CodeGen/AMDGPU/unaligned-load-store.ll
+++ test/CodeGen/AMDGPU/unaligned-load-store.ll
@@ -30,6 +30,8 @@
 ; SI: ds_read_u8
 ; SI: ds_read_u8
 ; SI: ds_read_u8
+; SI-NOT: v_or
+; SI-NOT: v_lshl
 ; SI: ds_write_b8
 ; SI: ds_write_b8
 ; SI: ds_write_b8
@@ -78,7 +80,6 @@
   ret void
 }
 
-; FIXME: Unnecessary packing and unpacking of bytes.
 ; SI-LABEL: {{^}}unaligned_load_store_i64_local:
 ; SI: ds_read_u8
 ; SI: ds_read_u8
@@ -89,35 +90,35 @@
 ; SI: ds_read_u8
 ; SI: ds_read_u8
 
-; XSI-NOT: v_or_b32
-; XSI-NOT: v_lshl
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
 ; SI: ds_write_b8
-; XSI-NOT: v_or_b32
-; XSI-NOT: v_lshl
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
 ; SI: ds_write_b8
-; XSI-NOT: v_or_b32
-; XSI-NOT: v_lshl
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
 ; SI: ds_write_b8
-; XSI-NOT: v_or_b32
-; XSI-NOT: v_lshl
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
 ; SI: ds_write_b8
-; XSI-NOT: v_or_b32
-; XSI-NOT: v_lshl
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
 ; SI: ds_write_b8
-; XSI-NOT: v_or_b32
-; XSI-NOT: v_lshl
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
 ; SI: ds_write_b8
-; XSI-NOT: v_or_b32
-; XSI-NOT: v_lshl
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
 ; SI: ds_write_b8
-; XSI-NOT: v_or_b32
-; XSI-NOT: v_lshl
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
 ; SI: ds_write_b8
 ; SI: s_endpgm
 define void @unaligned_load_store_i64_local(i64 addrspace(3)* %p, i64 addrspace(3)* %r) {
@@ -136,35 +137,35 @@
 ; SI: ds_read_u8
 ; SI: ds_read_u8
 
-; XSI-NOT: v_or_b32
-; XSI-NOT: v_lshl
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
 ; SI: ds_write_b8
-; XSI-NOT: v_or_b32
-; XSI-NOT: v_lshl
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
 ; SI: ds_write_b8
-; XSI-NOT: v_or_b32
-; XSI-NOT: v_lshl
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
 ; SI: ds_write_b8
-; XSI-NOT: v_or_b32
-; XSI-NOT: v_lshl
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
 ; SI: ds_write_b8
-; XSI-NOT: v_or_b32
-; XSI-NOT: v_lshl
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
 ; SI: ds_write_b8
-; XSI-NOT: v_or_b32
-; XSI-NOT: v_lshl
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
 ; SI: ds_write_b8
-; XSI-NOT: v_or_b32
-; XSI-NOT: v_lshl
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
 ; SI: ds_write_b8
-; XSI-NOT: v_or_b32
-; XSI-NOT: v_lshl
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
 ; SI: ds_write_b8
 ; SI: s_endpgm
 define void @unaligned_load_store_v2i32_local(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) {
@@ -173,6 +174,33 @@
   ret void
 }
 
+; SI-LABEL: {{^}}align2_load_store_i64_global:
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+
+; SI-NOT: v_or_
+; SI-NOT: v_lshl
+
+; SI: buffer_load_ushort
+
+; SI-NOT: v_or_
+; SI-NOT: v_lshl
+
+; SI: buffer_load_ushort
+
+; SI-NOT: v_or_
+; SI-NOT: v_lshl
+
+; SI: buffer_store_short
+; SI: buffer_store_short
+; SI: buffer_store_short
+; SI: buffer_store_short
+define void @align2_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) {
+  %v = load i64, i64 addrspace(1)* %p, align 2
+  store i64 %v, i64 addrspace(1)* %r, align 2
+  ret void
+}
+
 ; SI-LABEL: {{^}}unaligned_load_store_i64_global:
 ; SI: buffer_load_ubyte
 ; SI: buffer_load_ubyte
@@ -183,8 +211,8 @@
 ; SI: buffer_load_ubyte
 ; SI: buffer_load_ubyte
 
-; XSI-NOT: v_or_
-; XSI-NOT: v_lshl
+; SI-NOT: v_or_
+; SI-NOT: v_lshl
 
 ; SI: buffer_store_byte
 ; SI: buffer_store_byte
@@ -247,24 +275,40 @@
   ret void
 }
 
-; FIXME: We mark v4i32 as custom, so misaligned loads are never expanded.
-; FIXME-SI-LABEL: {{^}}unaligned_load_store_v4i32_global
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
+; SI-LABEL: {{^}}unaligned_load_store_v4i32_global
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
 define void @unaligned_load_store_v4i32_global(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) nounwind {
   %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1
   store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1