Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2284,7 +2284,8 @@
         }
         if (bestWidth) {
           EVT newVT = EVT::getIntegerVT(*DAG.getContext(), bestWidth);
-          if (newVT.isRound()) {
+          if (newVT.isRound() &&
+              shouldReduceLoadWidth(Lod, ISD::NON_EXTLOAD, newVT)) {
             EVT PtrType = Lod->getOperand(1).getValueType();
             SDValue Ptr = Lod->getBasePtr();
             if (bestOffset != 0)
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -667,6 +667,16 @@
   EVT OldVT = N->getValueType(0);
   unsigned OldSize = OldVT.getStoreSizeInBits();
 
+  MemSDNode *MN = cast<MemSDNode>(N);
+  unsigned AS = MN->getAddressSpace();
+  // Do not shrink an aligned scalar load to sub-dword.
+  // Scalar engine cannot do sub-dword loads.
+  if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
+      AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()) &&
+      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
+       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT))
+    return false;
+
   // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
   // extloads, so doing one requires using a buffer_load. In cases where we
   // still couldn't use a scalar load, using the wider load shouldn't really
Index: test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll
@@ -0,0 +1,65 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}const_load_no_shrink_dword_to_unaligned_byte:
+; GCN: s_load_dword [[LD:s[0-9]+]],
+; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10013
+define amdgpu_kernel void @const_load_no_shrink_dword_to_unaligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) {
+  %ptr = getelementptr i32, i32 addrspace(4)* %in, i32 %x
+  %load = load i32, i32 addrspace(4)* %ptr, align 4
+  %and = and i32 %load, 524288
+  %cmp = icmp eq i32 %and, 0
+  %sel = select i1 %cmp, i32 0, i32 -1
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: const_load_no_shrink_dword_to_aligned_byte:
+; GCN: s_load_dword [[LD:s[0-9]+]],
+; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10003
+define amdgpu_kernel void @const_load_no_shrink_dword_to_aligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) {
+  %ptr = getelementptr i32, i32 addrspace(4)* %in, i32 %x
+  %load = load i32, i32 addrspace(4)* %ptr, align 4
+  %and = and i32 %load, 8
+  %cmp = icmp eq i32 %and, 0
+  %sel = select i1 %cmp, i32 0, i32 -1
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: global_load_no_shrink_dword_to_unaligned_byte:
+; GCN: s_load_dword [[LD:s[0-9]+]],
+; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10013
+define amdgpu_kernel void @global_load_no_shrink_dword_to_unaligned_byte(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %x) {
+  %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %x
+  %load = load i32, i32 addrspace(1)* %ptr, align 4
+  %and = and i32 %load, 524288
+  %cmp = icmp eq i32 %and, 0
+  %sel = select i1 %cmp, i32 0, i32 -1
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: global_load_no_shrink_dword_to_aligned_byte:
+; GCN: s_load_dword [[LD:s[0-9]+]],
+; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10003
+define amdgpu_kernel void @global_load_no_shrink_dword_to_aligned_byte(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %x) {
+  %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %x
+  %load = load i32, i32 addrspace(1)* %ptr, align 4
+  %and = and i32 %load, 8
+  %cmp = icmp eq i32 %and, 0
+  %sel = select i1 %cmp, i32 0, i32 -1
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: const_load_shrink_dword_to_unaligned_byte:
+; GCN: global_load_ushort
+define amdgpu_kernel void @const_load_shrink_dword_to_unaligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) {
+  %ptr = getelementptr i32, i32 addrspace(4)* %in, i32 %x
+  %load = load i32, i32 addrspace(4)* %ptr, align 2
+  %and = and i32 %load, 524288
+  %cmp = icmp eq i32 %and, 0
+  %sel = select i1 %cmp, i32 0, i32 -1
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}