Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -753,6 +753,16 @@
   /// reduce runtime.
   virtual bool ShouldShrinkFPConstant(EVT) const { return true; }
 
+  /// Return true if it is profitable to reduce the given load node to a smaller
+  /// type.
+  ///
+  /// e.g. (i16 (trunc (i32 (load x)))) -> i16 load x should be performed
+  virtual bool shouldReduceLoadWidth(SDNode *Load,
+                                     ISD::LoadExtType ExtTy,
+                                     EVT NewVT) const {
+    return true;
+  }
+
   /// When splitting a value of the specified type into parts, does the Lo
   /// or Hi part come first? This usually follows the endianness, except
   /// for ppcf128, where the Hi part always comes first.
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6035,6 +6035,9 @@
         LN0->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits() + ShAmt)
       return SDValue();
 
+    if (!TLI.shouldReduceLoadWidth(LN0, ExtType, ExtVT))
+      return SDValue();
+
     EVT PtrType = N0.getOperand(1).getValueType();
     if (PtrType == MVT::Untyped || PtrType.isExtended())
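[Illustrative aside, not part of the patch: a standalone C++ sketch of the control flow the DAGCombiner change introduces. All names here are invented for illustration; ShouldReduceHook stands in for TLI.shouldReduceLoadWidth, and plain bit widths stand in for the SDNode/EVT arguments.]

#include <cstdio>

// Stand-in for the target hook; like the base-class default above, this
// version always allows the narrowing.
typedef bool (*ShouldReduceHook)(unsigned OldBits, unsigned NewBits);

static bool alwaysReduce(unsigned, unsigned) { return true; }

// Models ReduceLoadWidth on (iNew (trunc (iOld (load x)))): rewrite to a
// narrower load only when the target hook agrees; otherwise bail out and
// keep the wide load, mirroring the new early return above.
static void reduceLoadWidth(unsigned OldBits, unsigned NewBits,
                            ShouldReduceHook Hook) {
  if (NewBits >= OldBits || !Hook(OldBits, NewBits)) {
    std::printf("i%u load kept at full width\n", OldBits);
    return;
  }
  std::printf("i%u load narrowed to i%u\n", OldBits, NewBits);
}

int main() {
  reduceLoadWidth(32, 16, alwaysReduce); // prints: i32 load narrowed to i16
  return 0;
}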
Index: lib/Target/R600/AMDGPUISelLowering.h
===================================================================
--- lib/Target/R600/AMDGPUISelLowering.h
+++ lib/Target/R600/AMDGPUISelLowering.h
@@ -124,6 +124,9 @@
   bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
   bool ShouldShrinkFPConstant(EVT VT) const override;
+  bool shouldReduceLoadWidth(SDNode *Load,
+                             ISD::LoadExtType ExtType,
+                             EVT ExtVT) const override;
   bool isLoadBitCastBeneficial(EVT, EVT) const override;
 
   SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
Index: lib/Target/R600/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/R600/AMDGPUISelLowering.cpp
+++ lib/Target/R600/AMDGPUISelLowering.cpp
@@ -429,6 +429,29 @@
   return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
 }
 
+bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
+                                                 ISD::LoadExtType,
+                                                 EVT NewVT) const {
+
+  unsigned NewSize = NewVT.getStoreSizeInBits();
+
+  // If we are reducing to a 32-bit load, this is always better.
+  if (NewSize == 32)
+    return true;
+
+  EVT OldVT = N->getValueType(0);
+  unsigned OldSize = OldVT.getStoreSizeInBits();
+
+  // Don't produce extloads from sub-32-bit types. SI doesn't have scalar
+  // extloads, so doing one requires using a buffer_load. In cases where we
+  // still couldn't use a scalar load, using the wider load shouldn't really
+  // hurt anything.
+  //
+  // If the old size already had to be an extload, there's no harm in
+  // continuing to reduce the width.
+  return (OldSize < 32);
+}
+
 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
                                                    EVT CastTy) const {
   if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
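[Illustrative aside, not part of the patch: the AMDGPU override above reduces to a pure function of the old and new widths, so its behavior can be tabulated. A runnable C++ sketch, with plain bit widths assumed in place of the EVT queries:]

#include <cstdio>

// Mirrors the decision in AMDGPUTargetLowering::shouldReduceLoadWidth.
static bool shouldReduceLoadWidth(unsigned OldSize, unsigned NewSize) {
  if (NewSize == 32)
    return true;       // reducing to a 32-bit load is always better
  return OldSize < 32; // only keep shrinking loads that were already extloads
}

int main() {
  const struct { unsigned Old, New; } Cases[] = {
      {32, 16}, {32, 8}, {64, 32}, {64, 16}, {16, 8}};
  for (const auto &C : Cases)
    std::printf("i%u -> i%u: %s\n", C.Old, C.New,
                shouldReduceLoadWidth(C.Old, C.New) ? "reduce" : "keep wide");
  return 0;
}

[This matches the new tests below: i32 -> i16/i8 loads stay wide, i64 -> i32 still narrows, and i16 -> i8 still becomes a byte load.]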
Index: test/CodeGen/R600/no-shrink-extloads.ll
===================================================================
--- /dev/null
+++ test/CodeGen/R600/no-shrink-extloads.ll
@@ -0,0 +1,191 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; Make sure we don't turn the 32-bit argument load into a 16-bit
+; load. There are no extending scalar loads, so that would require
+; using a buffer_load instruction.
+
+; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i16:
+; SI: s_load_dword s
+; SI: buffer_store_short v
+define void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind {
+  %trunc = trunc i32 %arg to i16
+  store i16 %trunc, i16 addrspace(1)* %out
+  ret void
+}
+
+; It should be OK (and probably performance neutral) to reduce this,
+; but we don't know if the load is uniform yet.
+
+; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i16:
+; SI: buffer_load_dword v
+; SI: buffer_store_short v
+define void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.in = getelementptr i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i16 addrspace(1)* %out, i32 %tid
+  %load = load i32 addrspace(1)* %gep.in
+  %trunc = trunc i32 %load to i16
+  store i16 %trunc, i16 addrspace(1)* %gep.out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i8:
+; SI: s_load_dword s
+; SI: buffer_store_byte v
+define void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind {
+  %trunc = trunc i32 %arg to i8
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i8:
+; SI: buffer_load_dword v
+; SI: buffer_store_byte v
+define void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.in = getelementptr i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i8 addrspace(1)* %out, i32 %tid
+  %load = load i32 addrspace(1)* %gep.in
+  %trunc = trunc i32 %load to i8
+  store i8 %trunc, i8 addrspace(1)* %gep.out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i1:
+; SI: s_load_dword s
+; SI: buffer_store_byte v
+define void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind {
+  %trunc = trunc i32 %arg to i1
+  store i1 %trunc, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i1:
+; SI: buffer_load_dword v
+; SI: buffer_store_byte v
+define void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.in = getelementptr i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i1 addrspace(1)* %out, i32 %tid
+  %load = load i32 addrspace(1)* %gep.in
+  %trunc = trunc i32 %load to i1
+  store i1 %trunc, i1 addrspace(1)* %gep.out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32:
+; SI: s_load_dword s
+; SI: buffer_store_dword v
+define void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
+  %trunc = trunc i64 %arg to i32
+  store i32 %trunc, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i32:
+; SI: buffer_load_dword v
+; SI: buffer_store_dword v
+define void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.in = getelementptr i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %load = load i64 addrspace(1)* %gep.in
+  %trunc = trunc i64 %load to i32
+  store i32 %trunc, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32:
+; SI: s_load_dword s
+; SI: buffer_store_dword v
+define void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
+  %srl = lshr i64 %arg, 32
+  %trunc = trunc i64 %srl to i32
+  store i32 %trunc, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i32:
+; SI: buffer_load_dword v
+; SI: buffer_store_dword v
+define void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.in = getelementptr i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %load = load i64 addrspace(1)* %gep.in
+  %srl = lshr i64 %load, 32
+  %trunc = trunc i64 %srl to i32
+  store i32 %trunc, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; Might as well reduce to 8-bit loads.
+; FUNC-LABEL: {{^}}truncate_kernarg_i16_to_i8:
+; SI: s_load_dword s
+; SI: buffer_store_byte v
+define void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind {
+  %trunc = trunc i16 %arg to i8
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_buffer_load_i16_to_i8:
+; SI: buffer_load_ubyte v
+; SI: buffer_store_byte v
+define void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.in = getelementptr i16 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i8 addrspace(1)* %out, i32 %tid
+  %load = load i16 addrspace(1)* %gep.in
+  %trunc = trunc i16 %load to i8
+  store i8 %trunc, i8 addrspace(1)* %gep.out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8:
+; SI: s_load_dword s
+; SI: buffer_store_byte v
+define void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
+  %srl = lshr i64 %arg, 32
+  %trunc = trunc i64 %srl to i8
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i8:
+; SI: buffer_load_dword v
+; SI: buffer_store_byte v
+define void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.in = getelementptr i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i8 addrspace(1)* %out, i32 %tid
+  %load = load i64 addrspace(1)* %gep.in
+  %srl = lshr i64 %load, 32
+  %trunc = trunc i64 %srl to i8
+  store i8 %trunc, i8 addrspace(1)* %gep.out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8:
+; SI: s_load_dword s
+; SI: buffer_store_byte v
+define void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
+  %trunc = trunc i64 %arg to i8
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i8:
+; SI: buffer_load_dword v
+; SI: buffer_store_byte v
+define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.in = getelementptr i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i8 addrspace(1)* %out, i32 %tid
+  %load = load i64 addrspace(1)* %gep.in
+  %trunc = trunc i64 %load to i8
+  store i8 %trunc, i8 addrspace(1)* %gep.out
+  ret void
+}
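[Illustrative aside, not part of the patch: why the srl_* tests above still expect a single buffer_load_dword. When ReduceLoadWidth fires with a nonzero shift amount, (i32 (trunc (srl (i64 load), 32))) becomes a 32-bit load of the high half at a byte offset. A host-side C++ equivalent of that rewrite, assuming little-endian layout as on SI:]

#include <cstdint>
#include <cstdio>
#include <cstring>

// Narrowed form: load only the high 32-bit half, at byte offset 4.
static uint32_t loadHighHalf(const uint64_t *P) {
  uint32_t Hi;
  std::memcpy(&Hi, reinterpret_cast<const char *>(P) + 4, sizeof(Hi));
  return Hi;
}

int main() {
  uint64_t V = 0x1122334455667788ull;
  // Both lines print 0x11223344: the narrow load at an offset computes the
  // same value as the wide load followed by srl and trunc.
  std::printf("0x%08x\n", (unsigned)loadHighHalf(&V));
  std::printf("0x%08x\n", (unsigned)(V >> 32));
  return 0;
}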
Index: test/CodeGen/R600/store.ll
===================================================================
--- test/CodeGen/R600/store.ll
+++ test/CodeGen/R600/store.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK --check-prefix=FUNC %s
-; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=CM-CHECK --check-prefix=FUNC %s
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK --check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI-CHECK -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG-CHECK -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM-CHECK -check-prefix=FUNC %s
 
 ;===------------------------------------------------------------------------===;
 ; Global Address Space
@@ -17,16 +17,18 @@
 ; i8 store
 ; EG-CHECK-LABEL: {{^}}store_i8:
 ; EG-CHECK: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
-; EG-CHECK: VTX_READ_8 [[VAL:T[0-9]\.X]], [[VAL]]
+
 ; IG 0: Get the byte index and truncate the value
-; EG-CHECK: AND_INT T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
-; EG-CHECK-NEXT: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], [[VAL]], literal.y
+; EG-CHECK: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
+; EG-CHECK: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
+; EG-CHECK: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y
 ; EG-CHECK-NEXT: 3(4.203895e-45), 255(3.573311e-43)
+
+
 ; IG 1: Truncate the calculated the shift amount for the mask
-; EG-CHECK: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
-; EG-CHECK-NEXT: 3
+
 ; IG 2: Shift the value and the mask
-; EG-CHECK: LSHL T[[RW_GPR]].X, T{{[0-9]}}.[[TRUNC_CHAN]], PV.[[SHIFT_CHAN]]
+; EG-CHECK: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]]
 ; EG-CHECK: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
 ; EG-CHECK-NEXT: 255
 ; IG 3: Initialize the Y and Z channels to zero
@@ -46,16 +48,21 @@
 ; i16 store
 ; EG-CHECK-LABEL: {{^}}store_i16:
 ; EG-CHECK: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
-; EG-CHECK: VTX_READ_16 [[VAL:T[0-9]\.X]], [[VAL]]
+
 ; IG 0: Get the byte index and truncate the value
-; EG-CHECK: AND_INT T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
-; EG-CHECK: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], [[VAL]], literal.y
+
+
+; EG-CHECK: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
+; EG-CHECK-NEXT: 3(4.203895e-45),
+
+; EG-CHECK: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
+; EG-CHECK: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y
+
 ; EG-CHECK-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
 ; IG 1: Truncate the calculated the shift amount for the mask
-; EG-CHECK: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
-; EG-CHECK: 3
+
 ; IG 2: Shift the value and the mask
-; EG-CHECK: LSHL T[[RW_GPR]].X, T{{[0-9]}}.[[TRUNC_CHAN]], PV.[[SHIFT_CHAN]]
+; EG-CHECK: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]]
 ; EG-CHECK: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
 ; EG-CHECK-NEXT: 65535
 ; IG 3: Initialize the Y and Z channels to zero