Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4240,8 +4240,17 @@
         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
         (I >= Intr->CoordStart && !IsA16)) {
       // Handle any gradient or coordinate operands that should not be packed
-      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
-      PackedAddrs.push_back(AddrReg);
+      if ((I < Intr->GradientStart) && IsA16 &&
+          (B.getMRI()->getType(AddrReg) == S16)) {
+        // Special handling of bias when A16 is on. Bias is of type half but
+        // occupies the full 32 bits.
+        PackedAddrs.push_back(
+            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
+                .getReg(0));
+      } else {
+        AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
+        PackedAddrs.push_back(AddrReg);
+      }
     } else {
       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
       // derivatives dx/dh and dx/dv are packed with undef.
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6209,10 +6209,6 @@
     }
   }

-  // Push back extra arguments.
-  for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++)
-    VAddrs.push_back(Op.getOperand(ArgOffset + I));
-
   // Check for 16 bit addresses or derivatives and pack if true.
   MVT VAddrVT =
       Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
@@ -6225,6 +6221,19 @@
   MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
   IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;

+  // Push back extra arguments.
+  for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
+    if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
+      // Special handling of bias when A16 is on. Bias is of type half but
+      // occupies the full 32 bits.
+      SDValue Bias = DAG.getBuildVector(
+          MVT::v2f16, DL,
+          {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
+      VAddrs.push_back(Bias);
+    } else
+      VAddrs.push_back(Op.getOperand(ArgOffset + I));
+  }
+
   if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
     // 16 bit gradients are supported, but are tied to the A16 control
     // so both gradients and addresses must be 16 bit
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll
@@ -6,7 +6,7 @@
   ; GFX9-LABEL: name: sample_1d
   ; GFX9: bb.1.main_body:
   ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0
-  ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
   ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
   ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
   ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
@@ -32,7 +32,7 @@
   ; GFX10-LABEL: name: sample_1d
   ; GFX10: bb.1.main_body:
   ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0
-  ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
   ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
   ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
   ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
@@ -64,7 +64,7 @@
   ; GFX9-LABEL: name: sample_2d
   ; GFX9: bb.1.main_body:
   ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1
-  ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
   ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
   ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
   ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
@@ -91,7 +91,7 @@
   ; GFX10-LABEL: name: sample_2d
   ; GFX10: bb.1.main_body:
   ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1
-  ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
   ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
   ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
   ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
@@ -124,7 +124,7 @@
   ; GFX9-LABEL: name: sample_3d
   ; GFX9: bb.1.main_body:
   ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
-  ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
   ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
   ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
   ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
@@ -155,7 +155,7 @@
   ; GFX10-LABEL: name: sample_3d
   ; GFX10: bb.1.main_body:
   ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
-  ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
   ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
   ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
   ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
@@ -192,7 +192,7 @@
   ; GFX9-LABEL: name: sample_cube
   ; GFX9: bb.1.main_body:
   ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11,
$sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -223,7 +223,7 @@ ; GFX10-LABEL: name: sample_cube ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -260,7 +260,7 @@ ; GFX9-LABEL: name: sample_1darray ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -287,7 +287,7 @@ ; GFX10-LABEL: name: sample_1darray ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -320,7 +320,7 @@ ; GFX9-LABEL: name: sample_2darray ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -351,7 +351,7 @@ ; GFX10-LABEL: name: sample_2darray ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -388,7 +388,7 @@ ; GFX9-LABEL: name: sample_c_1d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -418,7 +418,7 @@ ; GFX10-LABEL: name: sample_c_1d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -454,7 +454,7 @@ ; GFX9-LABEL: name: sample_c_2d ; GFX9: bb.1.main_body: ; GFX9: liveins: 
$sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -484,7 +484,7 @@ ; GFX10-LABEL: name: sample_c_2d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -520,7 +520,7 @@ ; GFX9-LABEL: name: sample_cl_1d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -547,7 +547,7 @@ ; GFX10-LABEL: name: sample_cl_1d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -580,7 +580,7 @@ ; GFX9-LABEL: name: sample_cl_2d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -611,7 +611,7 @@ ; GFX10-LABEL: name: sample_cl_2d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -648,7 +648,7 @@ ; GFX9-LABEL: name: sample_c_cl_1d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -678,7 +678,7 @@ ; GFX10-LABEL: name: sample_c_cl_1d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY 
$sgpr5 @@ -714,7 +714,7 @@ ; GFX9-LABEL: name: sample_c_cl_2d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -747,7 +747,7 @@ ; GFX10-LABEL: name: sample_c_cl_2d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -781,11 +781,11 @@ ret <4 x float> %v } -define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s) { +define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s) { ; GFX9-LABEL: name: sample_b_1d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -801,10 +801,10 @@ ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) @@ -815,7 +815,7 @@ ; GFX10-LABEL: name: sample_b_1d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ 
-831,10 +831,10 @@ ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) - ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10: $vgpr0 = COPY [[UV]](s32) @@ -843,15 +843,15 @@ ; GFX10: $vgpr3 = COPY [[UV3]](s32) ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f16(i32 15, float %bias, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %v = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f16.f16(i32 15, half %bias, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v } -define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t) { +define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s, half %t) { ; GFX9-LABEL: name: sample_b_2d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -868,9 +868,10 @@ ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX9: 
[[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) @@ -881,7 +882,7 @@ ; GFX10-LABEL: name: sample_b_2d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -898,9 +899,10 @@ ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) - ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10: $vgpr0 = COPY [[UV]](s32) @@ -909,15 +911,15 @@ ; GFX10: $vgpr3 = COPY [[UV3]](s32) ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f16(i32 15, float %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %v = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f16.f16(i32 15, half %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v } -define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s) { +define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s) { ; GFX9-LABEL: name: sample_c_b_1d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: 
[[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -934,11 +936,11 @@ ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) @@ -949,7 +951,7 @@ ; GFX10-LABEL: name: sample_c_b_1d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -966,11 +968,11 @@ ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.1d), 15, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom 
"ImageResource") ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10: $vgpr0 = COPY [[UV]](s32) ; GFX10: $vgpr1 = COPY [[UV1]](s32) @@ -978,15 +980,15 @@ ; GFX10: $vgpr3 = COPY [[UV3]](s32) ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f16(i32 15, float %bias, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v } -define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t) { +define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s, half %t) { ; GFX9-LABEL: name: sample_c_b_2d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1004,10 +1006,11 @@ ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) @@ -1018,7 +1021,7 @@ ; GFX10-LABEL: name: sample_c_b_2d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: 
[[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1036,10 +1039,11 @@ ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) - ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.2d), 15, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10: $vgpr0 = COPY [[UV]](s32) ; GFX10: $vgpr1 = COPY [[UV1]](s32) @@ -1047,15 +1051,15 @@ ; GFX10: $vgpr3 = COPY [[UV3]](s32) ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f16(i32 15, float %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v } -define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %clamp) { +define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s, half %clamp) { ; GFX9-LABEL: name: sample_b_cl_1d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1072,9 +1076,10 @@ ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF 
+ ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) @@ -1085,7 +1090,7 @@ ; GFX10-LABEL: name: sample_b_cl_1d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1102,9 +1107,10 @@ ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) - ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10: $vgpr0 = COPY [[UV]](s32) @@ -1113,15 +1119,15 @@ ; GFX10: $vgpr3 = COPY [[UV3]](s32) ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f16(i32 15, float %bias, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f16.f16(i32 15, half %bias, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v } -define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t, half %clamp) { +define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s, half %t, half %clamp) { ; 
GFX9-LABEL: name: sample_b_cl_2d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1139,11 +1145,11 @@ ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) @@ -1154,7 +1160,7 @@ ; GFX10-LABEL: name: sample_b_cl_2d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1172,11 +1178,11 @@ ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) 
= G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10: $vgpr0 = COPY [[UV]](s32) ; GFX10: $vgpr1 = COPY [[UV1]](s32) @@ -1184,15 +1190,15 @@ ; GFX10: $vgpr3 = COPY [[UV3]](s32) ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f16(i32 15, float %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f16.f16(i32 15, half %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v } -define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %clamp) { +define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s, half %clamp) { ; GFX9-LABEL: name: sample_c_b_cl_1d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1210,10 +1216,11 @@ ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x 
s32>) from custom "ImageResource") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) @@ -1224,7 +1231,7 @@ ; GFX10-LABEL: name: sample_c_b_cl_1d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1242,10 +1249,11 @@ ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) - ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10: $vgpr0 = COPY [[UV]](s32) ; GFX10: $vgpr1 = COPY [[UV1]](s32) @@ -1253,15 +1261,15 @@ ; GFX10: $vgpr3 = COPY [[UV3]](s32) ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f16(i32 15, float %bias, float %zcompare, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v } -define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t, half %clamp) { +define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s, half %t, half %clamp) { ; GFX9-LABEL: name: sample_c_b_cl_2d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - 
; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1280,12 +1288,12 @@ ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) @@ -1296,7 +1304,7 @@ ; GFX10-LABEL: name: sample_c_b_cl_2d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1315,12 +1323,12 @@ ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) - ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: 
(dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10: $vgpr0 = COPY [[UV]](s32) ; GFX10: $vgpr1 = COPY [[UV1]](s32) @@ -1328,7 +1336,7 @@ ; GFX10: $vgpr3 = COPY [[UV3]](s32) ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f16(i32 15, float %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v } @@ -1336,7 +1344,7 @@ ; GFX9-LABEL: name: sample_d_1d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1368,7 +1376,7 @@ ; GFX10-LABEL: name: sample_d_1d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1405,7 +1413,7 @@ ; GFX9-LABEL: name: sample_d_2d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1439,7 +1447,7 @@ ; GFX10-LABEL: name: sample_d_2d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: 
[[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1478,7 +1486,7 @@ ; GFX9-LABEL: name: sample_d_3d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1519,7 +1527,7 @@ ; GFX10-LABEL: name: sample_d_3d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1566,7 +1574,7 @@ ; GFX9-LABEL: name: sample_c_d_1d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1600,7 +1608,7 @@ ; GFX10-LABEL: name: sample_c_d_1d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1639,7 +1647,7 @@ ; GFX9-LABEL: name: sample_c_d_2d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1675,7 +1683,7 @@ ; GFX10-LABEL: name: sample_c_d_2d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1716,7 +1724,7 @@ ; GFX9-LABEL: name: sample_d_cl_1d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1749,7 +1757,7 @@ ; GFX10-LABEL: name: sample_d_cl_1d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, 
$sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1787,7 +1795,7 @@ ; GFX9-LABEL: name: sample_d_cl_2d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1824,7 +1832,7 @@ ; GFX10-LABEL: name: sample_d_cl_2d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1866,7 +1874,7 @@ ; GFX9-LABEL: name: sample_c_d_cl_1d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1901,7 +1909,7 @@ ; GFX10-LABEL: name: sample_c_d_cl_1d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1941,7 +1949,7 @@ ; GFX9-LABEL: name: sample_c_d_cl_2d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -1980,7 +1988,7 @@ ; GFX10-LABEL: name: sample_c_d_cl_2d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2024,7 +2032,7 @@ ; GFX9-LABEL: name: sample_cd_1d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: 
[[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2056,7 +2064,7 @@ ; GFX10-LABEL: name: sample_cd_1d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2093,7 +2101,7 @@ ; GFX9-LABEL: name: sample_cd_2d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2127,7 +2135,7 @@ ; GFX10-LABEL: name: sample_cd_2d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2166,7 +2174,7 @@ ; GFX9-LABEL: name: sample_c_cd_1d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2200,7 +2208,7 @@ ; GFX10-LABEL: name: sample_c_cd_1d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2239,7 +2247,7 @@ ; GFX9-LABEL: name: sample_c_cd_2d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2275,7 +2283,7 @@ ; GFX10-LABEL: name: sample_c_cd_2d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: 
[[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2316,7 +2324,7 @@ ; GFX9-LABEL: name: sample_cd_cl_1d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2349,7 +2357,7 @@ ; GFX10-LABEL: name: sample_cd_cl_1d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2387,7 +2395,7 @@ ; GFX9-LABEL: name: sample_cd_cl_2d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2424,7 +2432,7 @@ ; GFX10-LABEL: name: sample_cd_cl_2d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2466,7 +2474,7 @@ ; GFX9-LABEL: name: sample_c_cd_cl_1d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2501,7 +2509,7 @@ ; GFX10-LABEL: name: sample_c_cd_cl_1d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2541,7 +2549,7 @@ ; GFX9-LABEL: name: sample_c_cd_cl_2d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2580,7 +2588,7 @@ ; GFX10-LABEL: name: sample_c_cd_cl_2d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, 
$sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2624,7 +2632,7 @@ ; GFX9-LABEL: name: sample_l_1d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2651,7 +2659,7 @@ ; GFX10-LABEL: name: sample_l_1d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2684,7 +2692,7 @@ ; GFX9-LABEL: name: sample_l_2d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2715,7 +2723,7 @@ ; GFX10-LABEL: name: sample_l_2d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2752,7 +2760,7 @@ ; GFX9-LABEL: name: sample_c_l_1d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2782,7 +2790,7 @@ ; GFX10-LABEL: name: sample_c_l_1d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2818,7 +2826,7 @@ ; GFX9-LABEL: name: sample_c_l_2d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; 
GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2851,7 +2859,7 @@ ; GFX10-LABEL: name: sample_c_l_2d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2889,7 +2897,7 @@ ; GFX9-LABEL: name: sample_lz_1d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2915,7 +2923,7 @@ ; GFX10-LABEL: name: sample_lz_1d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2947,7 +2955,7 @@ ; GFX9-LABEL: name: sample_lz_2d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -2974,7 +2982,7 @@ ; GFX10-LABEL: name: sample_lz_2d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -3007,7 +3015,7 @@ ; GFX9-LABEL: name: sample_c_lz_1d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -3037,7 +3045,7 @@ ; GFX10-LABEL: name: sample_c_lz_1d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -3073,7 +3081,7 @@ ; GFX9-LABEL: name: sample_c_lz_2d ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: 
[[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -3103,7 +3111,7 @@ ; GFX10-LABEL: name: sample_c_lz_2d ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -3139,7 +3147,7 @@ ; GFX9-LABEL: name: sample_c_d_o_2darray_V1 ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -3176,7 +3184,7 @@ ; GFX10-LABEL: name: sample_c_d_o_2darray_V1 ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -3219,7 +3227,7 @@ ; GFX9-LABEL: name: sample_c_d_o_2darray_V2 ; GFX9: bb.1.main_body: ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 - ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -3258,7 +3266,7 @@ ; GFX10-LABEL: name: sample_c_d_o_2darray_V2 ; GFX10: bb.1.main_body: ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 - ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 @@ -3314,14 +3322,14 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f16(i32, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f16(i32, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f16(i32, float, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f16(i32, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f16(i32, float, float, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f16(i32, float, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f16(i32, 
float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f16(i32, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f16(i32, float, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f16(i32, float, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f16.f16(i32, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f16.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f16.f16(i32, half, float, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f16.f16(i32, half, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f16.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f16.f16(i32, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f16.f16(i32, half, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f16.f16(i32, half, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll @@ -334,25 +334,27 @@ ret <4 x float> %v } -define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t) { +define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s, half %t) { ; GFX9-LABEL: gather4_b_2d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: s_lshl_b32 s12, s0, 16 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: s_mov_b32 s1, s3 -; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 -; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v0, v0, v3, s12 ; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 @@ -364,47 +366,52 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: 
v_mov_b32_e32 v3, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 -; GFX10NSA-NEXT: s_mov_b32 s3, s5 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 -; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s6, s8 -; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 -; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, s12 +; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v3, v2 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: - %v = call <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f16(i32 1, float %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) + %v = call <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f16.f16(i32 1, half %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } -define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t) { +define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s, half %t) { ; GFX9-LABEL: gather4_c_b_2d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: s_lshl_b32 s12, s0, 16 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: s_mov_b32 s1, s3 -; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 -; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v0, v0, v4, s12 ; GFX9-NEXT: v_and_or_b32 v2, v2, v4, v3 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 @@ -416,29 +423,32 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 -; GFX10NSA-NEXT: s_mov_b32 s3, s5 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 -; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s6, s8 -; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 -; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 +; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v2, 
v3 +; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v4, s12 +; GFX10NSA-NEXT: v_and_or_b32 v2, v2, v4, v3 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: - %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f32.f16(i32 1, float %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) + %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f16.f16(i32 1, half %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } -define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t, half %clamp) { +define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s, half %t, half %clamp) { ; GFX9-LABEL: gather4_b_cl_2d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[14:15], exec @@ -450,14 +460,15 @@ ; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v0, v0, v4, s12 ; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] @@ -484,6 +495,7 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v4, s12 ; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 @@ -491,11 +503,11 @@ ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: - %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f32.f16(i32 1, float %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) + %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f16.f16(i32 1, half %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } -define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t, half %clamp) { +define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s, half %t, half %clamp) { ; GFX9-LABEL: gather4_c_b_cl_2d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[14:15], exec @@ -507,14 +519,15 @@ ; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v0, v0, v5, s12 ; GFX9-NEXT: v_and_or_b32 v2, v2, v5, v3 ; GFX9-NEXT: v_and_or_b32 v3, v4, v5, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] @@ -541,6 +554,7 @@ ; 
GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v5, s12 ; GFX10NSA-NEXT: v_and_or_b32 v2, v2, v5, v3 ; GFX10NSA-NEXT: v_and_or_b32 v3, v4, v5, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 @@ -548,7 +562,7 @@ ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: - %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f16(i32 1, float %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) + %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f16.f16(i32 1, half %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret <4 x float> %v } @@ -750,10 +764,10 @@ declare <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f16(i32 immarg, half, half, half, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0 declare <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f16(i32 immarg, half, half, half, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0 declare <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f16(i32 immarg, half, half, half, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0 -declare <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f16(i32 immarg, float, half, half, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0 -declare <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f32.f16(i32 immarg, float, float, half, half, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0 -declare <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f32.f16(i32 immarg, float, half, half, half, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0 -declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f16(i32 immarg, float, float, half, half, half, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0 +declare <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f16.f16(i32 immarg, half, half, half, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0 +declare <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f16.f16(i32 immarg, half, float, half, half, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0 +declare <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f16.f16(i32 immarg, half, half, half, half, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0 +declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f16.f16(i32 immarg, half, float, half, half, half, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0 declare <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f16(i32 immarg, half, half, half, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0 declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 immarg, half, half, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0 declare <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f16(i32 immarg, float, half, half, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0 Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll @@ -166,7 +166,7 @@ ret <4 x float> %v } -define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t) { +define amdgpu_ps <4 x 
float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s, half %t) { ; GFX9-LABEL: gather4_b_2d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec @@ -189,11 +189,11 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: - %v = call <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f16(i32 1, float %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %v = call <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f16.f16(i32 1, half %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v } -define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t) { +define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s, half %t) { ; GFX9-LABEL: gather4_c_b_2d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec @@ -216,11 +216,11 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: - %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f32.f16(i32 1, float %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f16.f16(i32 1, half %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v } -define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t, half %clamp) { +define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s, half %t, half %clamp) { ; GFX9-LABEL: gather4_b_cl_2d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec @@ -245,11 +245,11 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: - %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f32.f16(i32 1, float %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f16.f16(i32 1, half %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v } -define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t, half %clamp) { +define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s, half %t, half %clamp) { ; GFX9-LABEL: gather4_c_b_cl_2d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec @@ -275,7 +275,7 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: - %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f16(i32 1, float %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f16.f16(i32 1, half %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v } @@ -373,10 +373,10 @@ declare <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f32(i32, float, 
half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f16(i32, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f32.f16(i32, float, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f32.f16(i32, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f16(i32, float, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f16.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f16.f16(i32, half, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f16.f16(i32, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f16.f16(i32, half, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll @@ -320,7 +320,7 @@ ret <4 x float> %v } -define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s) { +define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s) { ; GFX9-LABEL: sample_b_1d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec @@ -339,11 +339,11 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f16(i32 15, float %bias, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %v = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f16.f16(i32 15, half %bias, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v } -define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t) { +define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s, half %t) { ; GFX9-LABEL: sample_b_2d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec @@ -366,11 +366,11 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f16(i32 15, float %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %v = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f16.f16(i32 15, half %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v } -define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s) { +define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s) { ; 
GFX9-LABEL: sample_c_b_1d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec @@ -389,11 +389,11 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f16(i32 15, float %bias, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v } -define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t) { +define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s, half %t) { ; GFX9-LABEL: sample_c_b_2d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec @@ -416,11 +416,11 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f16(i32 15, float %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v } -define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %clamp) { +define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s, half %clamp) { ; GFX9-LABEL: sample_b_cl_1d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec @@ -443,11 +443,11 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f16(i32 15, float %bias, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f16.f16(i32 15, half %bias, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v } -define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t, half %clamp) { +define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s, half %t, half %clamp) { ; GFX9-LABEL: sample_b_cl_2d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec @@ -472,11 +472,11 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f16(i32 15, float %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f16.f16(i32 15, half %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v } -define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %clamp) { +define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s, half %clamp) { ; GFX9-LABEL: sample_c_b_cl_1d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec @@ -499,11 +499,11 @@ ; 
GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f16(i32 15, float %bias, float %zcompare, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v } -define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t, half %clamp) { +define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s, half %t, half %clamp) { ; GFX9-LABEL: sample_c_b_cl_2d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec @@ -529,7 +529,7 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f16(i32 15, float %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v } @@ -1229,14 +1229,14 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f16(i32, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f16(i32, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f16(i32, float, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f16(i32, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f16(i32, float, float, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f16(i32, float, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f16(i32, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f16(i32, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f16(i32, float, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f16(i32, float, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f16.f16(i32, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f16.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f16.f16(i32, half, float, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f16.f16(i32, half, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f16.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f16.f16(i32, half, half, half, half, <8 x i32>, <4 x i32>, 
i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f16.f16(i32, half, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f16.f16(i32, half, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1