Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -23,8 +23,6 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL, SDValue Chain, unsigned Offset, bool Signed) const; - SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op, - SelectionDAG &DAG) const; SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -1366,14 +1366,6 @@ return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, Op->getVTList(), Ops, VT, MMO); } - case AMDGPUIntrinsic::SI_sample: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); - case AMDGPUIntrinsic::SI_sampleb: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG); - case AMDGPUIntrinsic::SI_sampled: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG); - case AMDGPUIntrinsic::SI_samplel: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); case AMDGPUIntrinsic::SI_vs_load_input: return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, Op.getOperand(1), @@ -1555,15 +1547,6 @@ return AMDGPUTargetLowering::LowerLOAD(Op, DAG); } -SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, - const SDValue &Op, - SelectionDAG &DAG) const { - return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3), - Op.getOperand(4)); -} - SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType() != MVT::i64) return SDValue(); Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -2442,38 +2442,6 @@ (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc) >; -multiclass ImageLoadPatterns { - def : ImageLoadPattern ; - def : ImageLoadArrayPattern ; -} - -multiclass ImageLoadMSAAPatterns { - def : ImageLoadMSAAPattern ; - def : ImageLoadArrayMSAAPattern ; -} - -defm : ImageLoadPatterns; -defm : ImageLoadPatterns; - -defm : ImageLoadMSAAPatterns; -defm : ImageLoadMSAAPatterns; - -/* Image resource information */ -def : Pat < - (int_SI_resinfo i32:$mipid, v32i8:$rsrc, imm), - (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) ->; - -def : Pat < - (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY), - (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) ->; - -def : Pat < - (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY_MSAA), - (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) ->; - /********** ============================================ **********/ /********** Extraction, Insertion, Building and Casting **********/ /********** ============================================ **********/ Index: lib/Target/AMDGPU/SIIntrinsics.td =================================================================== --- lib/Target/AMDGPU/SIIntrinsics.td +++ lib/Target/AMDGPU/SIIntrinsics.td @@ -172,16 +172,6 @@ def int_SI_image_load_mip : Image; def int_SI_getresinfo : Image; - // Deprecated image and sample intrinsics. - class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; - - def int_SI_sample : Sample; - def int_SI_sampleb : Sample; - def int_SI_sampled : Sample; - def int_SI_samplel : Sample; - def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_SI_resinfo : Intrinsic <[llvm_v4i32_ty], [llvm_i32_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - /* Interpolation Intrinsics */ def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; Index: test/CodeGen/AMDGPU/commute-shifts.ll =================================================================== --- test/CodeGen/AMDGPU/commute-shifts.ll +++ test/CodeGen/AMDGPU/commute-shifts.ll @@ -4,29 +4,25 @@ ; GCN-LABEL: {{^}}main: ; SI: v_lshl_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} ; VI: v_lshlrev_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 1 - define void @main() #0 { -main_body: - %0 = fptosi float undef to i32 - %1 = call <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32> undef, <32 x i8> undef, i32 2) - %2 = extractelement <4 x i32> %1, i32 0 - %3 = and i32 %0, 7 - %4 = shl i32 1, %3 - %5 = and i32 %2, %4 - %6 = icmp eq i32 %5, 0 - %.10 = select i1 %6, float 0.000000e+00, float undef - %7 = call i32 @llvm.SI.packf16(float undef, float %.10) - %8 = bitcast i32 %7 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %8, float undef, float %8) +bb: + %tmp = fptosi float undef to i32 + %tmp1 = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp2.f = extractelement <4 x float> %tmp1, i32 0 + %tmp2 = bitcast float %tmp2.f to i32 + %tmp3 = and i32 %tmp, 7 + %tmp4 = shl i32 1, %tmp3 + %tmp5 = and i32 %tmp2, %tmp4 + %tmp6 = icmp eq i32 %tmp5, 0 + %tmp7 = select i1 %tmp6, float 0.000000e+00, float undef + %tmp8 = call i32 @llvm.SI.packf16(float undef, float %tmp7) + %tmp9 = bitcast i32 %tmp8 to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %tmp9, float undef, float %tmp9) ret void } -; Function Attrs: nounwind readnone -declare <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32>, <32 x i8>, i32) #1 - -; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 declare i32 @llvm.SI.packf16(float, float) #1 - declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" } Index: test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll @@ -0,0 +1,96 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s + +; CHECK-LABEL: {{^}}v1: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 13 +define void @v1(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %2 = extractelement <4 x float> %1, i32 0 + %3 = extractelement <4 x float> %1, i32 2 + %4 = extractelement <4 x float> %1, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) + ret void +} + +; CHECK-LABEL: {{^}}v2: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 11 +define void @v2(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %2 = extractelement <4 x float> %1, i32 0 + %3 = extractelement <4 x float> %1, i32 1 + %4 = extractelement <4 x float> %1, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) + ret void +} + +; CHECK-LABEL: {{^}}v3: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14 +define void @v3(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %2 = extractelement <4 x float> %1, i32 1 + %3 = extractelement <4 x float> %1, i32 2 + %4 = extractelement <4 x float> %1, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) + ret void +} + +; CHECK-LABEL: {{^}}v4: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 7 +define void @v4(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %2 = extractelement <4 x float> %1, i32 0 + %3 = extractelement <4 x float> %1, i32 1 + %4 = extractelement <4 x float> %1, i32 2 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) + ret void +} + +; CHECK-LABEL: {{^}}v5: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10 +define void @v5(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %2 = extractelement <4 x float> %1, i32 1 + %3 = extractelement <4 x float> %1, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) + ret void +} + +; CHECK-LABEL: {{^}}v6: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 6 +define void @v6(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %2 = extractelement <4 x float> %1, i32 1 + %3 = extractelement <4 x float> %1, i32 2 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) + ret void +} + +; CHECK-LABEL: {{^}}v7: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 9 +define void @v7(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %2 = extractelement <4 x float> %1, i32 0 + %3 = extractelement <4 x float> %1, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) + ret void +} + +declare <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } Index: test/CodeGen/AMDGPU/llvm.SI.imageload.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.imageload.ll +++ /dev/null @@ -1,132 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-DAG: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1 -;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v[0-9]+}}, 2, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v[0-9]+}}, 1, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v[0-9]+}}, 4, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v[0-9]+}}, 8, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1 -;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v[0-9]+}}, 8, 0, 0, -1 - -define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) { - %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0 - %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1 - %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2 - %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3 - %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0 - %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1 - %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1 - %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2 - %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2 - %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3 - %res1 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v1, - <32 x i8> undef, i32 1) - %res2 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v2, - <32 x i8> undef, i32 2) - %res3 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v3, - <32 x i8> undef, i32 3) - %res4 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v4, - <32 x i8> undef, i32 4) - %res5 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v5, - <32 x i8> undef, i32 5) - %res6 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v6, - <32 x i8> undef, i32 6) - %res10 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v10, - <32 x i8> undef, i32 10) - %res11 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v11, - <32 x i8> undef, i32 11) - %res15 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v15, - <32 x i8> undef, i32 15) - %res16 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v16, - <32 x i8> undef, i32 16) - %e1 = extractelement <4 x i32> %res1, i32 0 - %e2 = extractelement <4 x i32> %res2, i32 1 - %e3 = extractelement <4 x i32> %res3, i32 2 - %e4 = extractelement <4 x i32> %res4, i32 3 - %t0 = extractelement <4 x i32> %res5, i32 0 - %t1 = extractelement <4 x i32> %res5, i32 1 - %e5 = add i32 %t0, %t1 - %t2 = extractelement <4 x i32> %res6, i32 0 - %t3 = extractelement <4 x i32> %res6, i32 2 - %e6 = add i32 %t2, %t3 - %t10 = extractelement <4 x i32> %res10, i32 2 - %t11 = extractelement <4 x i32> %res10, i32 3 - %e10 = add i32 %t10, %t11 - %t12 = extractelement <4 x i32> %res11, i32 0 - %t13 = extractelement <4 x i32> %res11, i32 1 - %t14 = extractelement <4 x i32> %res11, i32 2 - %t15 = add i32 %t12, %t13 - %e11 = add i32 %t14, %t15 - %t28 = extractelement <4 x i32> %res15, i32 0 - %t29 = extractelement <4 x i32> %res15, i32 1 - %t30 = extractelement <4 x i32> %res15, i32 2 - %t31 = extractelement <4 x i32> %res15, i32 3 - %t32 = add i32 %t28, %t29 - %t33 = add i32 %t30, %t31 - %e15 = add i32 %t32, %t33 - %e16 = extractelement <4 x i32> %res16, i32 3 - %s1 = add i32 %e1, %e2 - %s2 = add i32 %s1, %e3 - %s3 = add i32 %s2, %e4 - %s4 = add i32 %s3, %e5 - %s5 = add i32 %s4, %e6 - %s9 = add i32 %s5, %e10 - %s10 = add i32 %s9, %e11 - %s14 = add i32 %s10, %e15 - %s15 = add i32 %s14, %e16 - %s16 = bitcast i32 %s15 to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s16, float %s16, float %s16, float %s16) - ret void -} - -; Test that ccordinates are stored in vgprs and not sgprs -; CHECK: vgpr_coords -; CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}} -define void @vgpr_coords(float addrspace(2)* addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr float addrspace(2)*, float addrspace(2)* addrspace(2)* %0, i32 0 - %21 = load float addrspace(2)*, float addrspace(2)* addrspace(2)* %20, !tbaa !2 - %22 = getelementptr float, float addrspace(2)* %21, i32 0 - %23 = load float, float addrspace(2)* %22, !tbaa !2, !invariant.load !1 - %24 = getelementptr float, float addrspace(2)* %21, i32 1 - %25 = load float, float addrspace(2)* %24, !tbaa !2, !invariant.load !1 - %26 = getelementptr float, float addrspace(2)* %21, i32 4 - %27 = load float, float addrspace(2)* %26, !tbaa !2, !invariant.load !1 - %28 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0 - %29 = load <32 x i8>, <32 x i8> addrspace(2)* %28, !tbaa !2 - %30 = bitcast float %27 to i32 - %31 = bitcast float %23 to i32 - %32 = bitcast float %25 to i32 - %33 = insertelement <4 x i32> undef, i32 %31, i32 0 - %34 = insertelement <4 x i32> %33, i32 %32, i32 1 - %35 = insertelement <4 x i32> %34, i32 %30, i32 2 - %36 = insertelement <4 x i32> %35, i32 undef, i32 3 - %37 = call <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32> %36, <32 x i8> %29, i32 2) - %38 = extractelement <4 x i32> %37, i32 0 - %39 = extractelement <4 x i32> %37, i32 1 - %40 = extractelement <4 x i32> %37, i32 2 - %41 = extractelement <4 x i32> %37, i32 3 - %42 = bitcast i32 %38 to float - %43 = bitcast i32 %39 to float - %44 = bitcast i32 %40 to float - %45 = bitcast i32 %41 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %42, float %43, float %44, float %45) - ret void -} - -declare <4 x i32> @llvm.SI.imageload.(<4 x i32>, <32 x i8>, i32) readnone -; Function Attrs: nounwind readnone -declare <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32>, <32 x i8>, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } - -!0 = !{!"const", null} -!1 = !{} -!2 = !{!0, !0, i64 0, i32 1} Index: test/CodeGen/AMDGPU/llvm.SI.resinfo.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.resinfo.ll +++ /dev/null @@ -1,111 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s - -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 2, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 1, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 4, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 8, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 9, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 6, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 10, 0, 0, -1 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 11, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 13, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 14, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 8, 0, 0, -1 - -define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, - i32 %a9, i32 %a10, i32 %a11, i32 %a12, i32 %a13, i32 %a14, i32 %a15, i32 %a16) { - %res1 = call <4 x i32> @llvm.SI.resinfo(i32 %a1, <32 x i8> undef, i32 1) - %res2 = call <4 x i32> @llvm.SI.resinfo(i32 %a2, <32 x i8> undef, i32 2) - %res3 = call <4 x i32> @llvm.SI.resinfo(i32 %a3, <32 x i8> undef, i32 3) - %res4 = call <4 x i32> @llvm.SI.resinfo(i32 %a4, <32 x i8> undef, i32 4) - %res5 = call <4 x i32> @llvm.SI.resinfo(i32 %a5, <32 x i8> undef, i32 5) - %res6 = call <4 x i32> @llvm.SI.resinfo(i32 %a6, <32 x i8> undef, i32 6) - %res7 = call <4 x i32> @llvm.SI.resinfo(i32 %a7, <32 x i8> undef, i32 7) - %res8 = call <4 x i32> @llvm.SI.resinfo(i32 %a8, <32 x i8> undef, i32 8) - %res9 = call <4 x i32> @llvm.SI.resinfo(i32 %a9, <32 x i8> undef, i32 9) - %res10 = call <4 x i32> @llvm.SI.resinfo(i32 %a10, <32 x i8> undef, i32 10) - %res11 = call <4 x i32> @llvm.SI.resinfo(i32 %a11, <32 x i8> undef, i32 11) - %res12 = call <4 x i32> @llvm.SI.resinfo(i32 %a12, <32 x i8> undef, i32 12) - %res13 = call <4 x i32> @llvm.SI.resinfo(i32 %a13, <32 x i8> undef, i32 13) - %res14 = call <4 x i32> @llvm.SI.resinfo(i32 %a14, <32 x i8> undef, i32 14) - %res15 = call <4 x i32> @llvm.SI.resinfo(i32 %a15, <32 x i8> undef, i32 15) - %res16 = call <4 x i32> @llvm.SI.resinfo(i32 %a16, <32 x i8> undef, i32 16) - %e1 = extractelement <4 x i32> %res1, i32 0 - %e2 = extractelement <4 x i32> %res2, i32 1 - %e3 = extractelement <4 x i32> %res3, i32 2 - %e4 = extractelement <4 x i32> %res4, i32 3 - %t0 = extractelement <4 x i32> %res5, i32 0 - %t1 = extractelement <4 x i32> %res5, i32 1 - %e5 = add i32 %t0, %t1 - %t2 = extractelement <4 x i32> %res6, i32 0 - %t3 = extractelement <4 x i32> %res6, i32 2 - %e6 = add i32 %t2, %t3 - %t4 = extractelement <4 x i32> %res7, i32 0 - %t5 = extractelement <4 x i32> %res7, i32 3 - %e7 = add i32 %t4, %t5 - %t6 = extractelement <4 x i32> %res8, i32 1 - %t7 = extractelement <4 x i32> %res8, i32 2 - %e8 = add i32 %t6, %t7 - %t8 = extractelement <4 x i32> %res9, i32 1 - %t9 = extractelement <4 x i32> %res9, i32 3 - %e9 = add i32 %t8, %t9 - %t10 = extractelement <4 x i32> %res10, i32 2 - %t11 = extractelement <4 x i32> %res10, i32 3 - %e10 = add i32 %t10, %t11 - %t12 = extractelement <4 x i32> %res11, i32 0 - %t13 = extractelement <4 x i32> %res11, i32 1 - %t14 = extractelement <4 x i32> %res11, i32 2 - %t15 = add i32 %t12, %t13 - %e11 = add i32 %t14, %t15 - %t16 = extractelement <4 x i32> %res12, i32 0 - %t17 = extractelement <4 x i32> %res12, i32 1 - %t18 = extractelement <4 x i32> %res12, i32 3 - %t19 = add i32 %t16, %t17 - %e12 = add i32 %t18, %t19 - %t20 = extractelement <4 x i32> %res13, i32 0 - %t21 = extractelement <4 x i32> %res13, i32 2 - %t22 = extractelement <4 x i32> %res13, i32 3 - %t23 = add i32 %t20, %t21 - %e13 = add i32 %t22, %t23 - %t24 = extractelement <4 x i32> %res14, i32 1 - %t25 = extractelement <4 x i32> %res14, i32 2 - %t26 = extractelement <4 x i32> %res14, i32 3 - %t27 = add i32 %t24, %t25 - %e14 = add i32 %t26, %t27 - %t28 = extractelement <4 x i32> %res15, i32 0 - %t29 = extractelement <4 x i32> %res15, i32 1 - %t30 = extractelement <4 x i32> %res15, i32 2 - %t31 = extractelement <4 x i32> %res15, i32 3 - %t32 = add i32 %t28, %t29 - %t33 = add i32 %t30, %t31 - %e15 = add i32 %t32, %t33 - %e16 = extractelement <4 x i32> %res16, i32 3 - %s1 = add i32 %e1, %e2 - %s2 = add i32 %s1, %e3 - %s3 = add i32 %s2, %e4 - %s4 = add i32 %s3, %e5 - %s5 = add i32 %s4, %e6 - %s6 = add i32 %s5, %e7 - %s7 = add i32 %s6, %e8 - %s8 = add i32 %s7, %e9 - %s9 = add i32 %s8, %e10 - %s10 = add i32 %s9, %e11 - %s11 = add i32 %s10, %e12 - %s12 = add i32 %s11, %e13 - %s13 = add i32 %s12, %e14 - %s14 = add i32 %s13, %e15 - %s15 = add i32 %s14, %e16 - %s16 = bitcast i32 %s15 to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s16, float %s16, float %s16, float %s16) - ret void -} - -declare <4 x i32> @llvm.SI.resinfo(i32, <32 x i8>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) Index: test/CodeGen/AMDGPU/llvm.SI.sample-masked.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.sample-masked.ll +++ /dev/null @@ -1,96 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s - -; CHECK-LABEL: {{^}}v1: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 13 -define void @v1(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 0 - %3 = extractelement <4 x float> %1, i32 2 - %4 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) - ret void -} - -; CHECK-LABEL: {{^}}v2: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 11 -define void @v2(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 0 - %3 = extractelement <4 x float> %1, i32 1 - %4 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) - ret void -} - -; CHECK-LABEL: {{^}}v3: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14 -define void @v3(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 1 - %3 = extractelement <4 x float> %1, i32 2 - %4 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) - ret void -} - -; CHECK-LABEL: {{^}}v4: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 7 -define void @v4(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 0 - %3 = extractelement <4 x float> %1, i32 1 - %4 = extractelement <4 x float> %1, i32 2 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) - ret void -} - -; CHECK-LABEL: {{^}}v5: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10 -define void @v5(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 1 - %3 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) - ret void -} - -; CHECK-LABEL: {{^}}v6: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 6 -define void @v6(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 1 - %3 = extractelement <4 x float> %1, i32 2 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) - ret void -} - -; CHECK-LABEL: {{^}}v7: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 9 -define void @v7(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 0 - %3 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) - ret void -} - -declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } Index: test/CodeGen/AMDGPU/llvm.SI.sample.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.sample.ll +++ /dev/null @@ -1,160 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15 -;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 3 -;CHECK-DAG: image_sample {{v[0-9]+}}, 2 -;CHECK-DAG: image_sample {{v[0-9]+}}, 1 -;CHECK-DAG: image_sample {{v[0-9]+}}, 4 -;CHECK-DAG: image_sample {{v[0-9]+}}, 8 -;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 5 -;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 9 -;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 6 -;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10 -;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 12 -;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 7 -;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 11 -;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 13 -;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14 -;CHECK-DAG: image_sample {{v[0-9]+}}, 8 - -define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 { - %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0 - %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1 - %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2 - %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3 - %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0 - %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1 - %v7 = insertelement <4 x i32> undef, i32 %a2, i32 2 - %v8 = insertelement <4 x i32> undef, i32 %a2, i32 3 - %v9 = insertelement <4 x i32> undef, i32 %a3, i32 0 - %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1 - %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2 - %v12 = insertelement <4 x i32> undef, i32 %a3, i32 3 - %v13 = insertelement <4 x i32> undef, i32 %a4, i32 0 - %v14 = insertelement <4 x i32> undef, i32 %a4, i32 1 - %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2 - %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3 - %res1 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v1, - <32 x i8> undef, <16 x i8> undef, i32 1) - %res2 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v2, - <32 x i8> undef, <16 x i8> undef, i32 2) - %res3 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v3, - <32 x i8> undef, <16 x i8> undef, i32 3) - %res4 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v4, - <32 x i8> undef, <16 x i8> undef, i32 4) - %res5 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v5, - <32 x i8> undef, <16 x i8> undef, i32 5) - %res6 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v6, - <32 x i8> undef, <16 x i8> undef, i32 6) - %res7 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v7, - <32 x i8> undef, <16 x i8> undef, i32 7) - %res8 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v8, - <32 x i8> undef, <16 x i8> undef, i32 8) - %res9 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v9, - <32 x i8> undef, <16 x i8> undef, i32 9) - %res10 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v10, - <32 x i8> undef, <16 x i8> undef, i32 10) - %res11 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v11, - <32 x i8> undef, <16 x i8> undef, i32 11) - %res12 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v12, - <32 x i8> undef, <16 x i8> undef, i32 12) - %res13 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v13, - <32 x i8> undef, <16 x i8> undef, i32 13) - %res14 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v14, - <32 x i8> undef, <16 x i8> undef, i32 14) - %res15 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v15, - <32 x i8> undef, <16 x i8> undef, i32 15) - %res16 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v16, - <32 x i8> undef, <16 x i8> undef, i32 16) - %e1 = extractelement <4 x float> %res1, i32 0 - %e2 = extractelement <4 x float> %res2, i32 1 - %e3 = extractelement <4 x float> %res3, i32 2 - %e4 = extractelement <4 x float> %res4, i32 3 - %t0 = extractelement <4 x float> %res5, i32 0 - %t1 = extractelement <4 x float> %res5, i32 1 - %e5 = fadd float %t0, %t1 - %t2 = extractelement <4 x float> %res6, i32 0 - %t3 = extractelement <4 x float> %res6, i32 2 - %e6 = fadd float %t2, %t3 - %t4 = extractelement <4 x float> %res7, i32 0 - %t5 = extractelement <4 x float> %res7, i32 3 - %e7 = fadd float %t4, %t5 - %t6 = extractelement <4 x float> %res8, i32 1 - %t7 = extractelement <4 x float> %res8, i32 2 - %e8 = fadd float %t6, %t7 - %t8 = extractelement <4 x float> %res9, i32 1 - %t9 = extractelement <4 x float> %res9, i32 3 - %e9 = fadd float %t8, %t9 - %t10 = extractelement <4 x float> %res10, i32 2 - %t11 = extractelement <4 x float> %res10, i32 3 - %e10 = fadd float %t10, %t11 - %t12 = extractelement <4 x float> %res11, i32 0 - %t13 = extractelement <4 x float> %res11, i32 1 - %t14 = extractelement <4 x float> %res11, i32 2 - %t15 = fadd float %t12, %t13 - %e11 = fadd float %t14, %t15 - %t16 = extractelement <4 x float> %res12, i32 0 - %t17 = extractelement <4 x float> %res12, i32 1 - %t18 = extractelement <4 x float> %res12, i32 3 - %t19 = fadd float %t16, %t17 - %e12 = fadd float %t18, %t19 - %t20 = extractelement <4 x float> %res13, i32 0 - %t21 = extractelement <4 x float> %res13, i32 2 - %t22 = extractelement <4 x float> %res13, i32 3 - %t23 = fadd float %t20, %t21 - %e13 = fadd float %t22, %t23 - %t24 = extractelement <4 x float> %res14, i32 1 - %t25 = extractelement <4 x float> %res14, i32 2 - %t26 = extractelement <4 x float> %res14, i32 3 - %t27 = fadd float %t24, %t25 - %e14 = fadd float %t26, %t27 - %t28 = extractelement <4 x float> %res15, i32 0 - %t29 = extractelement <4 x float> %res15, i32 1 - %t30 = extractelement <4 x float> %res15, i32 2 - %t31 = extractelement <4 x float> %res15, i32 3 - %t32 = fadd float %t28, %t29 - %t33 = fadd float %t30, %t31 - %e15 = fadd float %t32, %t33 - %e16 = extractelement <4 x float> %res16, i32 3 - %s1 = fadd float %e1, %e2 - %s2 = fadd float %s1, %e3 - %s3 = fadd float %s2, %e4 - %s4 = fadd float %s3, %e5 - %s5 = fadd float %s4, %e6 - %s6 = fadd float %s5, %e7 - %s7 = fadd float %s6, %e8 - %s8 = fadd float %s7, %e9 - %s9 = fadd float %s8, %e10 - %s10 = fadd float %s9, %e11 - %s11 = fadd float %s10, %e12 - %s12 = fadd float %s11, %e13 - %s13 = fadd float %s12, %e14 - %s14 = fadd float %s13, %e15 - %s15 = fadd float %s14, %e16 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s15, float %s15, float %s15, float %s15) - ret void -} - -; CHECK: {{^}}v1: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15 -define void @v1(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 0 - %3 = extractelement <4 x float> %1, i32 1 - %4 = extractelement <4 x float> %1, i32 2 - %5 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %5) - ret void -} - - -declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone - -declare <4 x float> @llvm.SI.sample.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } Index: test/CodeGen/AMDGPU/llvm.SI.sampled.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.sampled.ll +++ /dev/null @@ -1,143 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15 -;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 3 -;CHECK-DAG: image_sample_d {{v[0-9]+}}, 2 -;CHECK-DAG: image_sample_d {{v[0-9]+}}, 1 -;CHECK-DAG: image_sample_d {{v[0-9]+}}, 4 -;CHECK-DAG: image_sample_d {{v[0-9]+}}, 8 -;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 5 -;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 9 -;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 6 -;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 10 -;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 12 -;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 7 -;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 11 -;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 13 -;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 14 -;CHECK-DAG: image_sample_d {{v[0-9]+}}, 8 - -define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 { - %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0 - %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1 - %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2 - %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3 - %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0 - %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1 - %v7 = insertelement <4 x i32> undef, i32 %a2, i32 2 - %v8 = insertelement <4 x i32> undef, i32 %a2, i32 3 - %v9 = insertelement <4 x i32> undef, i32 %a3, i32 0 - %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1 - %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2 - %v12 = insertelement <4 x i32> undef, i32 %a3, i32 3 - %v13 = insertelement <4 x i32> undef, i32 %a4, i32 0 - %v14 = insertelement <4 x i32> undef, i32 %a4, i32 1 - %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2 - %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3 - %res1 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v1, - <32 x i8> undef, <16 x i8> undef, i32 1) - %res2 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v2, - <32 x i8> undef, <16 x i8> undef, i32 2) - %res3 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v3, - <32 x i8> undef, <16 x i8> undef, i32 3) - %res4 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v4, - <32 x i8> undef, <16 x i8> undef, i32 4) - %res5 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v5, - <32 x i8> undef, <16 x i8> undef, i32 5) - %res6 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v6, - <32 x i8> undef, <16 x i8> undef, i32 6) - %res7 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v7, - <32 x i8> undef, <16 x i8> undef, i32 7) - %res8 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v8, - <32 x i8> undef, <16 x i8> undef, i32 8) - %res9 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v9, - <32 x i8> undef, <16 x i8> undef, i32 9) - %res10 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v10, - <32 x i8> undef, <16 x i8> undef, i32 10) - %res11 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v11, - <32 x i8> undef, <16 x i8> undef, i32 11) - %res12 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v12, - <32 x i8> undef, <16 x i8> undef, i32 12) - %res13 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v13, - <32 x i8> undef, <16 x i8> undef, i32 13) - %res14 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v14, - <32 x i8> undef, <16 x i8> undef, i32 14) - %res15 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v15, - <32 x i8> undef, <16 x i8> undef, i32 15) - %res16 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v16, - <32 x i8> undef, <16 x i8> undef, i32 16) - %e1 = extractelement <4 x float> %res1, i32 0 - %e2 = extractelement <4 x float> %res2, i32 1 - %e3 = extractelement <4 x float> %res3, i32 2 - %e4 = extractelement <4 x float> %res4, i32 3 - %t0 = extractelement <4 x float> %res5, i32 0 - %t1 = extractelement <4 x float> %res5, i32 1 - %e5 = fadd float %t0, %t1 - %t2 = extractelement <4 x float> %res6, i32 0 - %t3 = extractelement <4 x float> %res6, i32 2 - %e6 = fadd float %t2, %t3 - %t4 = extractelement <4 x float> %res7, i32 0 - %t5 = extractelement <4 x float> %res7, i32 3 - %e7 = fadd float %t4, %t5 - %t6 = extractelement <4 x float> %res8, i32 1 - %t7 = extractelement <4 x float> %res8, i32 2 - %e8 = fadd float %t6, %t7 - %t8 = extractelement <4 x float> %res9, i32 1 - %t9 = extractelement <4 x float> %res9, i32 3 - %e9 = fadd float %t8, %t9 - %t10 = extractelement <4 x float> %res10, i32 2 - %t11 = extractelement <4 x float> %res10, i32 3 - %e10 = fadd float %t10, %t11 - %t12 = extractelement <4 x float> %res11, i32 0 - %t13 = extractelement <4 x float> %res11, i32 1 - %t14 = extractelement <4 x float> %res11, i32 2 - %t15 = fadd float %t12, %t13 - %e11 = fadd float %t14, %t15 - %t16 = extractelement <4 x float> %res12, i32 0 - %t17 = extractelement <4 x float> %res12, i32 1 - %t18 = extractelement <4 x float> %res12, i32 3 - %t19 = fadd float %t16, %t17 - %e12 = fadd float %t18, %t19 - %t20 = extractelement <4 x float> %res13, i32 0 - %t21 = extractelement <4 x float> %res13, i32 2 - %t22 = extractelement <4 x float> %res13, i32 3 - %t23 = fadd float %t20, %t21 - %e13 = fadd float %t22, %t23 - %t24 = extractelement <4 x float> %res14, i32 1 - %t25 = extractelement <4 x float> %res14, i32 2 - %t26 = extractelement <4 x float> %res14, i32 3 - %t27 = fadd float %t24, %t25 - %e14 = fadd float %t26, %t27 - %t28 = extractelement <4 x float> %res15, i32 0 - %t29 = extractelement <4 x float> %res15, i32 1 - %t30 = extractelement <4 x float> %res15, i32 2 - %t31 = extractelement <4 x float> %res15, i32 3 - %t32 = fadd float %t28, %t29 - %t33 = fadd float %t30, %t31 - %e15 = fadd float %t32, %t33 - %e16 = extractelement <4 x float> %res16, i32 3 - %s1 = fadd float %e1, %e2 - %s2 = fadd float %s1, %e3 - %s3 = fadd float %s2, %e4 - %s4 = fadd float %s3, %e5 - %s5 = fadd float %s4, %e6 - %s6 = fadd float %s5, %e7 - %s7 = fadd float %s6, %e8 - %s8 = fadd float %s7, %e9 - %s9 = fadd float %s8, %e10 - %s10 = fadd float %s9, %e11 - %s11 = fadd float %s10, %e12 - %s12 = fadd float %s11, %e13 - %s13 = fadd float %s12, %e14 - %s14 = fadd float %s13, %e15 - %s15 = fadd float %s14, %e16 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s15, float %s15, float %s15, float %s15) - ret void -} - -declare <4 x float> @llvm.SI.sampled.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } Index: test/CodeGen/AMDGPU/sgpr-copy.ll =================================================================== --- test/CodeGen/AMDGPU/sgpr-copy.ll +++ test/CodeGen/AMDGPU/sgpr-copy.ll @@ -4,10 +4,14 @@ ; This test checks that no VGPR to SGPR copies are created by the register ; allocator. + +declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + + ; CHECK-LABEL: {{^}}phi1: ; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0 ; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]] -define void @phi1(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define void @phi1(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 @@ -31,7 +35,7 @@ ; Make sure this program doesn't crash ; CHECK-LABEL: {{^}}phi2: -define void @phi2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define void @phi2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 @@ -50,8 +54,8 @@ %tmp33 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 84) %tmp34 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 88) %tmp35 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 92) - %tmp36 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %arg2, i32 0 - %tmp37 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp36, !tbaa !0 + %tmp36 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %arg2, i32 0 + %tmp37 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp36, !tbaa !0 %tmp38 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg1, i32 0 %tmp39 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp38, !tbaa !0 %tmp40 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg3, <2 x i32> %arg5) @@ -63,7 +67,8 @@ %tmp46 = bitcast float %tmp41 to i32 %tmp47 = insertelement <2 x i32> undef, i32 %tmp45, i32 0 %tmp48 = insertelement <2 x i32> %tmp47, i32 %tmp46, i32 1 - %tmp49 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp48, <32 x i8> %tmp37, <16 x i8> %tmp39, i32 2) + %tmp39.bc = bitcast <16 x i8> %tmp39 to <4 x i32> + %tmp49 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp48, <8 x i32> %tmp37, <4 x i32> %tmp39.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp50 = extractelement <4 x float> %tmp49, i32 2 %tmp51 = call float @fabs(float %tmp50) %tmp52 = fmul float %tmp42, %tmp42 @@ -151,7 +156,7 @@ ; We just want ot make sure the program doesn't crash ; CHECK-LABEL: {{^}}loop: -define void @loop(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define void @loop(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 @@ -200,7 +205,7 @@ declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 ; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1 +declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <8 x i32>, <16 x i8>, i32) #1 ; Function Attrs: readnone declare float @llvm.amdgcn.rsq.f32(float) #3 @@ -223,27 +228,28 @@ ; CHECK: image_sample ; CHECK: exp ; CHECK: s_endpgm -define void @sample_v3([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { +define void @sample_v3([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { entry: %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0 %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 16) - %tmp23 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %arg2, i64 0, i32 0 - %tmp24 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp23, !tbaa !0 + %tmp23 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 0 + %tmp24 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp23, !tbaa !0 %tmp25 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 0 %tmp26 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp25, !tbaa !0 %tmp27 = fcmp oeq float %tmp22, 0.000000e+00 + %tmp26.bc = bitcast <16 x i8> %tmp26 to <4 x i32> br i1 %tmp27, label %if, label %else if: ; preds = %entry - %val.if = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> zeroinitializer, <32 x i8> %tmp24, <16 x i8> %tmp26, i32 2) + %val.if = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> zeroinitializer, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %val.if.0 = extractelement <4 x float> %val.if, i32 0 %val.if.1 = extractelement <4 x float> %val.if, i32 1 %val.if.2 = extractelement <4 x float> %val.if, i32 2 br label %endif else: ; preds = %entry - %val.else = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> , <32 x i8> %tmp24, <16 x i8> %tmp26, i32 2) + %val.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> , <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %val.else.0 = extractelement <4 x float> %val.else, i32 0 %val.else.1 = extractelement <4 x float> %val.else, i32 1 %val.else.2 = extractelement <4 x float> %val.else, i32 2 @@ -286,7 +292,7 @@ ; This test is just checking that we don't crash / assertion fail. ; CHECK-LABEL: {{^}}copy2: ; CHECK: s_endpgm -define void @copy2([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { +define void @copy2([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { entry: br label %LOOP68 @@ -336,9 +342,8 @@ %tmp53 = bitcast float %tmp30 to i32 %tmp54 = insertelement <2 x i32> undef, i32 %tmp52, i32 0 %tmp55 = insertelement <2 x i32> %tmp54, i32 %tmp53, i32 1 - %tmp56 = bitcast <8 x i32> %tmp26 to <32 x i8> - %tmp57 = bitcast <4 x i32> %tmp28 to <16 x i8> - %tmp58 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp55, <32 x i8> %tmp56, <16 x i8> %tmp57, i32 2) + %tmp56 = bitcast <8 x i32> %tmp26 to <8 x i32> + %tmp58 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp55, <8 x i32> %tmp56, <4 x i32> %tmp28, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) br label %bb71 bb80: ; preds = %bb @@ -347,9 +352,8 @@ %tmp82.2 = add i32 %tmp82, 1 %tmp83 = insertelement <2 x i32> undef, i32 %tmp81, i32 0 %tmp84 = insertelement <2 x i32> %tmp83, i32 %tmp82.2, i32 1 - %tmp85 = bitcast <8 x i32> %tmp26 to <32 x i8> - %tmp86 = bitcast <4 x i32> %tmp28 to <16 x i8> - %tmp87 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp84, <32 x i8> %tmp85, <16 x i8> %tmp86, i32 2) + %tmp85 = bitcast <8 x i32> %tmp26 to <8 x i32> + %tmp87 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp84, <8 x i32> %tmp85, <4 x i32> %tmp28, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) br label %bb71 bb71: ; preds = %bb80, %bb38 Index: test/CodeGen/AMDGPU/si-lod-bias.ll =================================================================== --- test/CodeGen/AMDGPU/si-lod-bias.ll +++ test/CodeGen/AMDGPU/si-lod-bias.ll @@ -6,13 +6,13 @@ ; CHECK: {{^}}main: ; CHECK: image_sample_b v{{\[[0-9]:[0-9]\]}}, 15, 0, 0, 0, 0, 0, 0, 0, v{{\[[0-9]:[0-9]\]}} -define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 16) - %tmp22 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %arg2, i32 0 - %tmp23 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp22, !tbaa !0 + %tmp22 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %arg2, i32 0 + %tmp23 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp22, !tbaa !0 %tmp24 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg1, i32 0 %tmp25 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp24, !tbaa !0 %tmp26 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg3, <2 x i32> %arg5) @@ -24,7 +24,8 @@ %tmp32 = insertelement <4 x i32> %tmp31, i32 %tmp29, i32 1 %tmp33 = insertelement <4 x i32> %tmp32, i32 %tmp30, i32 2 %tmp34 = insertelement <4 x i32> %tmp33, i32 undef, i32 3 - %tmp35 = call <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32> %tmp34, <32 x i8> %tmp23, <16 x i8> %tmp25, i32 2) + %tmp25.bc = bitcast <16 x i8> %tmp25 to <4 x i32> + %tmp35 = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> %tmp34, <8 x i32> %tmp23, <4 x i32> %tmp25.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp36 = extractelement <4 x float> %tmp35, i32 0 %tmp37 = extractelement <4 x float> %tmp35, i32 1 %tmp38 = extractelement <4 x float> %tmp35, i32 2 @@ -39,8 +40,8 @@ ; Function Attrs: nounwind readnone declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1 +declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) Index: test/CodeGen/AMDGPU/si-scheduler.ll =================================================================== --- test/CodeGen/AMDGPU/si-scheduler.ll +++ test/CodeGen/AMDGPU/si-scheduler.ll @@ -23,7 +23,9 @@ %tmp28 = bitcast float %tmp26 to i32 %tmp29 = insertelement <2 x i32> undef, i32 %tmp27, i32 0 %tmp30 = insertelement <2 x i32> %tmp29, i32 %tmp28, i32 1 - %tmp31 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp30, <32 x i8> %tmp22, <16 x i8> %tmp24, i32 2) + %tmp22.bc = bitcast <32 x i8> %tmp22 to <8 x i32> + %tmp24.bc = bitcast <16 x i8> %tmp24 to <4 x i32> + %tmp31 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp30, <8 x i32> %tmp22.bc, <4 x i32> %tmp24.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp32 = extractelement <4 x float> %tmp31, i32 0 %tmp33 = extractelement <4 x float> %tmp31, i32 1 %tmp34 = extractelement <4 x float> %tmp31, i32 2 @@ -39,8 +41,8 @@ ; Function Attrs: nounwind readnone declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1 +declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + ; Function Attrs: nounwind readnone declare i32 @llvm.SI.packf16(float, float) #1 Index: test/CodeGen/AMDGPU/si-sgpr-spill.ll =================================================================== --- test/CodeGen/AMDGPU/si-sgpr-spill.ll +++ test/CodeGen/AMDGPU/si-sgpr-spill.ll @@ -22,7 +22,7 @@ ; Writing to M0 from an SMRD instruction will hang the GPU. ; CHECK-NOT: s_buffer_load_dword m0 ; CHECK: s_endpgm -define void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { +define void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { main_body: %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0 %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 @@ -64,36 +64,37 @@ %tmp57 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 372) %tmp58 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 376) %tmp59 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 384) - %tmp60 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %arg2, i64 0, i32 0 - %tmp61 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp60, !tbaa !0 + %tmp60 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 0 + %tmp61 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp60, !tbaa !0 %tmp62 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 0 %tmp63 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp62, !tbaa !0 - %tmp64 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %arg2, i64 0, i32 1 - %tmp65 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp64, !tbaa !0 + %tmp63.bc = bitcast <16 x i8> %tmp63 to <4 x i32> + %tmp64 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 1 + %tmp65 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp64, !tbaa !0 %tmp66 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 1 %tmp67 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp66, !tbaa !0 - %tmp68 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %arg2, i64 0, i32 2 - %tmp69 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp68, !tbaa !0 + %tmp68 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 2 + %tmp69 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp68, !tbaa !0 %tmp70 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 2 %tmp71 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp70, !tbaa !0 - %tmp72 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %arg2, i64 0, i32 3 - %tmp73 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp72, !tbaa !0 + %tmp72 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 3 + %tmp73 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp72, !tbaa !0 %tmp74 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 3 %tmp75 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp74, !tbaa !0 - %tmp76 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %arg2, i64 0, i32 4 - %tmp77 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp76, !tbaa !0 + %tmp76 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 4 + %tmp77 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp76, !tbaa !0 %tmp78 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 4 %tmp79 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp78, !tbaa !0 - %tmp80 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %arg2, i64 0, i32 5 - %tmp81 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp80, !tbaa !0 + %tmp80 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 5 + %tmp81 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp80, !tbaa !0 %tmp82 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 5 %tmp83 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp82, !tbaa !0 - %tmp84 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %arg2, i64 0, i32 6 - %tmp85 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp84, !tbaa !0 + %tmp84 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 6 + %tmp85 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp84, !tbaa !0 %tmp86 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 6 %tmp87 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp86, !tbaa !0 - %tmp88 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %arg2, i64 0, i32 7 - %tmp89 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp88, !tbaa !0 + %tmp88 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 7 + %tmp89 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp88, !tbaa !0 %tmp90 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 7 %tmp91 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp90, !tbaa !0 %tmp92 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg4, <2 x i32> %arg6) @@ -272,7 +273,7 @@ %tmp240 = insertelement <8 x i32> %tmp239, i32 %tmp238, i32 5 %tmp241 = insertelement <8 x i32> %tmp240, i32 undef, i32 6 %tmp242 = insertelement <8 x i32> %tmp241, i32 undef, i32 7 - %tmp243 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %tmp242, <32 x i8> %tmp61, <16 x i8> %tmp63, i32 2) + %tmp243 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp242, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp244 = extractelement <4 x float> %tmp243, i32 3 %tmp245 = fcmp oge float %temp30.0, %tmp244 %tmp246 = sext i1 %tmp245 to i32 @@ -317,7 +318,8 @@ %tmp274 = insertelement <8 x i32> %tmp273, i32 %tmp268, i32 5 %tmp275 = insertelement <8 x i32> %tmp274, i32 undef, i32 6 %tmp276 = insertelement <8 x i32> %tmp275, i32 undef, i32 7 - %tmp277 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %tmp276, <32 x i8> %tmp65, <16 x i8> %tmp67, i32 2) + %tmp67.bc = bitcast <16 x i8> %tmp67 to <4 x i32> + %tmp277 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp276, <8 x i32> %tmp65, <4 x i32> %tmp67.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp278 = extractelement <4 x float> %tmp277, i32 0 %tmp279 = extractelement <4 x float> %tmp277, i32 1 %tmp280 = extractelement <4 x float> %tmp277, i32 2 @@ -337,7 +339,8 @@ %tmp294 = insertelement <8 x i32> %tmp293, i32 %tmp288, i32 5 %tmp295 = insertelement <8 x i32> %tmp294, i32 undef, i32 6 %tmp296 = insertelement <8 x i32> %tmp295, i32 undef, i32 7 - %tmp297 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %tmp296, <32 x i8> %tmp81, <16 x i8> %tmp83, i32 2) + %tmp83.bc = bitcast <16 x i8> %tmp83 to <4 x i32> + %tmp297 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp296, <8 x i32> %tmp81, <4 x i32> %tmp83.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp298 = extractelement <4 x float> %tmp297, i32 0 %tmp299 = extractelement <4 x float> %tmp297, i32 1 %tmp300 = extractelement <4 x float> %tmp297, i32 2 @@ -355,7 +358,8 @@ %tmp312 = insertelement <8 x i32> %tmp311, i32 %tmp306, i32 5 %tmp313 = insertelement <8 x i32> %tmp312, i32 undef, i32 6 %tmp314 = insertelement <8 x i32> %tmp313, i32 undef, i32 7 - %tmp315 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %tmp314, <32 x i8> %tmp77, <16 x i8> %tmp79, i32 2) + %tmp79.bc = bitcast <16 x i8> %tmp79 to <4 x i32> + %tmp315 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp314, <8 x i32> %tmp77, <4 x i32> %tmp79.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp316 = extractelement <4 x float> %tmp315, i32 0 %tmp317 = extractelement <4 x float> %tmp315, i32 1 %tmp318 = extractelement <4 x float> %tmp315, i32 2 @@ -385,7 +389,7 @@ %tmp342 = insertelement <8 x i32> %tmp341, i32 %tmp336, i32 5 %tmp343 = insertelement <8 x i32> %tmp342, i32 undef, i32 6 %tmp344 = insertelement <8 x i32> %tmp343, i32 undef, i32 7 - %tmp345 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %tmp344, <32 x i8> %tmp61, <16 x i8> %tmp63, i32 2) + %tmp345 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp344, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp346 = extractelement <4 x float> %tmp345, i32 0 %tmp347 = extractelement <4 x float> %tmp345, i32 1 %tmp348 = extractelement <4 x float> %tmp345, i32 2 @@ -415,7 +419,8 @@ %tmp372 = insertelement <8 x i32> %tmp371, i32 %tmp366, i32 5 %tmp373 = insertelement <8 x i32> %tmp372, i32 undef, i32 6 %tmp374 = insertelement <8 x i32> %tmp373, i32 undef, i32 7 - %tmp375 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %tmp374, <32 x i8> %tmp69, <16 x i8> %tmp71, i32 2) + %tmp71.bc = bitcast <16 x i8> %tmp71 to <4 x i32> + %tmp375 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp374, <8 x i32> %tmp69, <4 x i32> %tmp71.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp376 = extractelement <4 x float> %tmp375, i32 0 %tmp377 = extractelement <4 x float> %tmp375, i32 1 %tmp378 = extractelement <4 x float> %tmp375, i32 2 @@ -469,7 +474,8 @@ %tmp426 = insertelement <8 x i32> %tmp425, i32 %tmp420, i32 5 %tmp427 = insertelement <8 x i32> %tmp426, i32 undef, i32 6 %tmp428 = insertelement <8 x i32> %tmp427, i32 undef, i32 7 - %tmp429 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %tmp428, <32 x i8> %tmp85, <16 x i8> %tmp87, i32 2) + %tmp87.bc = bitcast <16 x i8> %tmp87 to <4 x i32> + %tmp429 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp428, <8 x i32> %tmp85, <4 x i32> %tmp87.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp430 = extractelement <4 x float> %tmp429, i32 0 %tmp431 = extractelement <4 x float> %tmp429, i32 1 %tmp432 = extractelement <4 x float> %tmp429, i32 2 @@ -510,7 +516,8 @@ %tmp467 = insertelement <4 x i32> %tmp466, i32 %tmp464, i32 1 %tmp468 = insertelement <4 x i32> %tmp467, i32 %tmp465, i32 2 %tmp469 = insertelement <4 x i32> %tmp468, i32 undef, i32 3 - %tmp470 = call <4 x float> @llvm.SI.sample.v4i32(<4 x i32> %tmp469, <32 x i8> %tmp89, <16 x i8> %tmp91, i32 4) + %tmp91.bc = bitcast <16 x i8> %tmp91 to <4 x i32> + %tmp470 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tmp469, <8 x i32> %tmp89, <4 x i32> %tmp91.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp471 = extractelement <4 x float> %tmp470, i32 0 %tmp472 = extractelement <4 x float> %tmp470, i32 1 %tmp473 = extractelement <4 x float> %tmp470, i32 2 @@ -611,7 +618,8 @@ %tmp568 = insertelement <8 x i32> %tmp567, i32 %tmp562, i32 5 %tmp569 = insertelement <8 x i32> %tmp568, i32 undef, i32 6 %tmp570 = insertelement <8 x i32> %tmp569, i32 undef, i32 7 - %tmp571 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %tmp570, <32 x i8> %tmp73, <16 x i8> %tmp75, i32 2) + %tmp75.bc = bitcast <16 x i8> %tmp75 to <4 x i32> + %tmp571 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp570, <8 x i32> %tmp73, <4 x i32> %tmp75.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp572 = extractelement <4 x float> %tmp571, i32 0 %tmp573 = extractelement <4 x float> %tmp571, i32 1 %tmp574 = extractelement <4 x float> %tmp571, i32 2 @@ -635,7 +643,7 @@ %tmp588 = insertelement <8 x i32> %tmp587, i32 %tmp586, i32 5 %tmp589 = insertelement <8 x i32> %tmp588, i32 undef, i32 6 %tmp590 = insertelement <8 x i32> %tmp589, i32 undef, i32 7 - %tmp591 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %tmp590, <32 x i8> %tmp61, <16 x i8> %tmp63, i32 2) + %tmp591 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp590, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp592 = extractelement <4 x float> %tmp591, i32 3 %tmp593 = fcmp oge float %temp30.1, %tmp592 %tmp594 = sext i1 %tmp593 to i32 @@ -660,7 +668,7 @@ ; CHECK-LABEL: {{^}}main1: ; CHECK: s_endpgm -define void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { +define void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { main_body: %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0 %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 @@ -767,40 +775,40 @@ %tmp122 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 716) %tmp123 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 864) %tmp124 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 868) - %tmp125 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %arg2, i64 0, i32 0 - %tmp126 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp125, !tbaa !0 + %tmp125 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 0 + %tmp126 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp125, !tbaa !0 %tmp127 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 0 %tmp128 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp127, !tbaa !0 - %tmp129 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %arg2, i64 0, i32 1 - %tmp130 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp129, !tbaa !0 + %tmp129 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 1 + %tmp130 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp129, !tbaa !0 %tmp131 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 1 %tmp132 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp131, !tbaa !0 - %tmp133 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %arg2, i64 0, i32 2 - %tmp134 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp133, !tbaa !0 + %tmp133 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 2 + %tmp134 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp133, !tbaa !0 %tmp135 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 2 %tmp136 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp135, !tbaa !0 - %tmp137 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %arg2, i64 0, i32 3 - %tmp138 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp137, !tbaa !0 + %tmp137 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 3 + %tmp138 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp137, !tbaa !0 %tmp139 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 3 %tmp140 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp139, !tbaa !0 - %tmp141 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %arg2, i64 0, i32 4 - %tmp142 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp141, !tbaa !0 + %tmp141 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 4 + %tmp142 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp141, !tbaa !0 %tmp143 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 4 %tmp144 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp143, !tbaa !0 - %tmp145 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %arg2, i64 0, i32 5 - %tmp146 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp145, !tbaa !0 + %tmp145 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 5 + %tmp146 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp145, !tbaa !0 %tmp147 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 5 %tmp148 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp147, !tbaa !0 - %tmp149 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %arg2, i64 0, i32 6 - %tmp150 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp149, !tbaa !0 + %tmp149 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 6 + %tmp150 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp149, !tbaa !0 %tmp151 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 6 %tmp152 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp151, !tbaa !0 - %tmp153 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %arg2, i64 0, i32 7 - %tmp154 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp153, !tbaa !0 + %tmp153 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 7 + %tmp154 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp153, !tbaa !0 %tmp155 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 7 %tmp156 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp155, !tbaa !0 - %tmp157 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %arg2, i64 0, i32 8 - %tmp158 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp157, !tbaa !0 + %tmp157 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 8 + %tmp158 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp157, !tbaa !0 %tmp159 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 8 %tmp160 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp159, !tbaa !0 %tmp161 = fcmp ugt float %arg17, 0.000000e+00 @@ -868,7 +876,8 @@ %tmp222 = bitcast float %tmp174 to i32 %tmp223 = insertelement <2 x i32> undef, i32 %tmp221, i32 0 %tmp224 = insertelement <2 x i32> %tmp223, i32 %tmp222, i32 1 - %tmp225 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp224, <32 x i8> %tmp130, <16 x i8> %tmp132, i32 2) + %tmp132.bc = bitcast <16 x i8> %tmp132 to <4 x i32> + %tmp225 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp224, <8 x i32> %tmp130, <4 x i32> %tmp132.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp226 = extractelement <4 x float> %tmp225, i32 0 %tmp227 = extractelement <4 x float> %tmp225, i32 1 %tmp228 = extractelement <4 x float> %tmp225, i32 2 @@ -938,7 +947,8 @@ %tmp279 = insertelement <4 x i32> %tmp278, i32 %tmp277, i32 1 %tmp280 = insertelement <4 x i32> %tmp279, i32 0, i32 2 %tmp281 = insertelement <4 x i32> %tmp280, i32 undef, i32 3 - %tmp282 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %tmp281, <32 x i8> %tmp146, <16 x i8> %tmp148, i32 2) + %tmp148.bc = bitcast <16 x i8> %tmp148 to <4 x i32> + %tmp282 = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> %tmp281, <8 x i32> %tmp146, <4 x i32> %tmp148.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp283 = extractelement <4 x float> %tmp282, i32 3 %tmp284 = fadd float %temp168.0, %tmp273 %tmp285 = fadd float %temp169.0, %tmp274 @@ -1001,7 +1011,8 @@ %tmp339 = bitcast float %tmp335 to i32 %tmp340 = insertelement <2 x i32> undef, i32 %tmp338, i32 0 %tmp341 = insertelement <2 x i32> %tmp340, i32 %tmp339, i32 1 - %tmp342 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp341, <32 x i8> %tmp134, <16 x i8> %tmp136, i32 2) + %tmp136.bc = bitcast <16 x i8> %tmp136 to <4 x i32> + %tmp342 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp341, <8 x i32> %tmp134, <4 x i32> %tmp136.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp343 = extractelement <4 x float> %tmp342, i32 0 %tmp344 = extractelement <4 x float> %tmp342, i32 1 %tmp345 = extractelement <4 x float> %tmp342, i32 2 @@ -1033,7 +1044,8 @@ %tmp359 = bitcast float %tmp337 to i32 %tmp360 = insertelement <2 x i32> undef, i32 %tmp358, i32 0 %tmp361 = insertelement <2 x i32> %tmp360, i32 %tmp359, i32 1 - %tmp362 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp361, <32 x i8> %tmp150, <16 x i8> %tmp152, i32 2) + %tmp152.bc = bitcast <16 x i8> %tmp152 to <4 x i32> + %tmp362 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp361, <8 x i32> %tmp150, <4 x i32> %tmp152.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp363 = extractelement <4 x float> %tmp362, i32 2 %tmp364 = fmul float %result.i40, %result.i %tmp365 = fmul float %result.i36, %result.i44 @@ -1043,7 +1055,8 @@ %tmp369 = bitcast float %tmp311 to i32 %tmp370 = insertelement <2 x i32> undef, i32 %tmp368, i32 0 %tmp371 = insertelement <2 x i32> %tmp370, i32 %tmp369, i32 1 - %tmp372 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp371, <32 x i8> %tmp138, <16 x i8> %tmp140, i32 2) + %tmp140.bc = bitcast <16 x i8> %tmp140 to <4 x i32> + %tmp372 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp371, <8 x i32> %tmp138, <4 x i32> %tmp140.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp373 = extractelement <4 x float> %tmp372, i32 0 %tmp374 = extractelement <4 x float> %tmp372, i32 1 %tmp375 = extractelement <4 x float> %tmp372, i32 2 @@ -1059,7 +1072,8 @@ %tmp383 = bitcast float %tmp321 to i32 %tmp384 = insertelement <2 x i32> undef, i32 %tmp382, i32 0 %tmp385 = insertelement <2 x i32> %tmp384, i32 %tmp383, i32 1 - %tmp386 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp385, <32 x i8> %tmp142, <16 x i8> %tmp144, i32 2) + %tmp144.bc = bitcast <16 x i8> %tmp144 to <4 x i32> + %tmp386 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp385, <8 x i32> %tmp142, <4 x i32> %tmp144.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp387 = extractelement <4 x float> %tmp386, i32 0 %tmp388 = extractelement <4 x float> %tmp386, i32 1 %tmp389 = extractelement <4 x float> %tmp386, i32 2 @@ -1155,7 +1169,8 @@ %tmp467 = bitcast float %tmp220 to i32 %tmp468 = insertelement <2 x i32> undef, i32 %tmp466, i32 0 %tmp469 = insertelement <2 x i32> %tmp468, i32 %tmp467, i32 1 - %tmp470 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp469, <32 x i8> %tmp158, <16 x i8> %tmp160, i32 2) + %tmp160.bc = bitcast <16 x i8> %tmp160 to <4 x i32> + %tmp470 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp469, <8 x i32> %tmp158, <4 x i32> %tmp160.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp471 = extractelement <4 x float> %tmp470, i32 0 %tmp472 = extractelement <4 x float> %tmp470, i32 1 %tmp473 = extractelement <4 x float> %tmp470, i32 2 @@ -1172,7 +1187,8 @@ %tmp484 = bitcast float %tmp172 to i32 %tmp485 = insertelement <2 x i32> undef, i32 %tmp483, i32 0 %tmp486 = insertelement <2 x i32> %tmp485, i32 %tmp484, i32 1 - %tmp487 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp486, <32 x i8> %tmp154, <16 x i8> %tmp156, i32 2) + %tmp156.bc = bitcast <16 x i8> %tmp156 to <4 x i32> + %tmp487 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp486, <8 x i32> %tmp154, <4 x i32> %tmp156.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp488 = extractelement <4 x float> %tmp487, i32 0 %tmp489 = extractelement <4 x float> %tmp487, i32 1 %tmp490 = extractelement <4 x float> %tmp487, i32 2 @@ -1377,7 +1393,8 @@ %tmp657 = insertelement <4 x i32> %tmp656, i32 %tmp654, i32 1 %tmp658 = insertelement <4 x i32> %tmp657, i32 %tmp655, i32 2 %tmp659 = insertelement <4 x i32> %tmp658, i32 undef, i32 3 - %tmp660 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %tmp659, <32 x i8> %tmp126, <16 x i8> %tmp128, i32 2) + %tmp128.bc = bitcast <16 x i8> %tmp128 to <4 x i32> + %tmp660 = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> %tmp659, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp661 = extractelement <4 x float> %tmp660, i32 0 %tmp662 = extractelement <4 x float> %tmp660, i32 1 %tmp663 = bitcast float %tmp646 to i32 @@ -1387,7 +1404,7 @@ %tmp667 = insertelement <4 x i32> %tmp666, i32 %tmp664, i32 1 %tmp668 = insertelement <4 x i32> %tmp667, i32 %tmp665, i32 2 %tmp669 = insertelement <4 x i32> %tmp668, i32 undef, i32 3 - %tmp670 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %tmp669, <32 x i8> %tmp126, <16 x i8> %tmp128, i32 2) + %tmp670 = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> %tmp669, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp671 = extractelement <4 x float> %tmp670, i32 0 %tmp672 = extractelement <4 x float> %tmp670, i32 1 %tmp673 = fsub float -0.000000e+00, %tmp662 @@ -1549,10 +1566,11 @@ declare float @llvm.AMDIL.clamp.(float, float, float) #1 ; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #2 +declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2 ; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.samplel.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #2 +declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2 + ; Function Attrs: readnone declare float @llvm.AMDIL.exp.(float) #1 @@ -1573,7 +1591,7 @@ declare float @llvm.amdgcn.rsq.f32(float) #2 ; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sampled.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32) #2 +declare <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2 ; Function Attrs: readnone declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1 @@ -1582,7 +1600,8 @@ declare float @fabs(float) #1 ; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sample.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #2 +declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2 + ; Function Attrs: nounwind readnone declare float @llvm.pow.f32(float, float) #2 Index: test/CodeGen/AMDGPU/subreg-coalescer-crash.ll =================================================================== --- test/CodeGen/AMDGPU/subreg-coalescer-crash.ll +++ test/CodeGen/AMDGPU/subreg-coalescer-crash.ll @@ -67,7 +67,7 @@ br label %bb4 bb9: ; preds = %bb2 - %tmp10 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 2) + %tmp10 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp11 = extractelement <4 x float> %tmp10, i32 1 %tmp12 = extractelement <4 x float> %tmp10, i32 3 br label %bb14 @@ -98,7 +98,7 @@ } ; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1 +declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 ; Function Attrs: nounwind readnone declare i32 @llvm.SI.packf16(float, float) #1