Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -11,6 +11,10 @@ // //===----------------------------------------------------------------------===// +class AMDGPUReadPreloadRegisterIntrinsic<string name> + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>, + GCCBuiltin<name>; + let TargetPrefix = "r600" in { class R600ReadPreloadRegisterIntrinsic<string name> @@ -41,15 +45,30 @@ Intrinsic<[], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], []>, GCCBuiltin<"__builtin_r600_rat_store_typed">; +def int_r600_rsq : Intrinsic< + [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] +>; + +def int_r600_read_workdim : AMDGPUReadPreloadRegisterIntrinsic < + "__builtin_r600_read_workdim" +>; + } // End TargetPrefix = "r600" +// FIXME: These should be renamed/moved to r600 let TargetPrefix = "AMDGPU" in { +def int_AMDGPU_rsq_clamped : Intrinsic< + [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] +>; -class AMDGPUReadPreloadRegisterIntrinsic<string name> - : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>, - GCCBuiltin<name>; +def int_AMDGPU_ldexp : Intrinsic< + [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem] +>; +} -def int_AMDGPU_div_scale : GCCBuiltin<"__builtin_amdgpu_div_scale">, +let TargetPrefix = "amdgcn" in { + +def int_amdgcn_div_scale : GCCBuiltin<"__builtin_amdgcn_div_scale">, // 1st parameter: Numerator // 2nd parameter: Denominator // 3rd parameter: Constant to select select between first and @@ -58,43 +77,39 @@ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty], [IntrNoMem]>; -def int_AMDGPU_div_fmas : GCCBuiltin<"__builtin_amdgpu_div_fmas">, +def int_amdgcn_div_fmas : GCCBuiltin<"__builtin_amdgcn_div_fmas">, Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty], [IntrNoMem]>; -def int_AMDGPU_div_fixup : GCCBuiltin<"__builtin_amdgpu_div_fixup">, +def int_amdgcn_div_fixup : GCCBuiltin<"__builtin_amdgcn_div_fixup">, 
Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; -def int_AMDGPU_trig_preop : GCCBuiltin<"__builtin_amdgpu_trig_preop">, +def int_amdgcn_trig_preop : GCCBuiltin<"__builtin_amdgcn_trig_preop">, Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem]>; -def int_AMDGPU_rcp : GCCBuiltin<"__builtin_amdgpu_rcp">, +def int_amdgcn_rcp : GCCBuiltin<"__builtin_amdgcn_rcp">, Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; -def int_AMDGPU_rsq : GCCBuiltin<"__builtin_amdgpu_rsq">, +def int_amdgcn_rsq : GCCBuiltin<"__builtin_amdgcn_rsq">, Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; -def int_AMDGPU_rsq_clamped : GCCBuiltin<"__builtin_amdgpu_rsq_clamped">, +def int_amdgcn_rsq_clamped : GCCBuiltin<"__builtin_amdgcn_rsq_clamped">, Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; -def int_AMDGPU_ldexp : GCCBuiltin<"__builtin_amdgpu_ldexp">, +def int_amdgcn_ldexp : GCCBuiltin<"__builtin_amdgcn_ldexp">, Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem]>; -def int_AMDGPU_class : GCCBuiltin<"__builtin_amdgpu_class">, +def int_amdgcn_class : GCCBuiltin<"__builtin_amdgcn_class">, Intrinsic<[llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]>; -def int_AMDGPU_read_workdim : AMDGPUReadPreloadRegisterIntrinsic < - "__builtin_amdgpu_read_workdim">; - -} // End TargetPrefix = "AMDGPU" +def int_amdgcn_read_workdim : AMDGPUReadPreloadRegisterIntrinsic < + "__builtin_amdgcn_read_workdim">; -let TargetPrefix = "amdgcn" in { -// SI only def int_amdgcn_buffer_wbinvl1_sc : GCCBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">, Intrinsic<[], [], []>; Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -925,7 +925,7 @@ return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, Op.getOperand(1), Op.getOperand(2), 
Op.getOperand(3)); - case Intrinsic::AMDGPU_div_scale: { + case Intrinsic::amdgcn_div_scale: { // 3rd parameter required to be a constant. const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3)); if (!Param) @@ -947,28 +947,29 @@ Denominator, Numerator); } - case Intrinsic::AMDGPU_div_fmas: + case Intrinsic::amdgcn_div_fmas: return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), Op.getOperand(4)); - case Intrinsic::AMDGPU_div_fixup: + case Intrinsic::amdgcn_div_fixup: return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - case Intrinsic::AMDGPU_trig_preop: + case Intrinsic::amdgcn_trig_preop: return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, Op.getOperand(1), Op.getOperand(2)); - case Intrinsic::AMDGPU_rcp: + case Intrinsic::amdgcn_rcp: return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); - case Intrinsic::AMDGPU_rsq: + case Intrinsic::amdgcn_rsq: return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); case AMDGPUIntrinsic::AMDGPU_legacy_rsq: return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); + case Intrinsic::amdgcn_rsq_clamped: case Intrinsic::AMDGPU_rsq_clamped: if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { Type *Type = VT.getTypeForEVT(*DAG.getContext()); @@ -984,7 +985,8 @@ return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); } - case Intrinsic::AMDGPU_ldexp: + case Intrinsic::amdgcn_ldexp: + case Intrinsic::AMDGPU_ldexp: // Legacy name return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -1039,7 +1041,7 @@ Op.getOperand(1), Op.getOperand(2)); - case Intrinsic::AMDGPU_class: + case Intrinsic::amdgcn_class: return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1), Op.getOperand(2)); Index: lib/Target/AMDGPU/AMDGPUIntrinsics.td =================================================================== --- lib/Target/AMDGPU/AMDGPUIntrinsics.td +++ 
lib/Target/AMDGPU/AMDGPUIntrinsics.td @@ -61,10 +61,17 @@ def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + + // Deprecated in favor of llvm.bitreverse + def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + + // Deprecated in favor of llvm.amdgcn.s.barrier def int_AMDGPU_barrier_local : Intrinsic<[], [], [IntrConvergent]>; - def int_AMDGPU_barrier_global : Intrinsic<[], [], [IntrConvergent]>; + def int_AMDGPU_barrier_global : Intrinsic<[], [], [IntrConvergent]>; + + // Deprecated in favor of llvm.amdgcn.read.workdim + def int_AMDGPU_read_workdim : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>; } // Legacy names for compatibility. Index: lib/Target/AMDGPU/R600ISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/R600ISelLowering.cpp +++ lib/Target/AMDGPU/R600ISelLowering.cpp @@ -825,7 +825,8 @@ case Intrinsic::r600_read_local_size_z: return LowerImplicitParameter(DAG, VT, DL, 8); - case Intrinsic::AMDGPU_read_workdim: { + case Intrinsic::r600_read_workdim: + case AMDGPUIntrinsic::AMDGPU_read_workdim: { // Legacy name. 
uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM); return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4); } @@ -848,7 +849,12 @@ case Intrinsic::r600_read_tidig_z: return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, AMDGPU::T0_Z, VT); - case Intrinsic::AMDGPU_rsq: + + // FIXME: Should be renamed to r600 prefix + case Intrinsic::AMDGPU_rsq_clamped: + return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); + + case Intrinsic::r600_rsq: // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior. return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); } Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -1312,7 +1312,8 @@ case Intrinsic::r600_read_local_size_z: return lowerImplicitZextParam(DAG, Op, MVT::i16, SI::KernelInputOffsets::LOCAL_SIZE_Z); - case Intrinsic::AMDGPU_read_workdim: + case Intrinsic::amdgcn_read_workdim: + case AMDGPUIntrinsic::AMDGPU_read_workdim: // Legacy name. // Really only 2 bits. 
return lowerImplicitZextParam(DAG, Op, MVT::i8, getImplicitParameterOffset(MFI, GRID_DIM)); Index: lib/Transforms/InstCombine/InstCombineCalls.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineCalls.cpp +++ lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1614,7 +1614,7 @@ break; } - case Intrinsic::AMDGPU_rcp: { + case Intrinsic::amdgcn_rcp: { if (const ConstantFP *C = dyn_cast<ConstantFP>(II->getArgOperand(0))) { const APFloat &ArgVal = C->getValueAPF(); APFloat Val(ArgVal.getSemantics(), 1.0); Index: test/CodeGen/AMDGPU/big_alu.ll =================================================================== --- test/CodeGen/AMDGPU/big_alu.ll +++ test/CodeGen/AMDGPU/big_alu.ll @@ -100,7 +100,7 @@ %88 = insertelement <4 x float> %87, float %32, i32 2 %89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3 %90 = call float @llvm.AMDGPU.dp4(<4 x float> %85, <4 x float> %89) - %91 = call float @llvm.AMDGPU.rsq.f32(float %90) + %91 = call float @llvm.AMDGPU.rsq.clamped.f32(float %90) %92 = fmul float %30, %91 %93 = fmul float %31, %91 %94 = fmul float %32, %91 @@ -343,7 +343,7 @@ %325 = insertelement <4 x float> %324, float %318, i32 2 %326 = insertelement <4 x float> %325, float 0.000000e+00, i32 3 %327 = call float @llvm.AMDGPU.dp4(<4 x float> %322, <4 x float> %326) - %328 = call float @llvm.AMDGPU.rsq.f32(float %327) + %328 = call float @llvm.AMDGPU.rsq.clamped.f32(float %327) %329 = fmul float %314, %328 %330 = fmul float %316, %328 %331 = fmul float %318, %328 @@ -376,7 +376,7 @@ %358 = insertelement <4 x float> %357, float %45, i32 2 %359 = insertelement <4 x float> %358, float 0.000000e+00, i32 3 %360 = call float @llvm.AMDGPU.dp4(<4 x float> %355, <4 x float> %359) - %361 = call float @llvm.AMDGPU.rsq.f32(float %360) + %361 = call float @llvm.AMDGPU.rsq.clamped.f32(float %360) %362 = fmul float %45, %361 %363 = call float @fabs(float %362) %364 = fmul float %176, 0x3FECCCCCC0000000 @@ -402,7 +402,7 @@ %384 = 
insertelement <4 x float> %383, float %45, i32 2 %385 = insertelement <4 x float> %384, float 0.000000e+00, i32 3 %386 = call float @llvm.AMDGPU.dp4(<4 x float> %381, <4 x float> %385) - %387 = call float @llvm.AMDGPU.rsq.f32(float %386) + %387 = call float @llvm.AMDGPU.rsq.clamped.f32(float %386) %388 = fmul float %45, %387 %389 = call float @fabs(float %388) %390 = fmul float %176, 0x3FF51EB860000000 @@ -1040,7 +1040,7 @@ %896 = insertelement <4 x float> %895, float %45, i32 2 %897 = insertelement <4 x float> %896, float 0.000000e+00, i32 3 %898 = call float @llvm.AMDGPU.dp4(<4 x float> %893, <4 x float> %897) - %899 = call float @llvm.AMDGPU.rsq.f32(float %898) + %899 = call float @llvm.AMDGPU.rsq.clamped.f32(float %898) %900 = fmul float %45, %899 %901 = call float @fabs(float %900) %902 = fmul float %176, 0x3FECCCCCC0000000 @@ -1149,7 +1149,7 @@ declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 ; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq.f32(float) #1 +declare float @llvm.AMDGPU.rsq.clamped.f32(float) #1 ; Function Attrs: readnone declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1 Index: test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll +++ /dev/null @@ -1,499 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i1 @llvm.AMDGPU.class.f32(float, i32) #1 -declare i1 @llvm.AMDGPU.class.f64(double, i32) #1 -declare i32 @llvm.r600.read.tidig.x() #1 -declare float @llvm.fabs.f32(float) #1 -declare double @llvm.fabs.f64(double) #1 - -; SI-LABEL: {{^}}test_class_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: 
buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_fabs_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { - %a.fabs = call float @llvm.fabs.f32(float %a) #1 - %result = call i1 @llvm.AMDGPU.class.f32(float %a.fabs, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_fneg_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { - %a.fneg = fsub float -0.0, %a - %result = call i1 @llvm.AMDGPU.class.f32(float %a.fneg, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_fneg_fabs_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]] -; 
SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { - %a.fabs = call float @llvm.fabs.f32(float %a) #1 - %a.fneg.fabs = fsub float -0.0, %a.fabs - %result = call i1 @llvm.AMDGPU.class.f32(float %a.fneg.fabs, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_1_f32: -; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 1{{$}} -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_64_f32: -; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 64{{$}} -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; Set all 10 bits of mask -; SI-LABEL: {{^}}test_class_full_mask_f32: -; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}} -; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 { - %result = call i1 
@llvm.AMDGPU.class.f32(float %a, i32 1023) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_9bit_mask_f32: -; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} -; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 511) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}v_test_class_full_mask_f32: -; SI-DAG: buffer_load_dword [[VA:v[0-9]+]] -; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} -; SI: v_cmp_class_f32_e32 vcc, [[VA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 511) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %gep.out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f32: -; SI-DAG: buffer_load_dword [[VB:v[0-9]+]] -; SI: v_cmp_class_f32_e32 vcc, 1.0, [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, 
i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %b = load i32, i32 addrspace(1)* %gep.in - - %result = call i1 @llvm.AMDGPU.class.f32(float 1.0, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %gep.out, align 4 - ret void -} - -; FIXME: Why isn't this using a literal constant operand? -; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f32: -; SI-DAG: buffer_load_dword [[VB:v[0-9]+]] -; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 -; SI: v_cmp_class_f32_e32 vcc, [[VK]], [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %b = load i32, i32 addrspace(1)* %gep.in - - %result = call i1 @llvm.AMDGPU.class.f32(float 1024.0, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %gep.out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { - %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_fabs_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; 
SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { - %a.fabs = call double @llvm.fabs.f64(double %a) #1 - %result = call i1 @llvm.AMDGPU.class.f64(double %a.fabs, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_fneg_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { - %a.fneg = fsub double -0.0, %a - %result = call i1 @llvm.AMDGPU.class.f64(double %a.fneg, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_fneg_fabs_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { - %a.fabs = call double @llvm.fabs.f64(double %a) #1 - %a.fneg.fabs = fsub double -0.0, %a.fabs - %result = call i1 @llvm.AMDGPU.class.f64(double %a.fneg.fabs, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 
%sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_1_f64: -; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}} -; SI: s_endpgm -define void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 1) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_64_f64: -; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}} -; SI: s_endpgm -define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 64) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; Set all 9 bits of mask -; SI-LABEL: {{^}}test_class_full_mask_f64: -; SI: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} -; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]] -; SI-NOT: vcc -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 511) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}v_test_class_full_mask_f64: -; SI-DAG: buffer_load_dwordx2 [[VA:v\[[0-9]+:[0-9]+\]]] -; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} -; SI: v_cmp_class_f64_e32 vcc, [[VA]], [[MASK]] -; SI-NOT: vcc -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 
addrspace(1)* %out, i32 %tid - %a = load double, double addrspace(1)* %in - - %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 511) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %gep.out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f64: -; XSI: v_cmp_class_f64_e32 vcc, 1.0, -; SI: v_cmp_class_f64_e32 vcc, -; SI: s_endpgm -define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %b = load i32, i32 addrspace(1)* %gep.in - - %result = call i1 @llvm.AMDGPU.class.f64(double 1.0, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %gep.out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f64: -; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} -; SI: s_endpgm -define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %b = load i32, i32 addrspace(1)* %gep.in - - %result = call i1 @llvm.AMDGPU.class.f64(double 1024.0, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %gep.out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_fold_or_class_f32_0: -; SI-NOT: v_cmp_class -; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - 
%a = load float, float addrspace(1)* %gep.in - - %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 - %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 3) #1 - %or = or i1 %class0, %class1 - - %sext = sext i1 %or to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_fold_or3_class_f32_0: -; SI-NOT: v_cmp_class -; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 - %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1 - %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 - %or.0 = or i1 %class0, %class1 - %or.1 = or i1 %or.0, %class2 - - %sext = sext i1 %or.1 to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_fold_or_all_tests_class_f32_0: -; SI-NOT: v_cmp_class -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}} -; SI: v_cmp_class_f32_e32 vcc, v{{[0-9]+}}, [[MASK]]{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 - %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1 - %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 - %class3 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1 - %class4 = call i1 
@llvm.AMDGPU.class.f32(float %a, i32 16) #1 - %class5 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 32) #1 - %class6 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1 - %class7 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 128) #1 - %class8 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 256) #1 - %class9 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 512) #1 - %or.0 = or i1 %class0, %class1 - %or.1 = or i1 %or.0, %class2 - %or.2 = or i1 %or.1, %class3 - %or.3 = or i1 %or.2, %class4 - %or.4 = or i1 %or.3, %class5 - %or.5 = or i1 %or.4, %class6 - %or.6 = or i1 %or.5, %class7 - %or.7 = or i1 %or.6, %class8 - %or.8 = or i1 %or.7, %class9 - %sext = sext i1 %or.8 to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_fold_or_class_f32_1: -; SI-NOT: v_cmp_class -; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 - %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1 - %or = or i1 %class0, %class1 - - %sext = sext i1 %or to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_fold_or_class_f32_2: -; SI-NOT: v_cmp_class -; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - 
%class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1 - %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1 - %or = or i1 %class0, %class1 - - %sext = sext i1 %or to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_no_fold_or_class_f32_0: -; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 4{{$}} -; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}} -; SI: s_or_b64 -; SI: s_endpgm -define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 - %class1 = call i1 @llvm.AMDGPU.class.f32(float %b, i32 8) #1 - %or = or i1 %class0, %class1 - - %sext = sext i1 %or to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_0_f32: -; SI-NOT: v_cmp_class -; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 0) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_0_f64: -; SI-NOT: v_cmp_class -; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 0) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/llvm.AMDGPU.div_fixup.ll 
=================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.div_fixup.ll +++ /dev/null @@ -1,31 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s - -declare float @llvm.AMDGPU.div.fixup.f32(float, float, float) nounwind readnone -declare double @llvm.AMDGPU.div.fixup.f64(double, double, double) nounwind readnone - -; GCN-LABEL: {{^}}test_div_fixup_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] -; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { - %result = call float @llvm.AMDGPU.div.fixup.f32(float %a, float %b, float %c) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fixup_f64: -; GCN: v_div_fixup_f64 -define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind { - %result = call double @llvm.AMDGPU.div.fixup.f64(double %a, double %b, double %c) nounwind readnone - store double %result, double addrspace(1)* %out, align 8 - ret void -} Index: test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll +++ 
/dev/null @@ -1,178 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s -; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=VI %s - -; FIXME: Enable for VI. - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone -declare float @llvm.AMDGPU.div.fmas.f32(float, float, float, i1) nounwind readnone -declare double @llvm.AMDGPU.div.fmas.f64(double, double, double, i1) nounwind readnone - -; GCN-LABEL: {{^}}test_div_fmas_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] -; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] -; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VB]], [[VA]], [[VC]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_0: -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void 
@test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { - %result = call float @llvm.AMDGPU.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_1: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] -; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] -; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VA]], [[VC]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_2: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0 -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f64: -; GCN: v_div_fmas_f64 -define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind { - %result = call double @llvm.AMDGPU.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone - store double %result, double addrspace(1)* 
%out, align 8 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc: -; SI: v_cmp_eq_i32_e64 vcc, 0, s{{[0-9]+}} -; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind { - %cmp = icmp eq i32 %i, 0 - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc: -; SI: s_mov_b64 vcc, 0 -; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc: -; SI: s_mov_b64 vcc, -1 -; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_logical_cond_to_vcc: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-DAG: v_cmp_eq_i32_e32 [[CMP0:vcc]], 0, v{{[0-9]+}} -; SI-DAG: v_cmp_ne_i32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}} -; SI: s_and_b64 vcc, [[CMP0]], 
[[CMP1]] -; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]] -; SI: s_endpgm -define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1 - %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 2 - - %a = load float, float addrspace(1)* %gep.a - %b = load float, float addrspace(1)* %gep.b - %c = load float, float addrspace(1)* %gep.c - - %cmp0 = icmp eq i32 %tid, 0 - %cmp1 = icmp ne i32 %d, 0 - %and = and i1 %cmp0, %cmp1 - - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %and) nounwind readnone - store float %result, float addrspace(1)* %gep.out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc: -; SI: v_cmp_eq_i32_e32 vcc, 0, v{{[0-9]+}} -; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc -; SI: s_xor_b64 [[SAVE]], exec, [[SAVE]] - -; SI: buffer_load_dword [[LOAD:v[0-9]+]] -; SI: v_cmp_ne_i32_e32 vcc, 0, [[LOAD]] -; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc - - -; SI: BB9_2: -; SI: s_or_b64 exec, exec, [[SAVE]] -; SI: v_cmp_ne_i32_e32 vcc, 0, v0 -; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: buffer_store_dword -; SI: s_endpgm -define void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind { -entry: - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.out = getelementptr float, float addrspace(1)* %out, i32 2 - %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1 - %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2 - - %a = load float, float addrspace(1)* %gep.a - %b = load 
float, float addrspace(1)* %gep.b - %c = load float, float addrspace(1)* %gep.c - - %cmp0 = icmp eq i32 %tid, 0 - br i1 %cmp0, label %bb, label %exit - -bb: - %val = load i32, i32 addrspace(1)* %dummy - %cmp1 = icmp ne i32 %val, 0 - br label %exit - -exit: - %cond = phi i1 [false, %entry], [%cmp1, %bb] - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone - store float %result, float addrspace(1)* %gep.out, align 4 - ret void -} Index: test/CodeGen/AMDGPU/llvm.AMDGPU.div_scale.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.div_scale.ll +++ /dev/null @@ -1,364 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone -declare { float, i1 } @llvm.AMDGPU.div.scale.f32(float, float, i1) nounwind readnone -declare { double, i1 } @llvm.AMDGPU.div.scale.f64(double, double, i1) nounwind readnone -declare float @llvm.fabs.f32(float) nounwind readnone - -; SI-LABEL @test_div_scale_f32_1: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone - %result0 = 
extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_2: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f64_1: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double 
addrspace(1)* %gep.1, align 8 - - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f64_1: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f32_scalar_num_1: -; SI-DAG: buffer_load_dword [[B:v[0-9]+]] -; SI-DAG: s_load_dword [[A:s[0-9]+]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr float, float addrspace(1)* %in, i32 %tid - - %b = load float, float addrspace(1)* %gep, align 4 - - %result = call { float, i1 } 
@llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_scalar_num_2: -; SI-DAG: buffer_load_dword [[B:v[0-9]+]] -; SI-DAG: s_load_dword [[A:s[0-9]+]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr float, float addrspace(1)* %in, i32 %tid - - %b = load float, float addrspace(1)* %gep, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_scalar_den_1: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]] -; SI-DAG: s_load_dword [[B:s[0-9]+]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr float, float addrspace(1)* %in, i32 %tid - - %a = load float, float addrspace(1)* %gep, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_scalar_den_2: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]] -; SI-DAG: s_load_dword [[B:s[0-9]+]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], 
[[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr float, float addrspace(1)* %in, i32 %tid - - %a = load float, float addrspace(1)* %gep, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f64_scalar_num_1: -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - - %b = load double, double addrspace(1)* %gep, align 8 - - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f64_scalar_num_2: -; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { - %tid 
= call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - - %b = load double, double addrspace(1)* %gep, align 8 - - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f64_scalar_den_1: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - - %a = load double, double addrspace(1)* %gep, align 8 - - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f64_scalar_den_2: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - - %a = load double, double addrspace(1)* %gep, align 8 - - %result = call { double, i1 } 
@llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f32_all_scalar_1: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind { - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_all_scalar_2: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind { - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f64_all_scalar_1: -; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 v[[VA_LO:[0-9]+]], s[[A_LO]] -; SI-DAG: v_mov_b32_e32 v[[VA_HI:[0-9]+]], s[[A_HI]] -; SI: v_div_scale_f64 
[[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}} -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind { - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f64_all_scalar_2: -; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 v[[VB_LO:[0-9]+]], s[[B_LO]] -; SI-DAG: v_mov_b32_e32 v[[VB_HI:[0-9]+]], s[[B_HI]] -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind { - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f32_inline_imm_num: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[A]], 1.0 -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %a = load float, float addrspace(1)* %gep.0, align 4 - - %result = call { float, 
i1 } @llvm.AMDGPU.div.scale.f32(float 1.0, float %a, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_inline_imm_den: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], 2.0, 2.0, [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %a = load float, float addrspace(1)* %gep.0, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float 2.0, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_fabs_num: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], |[[A]]| -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a.fabs, float 
%b, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_fabs_den: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], |[[B]]|, |[[B]]|, [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b.fabs, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} Index: test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll +++ /dev/null @@ -1,23 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare float @llvm.AMDGPU.ldexp.f32(float, i32) nounwind readnone -declare double @llvm.AMDGPU.ldexp.f64(double, i32) nounwind readnone - -; SI-LABEL: {{^}}test_ldexp_f32: -; SI: v_ldexp_f32 -; SI: s_endpgm -define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind { - %result = call float 
@llvm.AMDGPU.ldexp.f32(float %a, i32 %b) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_ldexp_f64: -; SI: v_ldexp_f64 -; SI: s_endpgm -define void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind { - %result = call double @llvm.AMDGPU.ldexp.f64(double %a, i32 %b) nounwind readnone - store double %result, double addrspace(1)* %out, align 8 - ret void -} Index: test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone -declare double @llvm.sqrt.f64(double) nounwind readnone - -; FUNC-LABEL: {{^}}rcp_f64: -; SI: v_rcp_f64_e32 -define void @rcp_f64(double addrspace(1)* %out, double %src) nounwind { - %rcp = call double @llvm.AMDGPU.rcp.f64(double %src) nounwind readnone - store double %rcp, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}rcp_pat_f64: -; SI: v_rcp_f64_e32 -define void @rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind { - %rcp = fdiv double 1.0, %src - store double %rcp, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}rsq_rcp_pat_f64: -; SI-UNSAFE: v_rsq_f64_e32 -; SI-SAFE-NOT: v_rsq_f64_e32 -; SI-SAFE: v_sqrt_f64 -; SI-SAFE: v_rcp_f64 -define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind { - %sqrt = call double @llvm.sqrt.f64(double %src) nounwind readnone - %rcp = call double @llvm.AMDGPU.rcp.f64(double %sqrt) nounwind readnone - store double %rcp, double addrspace(1)* %out, align 8 - ret void -} Index: test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll 
=================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll +++ /dev/null @@ -1,50 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; XUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; XUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s - -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG-SAFE -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone -declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone - -declare float @llvm.sqrt.f32(float) nounwind readnone - -; FUNC-LABEL: {{^}}rcp_f32: -; SI: v_rcp_f32_e32 -; EG: RECIP_IEEE -define void @rcp_f32(float addrspace(1)* %out, float %src) nounwind { - %rcp = call float @llvm.AMDGPU.rcp.f32(float %src) nounwind readnone - store float %rcp, float addrspace(1)* %out, align 4 - ret void -} - -; FIXME: Evergreen only ever does unsafe fp math. 
-; FUNC-LABEL: {{^}}rcp_pat_f32: - -; SI-SAFE: v_rcp_f32_e32 -; XSI-SAFE-SPDENORM-NOT: v_rcp_f32_e32 - -; EG: RECIP_IEEE - -define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { - %rcp = fdiv float 1.0, %src - store float %rcp, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}rsq_rcp_pat_f32: -; SI-UNSAFE: v_rsq_f32_e32 -; SI-SAFE: v_sqrt_f32_e32 -; SI-SAFE: v_rcp_f32_e32 - -; EG: RECIPSQRT_IEEE -define void @rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { - %sqrt = call float @llvm.sqrt.f32(float %src) nounwind readnone - %rcp = call float @llvm.AMDGPU.rcp.f32(float %sqrt) nounwind readnone - store float %rcp, float addrspace(1)* %out, align 4 - ret void -} Index: test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll +++ /dev/null @@ -1,37 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}read_workdim: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV * [[VAL]], KC0[2].Z - -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c -; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN-NOHSA: buffer_store_dword [[VVAL]] -define void @read_workdim(i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.AMDGPU.read.workdim() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}read_workdim_known_bits: -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb -; VI: s_load_dword 
[[VAL:s[0-9]+]], s[0:1], 0x2c -; GCN-NOT: 0xff -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @read_workdim_known_bits(i32 addrspace(1)* %out) { -entry: - %dim = call i32 @llvm.AMDGPU.read.workdim() #0 - %shl = shl i32 %dim, 24 - %shr = lshr i32 %shl, 24 - store i32 %shr, i32 addrspace(1)* %out - ret void -} - -declare i32 @llvm.AMDGPU.read.workdim() #0 - -attributes #0 = { readnone } Index: test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll +++ /dev/null @@ -1,23 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s - -declare double @llvm.AMDGPU.rsq.clamped.f64(double) nounwind readnone - -; FUNC-LABEL: {{^}}rsq_clamped_f64: -; SI: v_rsq_clamp_f64_e32 - -; VI: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[2:3] -; TODO: this constant should be folded: -; VI: s_mov_b32 s[[ALLBITS:[0-9+]]], -1 -; VI: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff -; VI: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]] -; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] -; VI: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff -; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]] -; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]] - -define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind { - %rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone - store double %rsq_clamped, double addrspace(1)* %out, align 8 - ret void -} Index: test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll +++ test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll @@ -2,6 +2,8 @@ ; RUN: llc -march=amdgcn -mcpu=tonga 
-verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; FIXME: Uses of this should be moved to llvm.amdgcn.rsq.clamped, and +; an r600 variant added. declare float @llvm.AMDGPU.rsq.clamped.f32(float) nounwind readnone Index: test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.AMDGPU.rsq.f32(float) nounwind readnone - -; FUNC-LABEL: {{^}}rsq_f32: -; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -; EG: RECIPSQRT_IEEE -define void @rsq_f32(float addrspace(1)* %out, float %src) nounwind { - %rsq = call float @llvm.AMDGPU.rsq.f32(float %src) nounwind readnone - store float %rsq, float addrspace(1)* %out, align 4 - ret void -} - -; TODO: Really these should be constant folded -; FUNC-LABEL: {{^}}rsq_f32_constant_4.0 -; SI: v_rsq_f32_e32 {{v[0-9]+}}, 4.0 -; EG: RECIPSQRT_IEEE -define void @rsq_f32_constant_4.0(float addrspace(1)* %out) nounwind { - %rsq = call float @llvm.AMDGPU.rsq.f32(float 4.0) nounwind readnone - store float %rsq, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}rsq_f32_constant_100.0 -; SI: v_rsq_f32_e32 {{v[0-9]+}}, 0x42c80000 -; EG: RECIPSQRT_IEEE -define void @rsq_f32_constant_100.0(float addrspace(1)* %out) nounwind { - %rsq = call float @llvm.AMDGPU.rsq.f32(float 100.0) nounwind readnone - store float %rsq, float addrspace(1)* %out, align 4 - ret void -} Index: 
test/CodeGen/AMDGPU/llvm.AMDGPU.trig_preop.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.trig_preop.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare double @llvm.AMDGPU.trig.preop.f64(double, i32) nounwind readnone - -; SI-LABEL: {{^}}test_trig_preop_f64: -; SI-DAG: buffer_load_dword [[SEG:v[0-9]+]] -; SI-DAG: buffer_load_dwordx2 [[SRC:v\[[0-9]+:[0-9]+\]]], -; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]] -; SI: buffer_store_dwordx2 [[RESULT]], -; SI: s_endpgm -define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load double, double addrspace(1)* %aptr, align 8 - %b = load i32, i32 addrspace(1)* %bptr, align 4 - %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 %b) nounwind readnone - store double %result, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}test_trig_preop_f64_imm_segment: -; SI: buffer_load_dwordx2 [[SRC:v\[[0-9]+:[0-9]+\]]], -; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7 -; SI: buffer_store_dwordx2 [[RESULT]], -; SI: s_endpgm -define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { - %a = load double, double addrspace(1)* %aptr, align 8 - %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 7) nounwind readnone - store double %result, double addrspace(1)* %out, align 8 - ret void -} Index: test/CodeGen/AMDGPU/llvm.amdgcn.class.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.class.ll @@ -0,0 +1,499 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i1 
@llvm.amdgcn.class.f32(float, i32) #1 +declare i1 @llvm.amdgcn.class.f64(double, i32) #1 +declare i32 @llvm.r600.read.tidig.x() #1 +declare float @llvm.fabs.f32(float) #1 +declare double @llvm.fabs.f64(double) #1 + +; SI-LABEL: {{^}}test_class_f32: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { + %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_fabs_f32: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { + %a.fabs = call float @llvm.fabs.f32(float %a) #1 + %result = call i1 @llvm.amdgcn.class.f32(float %a.fabs, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_fneg_f32: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void 
@test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { + %a.fneg = fsub float -0.0, %a + %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_fneg_fabs_f32: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { + %a.fabs = call float @llvm.fabs.f32(float %a) #1 + %a.fneg.fabs = fsub float -0.0, %a.fabs + %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg.fabs, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_1_f32: +; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 1{{$}} +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 { + %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_64_f32: +; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 64{{$}} +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 { + %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) 
#1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; Set all 10 bits of mask +; SI-LABEL: {{^}}test_class_full_mask_f32: +; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}} +; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 { + %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1023) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_9bit_mask_f32: +; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} +; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 { + %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}v_test_class_full_mask_f32: +; SI-DAG: buffer_load_dword [[VA:v[0-9]+]] +; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} +; SI: v_cmp_class_f32_e32 vcc, [[VA]], [[MASK]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1 + %sext = sext i1 %result to i32 + 
store i32 %sext, i32 addrspace(1)* %gep.out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f32: +; SI-DAG: buffer_load_dword [[VB:v[0-9]+]] +; SI: v_cmp_class_f32_e32 vcc, 1.0, [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %b = load i32, i32 addrspace(1)* %gep.in + + %result = call i1 @llvm.amdgcn.class.f32(float 1.0, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %gep.out, align 4 + ret void +} + +; FIXME: Why isn't this using a literal constant operand? +; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f32: +; SI-DAG: buffer_load_dword [[VB:v[0-9]+]] +; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 +; SI: v_cmp_class_f32_e32 vcc, [[VK]], [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %b = load i32, i32 addrspace(1)* %gep.in + + %result = call i1 @llvm.amdgcn.class.f32(float 1024.0, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %gep.out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_f64: +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]] +; 
SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { + %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_fabs_f64: +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { + %a.fabs = call double @llvm.fabs.f64(double %a) #1 + %result = call i1 @llvm.amdgcn.class.f64(double %a.fabs, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_fneg_f64: +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { + %a.fneg = fsub double -0.0, %a + %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_fneg_fabs_f64: +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 
0xd +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { + %a.fabs = call double @llvm.fabs.f64(double %a) #1 + %a.fneg.fabs = fsub double -0.0, %a.fabs + %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg.fabs, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_1_f64: +; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}} +; SI: s_endpgm +define void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 { + %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 1) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_64_f64: +; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}} +; SI: s_endpgm +define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 { + %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 64) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; Set all 9 bits of mask +; SI-LABEL: {{^}}test_class_full_mask_f64: +; SI: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} +; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]] +; SI-NOT: vcc +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 { + %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: 
{{^}}v_test_class_full_mask_f64: +; SI-DAG: buffer_load_dwordx2 [[VA:v\[[0-9]+:[0-9]+\]]] +; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} +; SI: v_cmp_class_f64_e32 vcc, [[VA]], [[MASK]] +; SI-NOT: vcc +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load double, double addrspace(1)* %in + + %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %gep.out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f64: +; XSI: v_cmp_class_f64_e32 vcc, 1.0, +; SI: v_cmp_class_f64_e32 vcc, +; SI: s_endpgm +define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %b = load i32, i32 addrspace(1)* %gep.in + + %result = call i1 @llvm.amdgcn.class.f64(double 1.0, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %gep.out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f64: +; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; SI: s_endpgm +define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %b = load i32, i32 addrspace(1)* %gep.in + + %result = call i1 @llvm.amdgcn.class.f64(double 1024.0, 
i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %gep.out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_fold_or_class_f32_0: +; SI-NOT: v_cmp_class +; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}} +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1 + %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 3) #1 + %or = or i1 %class0, %class1 + + %sext = sext i1 %or to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_fold_or3_class_f32_0: +; SI-NOT: v_cmp_class +; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}} +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1 + %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 2) #1 + %class2 = call i1 @llvm.amdgcn.class.f32(float %a, i32 4) #1 + %or.0 = or i1 %class0, %class1 + %or.1 = or i1 %or.0, %class2 + + %sext = sext i1 %or.1 to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_fold_or_all_tests_class_f32_0: +; SI-NOT: v_cmp_class +; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}} +; SI: v_cmp_class_f32_e32 vcc, v{{[0-9]+}}, [[MASK]]{{$}} +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_fold_or_all_tests_class_f32_0(i32 
addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1 + %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 2) #1 + %class2 = call i1 @llvm.amdgcn.class.f32(float %a, i32 4) #1 + %class3 = call i1 @llvm.amdgcn.class.f32(float %a, i32 8) #1 + %class4 = call i1 @llvm.amdgcn.class.f32(float %a, i32 16) #1 + %class5 = call i1 @llvm.amdgcn.class.f32(float %a, i32 32) #1 + %class6 = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) #1 + %class7 = call i1 @llvm.amdgcn.class.f32(float %a, i32 128) #1 + %class8 = call i1 @llvm.amdgcn.class.f32(float %a, i32 256) #1 + %class9 = call i1 @llvm.amdgcn.class.f32(float %a, i32 512) #1 + %or.0 = or i1 %class0, %class1 + %or.1 = or i1 %or.0, %class2 + %or.2 = or i1 %or.1, %class3 + %or.3 = or i1 %or.2, %class4 + %or.4 = or i1 %or.3, %class5 + %or.5 = or i1 %or.4, %class6 + %or.6 = or i1 %or.5, %class7 + %or.7 = or i1 %or.6, %class8 + %or.8 = or i1 %or.7, %class9 + %sext = sext i1 %or.8 to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_fold_or_class_f32_1: +; SI-NOT: v_cmp_class +; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}} +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 4) #1 + %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 8) #1 + %or = or i1 %class0, %class1 + + %sext = sext i1 %or to i32 + store i32 
%sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_fold_or_class_f32_2: +; SI-NOT: v_cmp_class +; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}} +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 7) #1 + %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 7) #1 + %or = or i1 %class0, %class1 + + %sext = sext i1 %or to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_no_fold_or_class_f32_0: +; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 4{{$}} +; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}} +; SI: s_or_b64 +; SI: s_endpgm +define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 4) #1 + %class1 = call i1 @llvm.amdgcn.class.f32(float %b, i32 8) #1 + %or = or i1 %class0, %class1 + + %sext = sext i1 %or to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_0_f32: +; SI-NOT: v_cmp_class +; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 { + %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 0) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, 
align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_0_f64: +; SI-NOT: v_cmp_class +; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 { + %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 0) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll @@ -0,0 +1,31 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s + +declare float @llvm.amdgcn.div.fixup.f32(float, float, float) nounwind readnone +declare double @llvm.amdgcn.div.fixup.f64(double, double, double) nounwind readnone + +; GCN-LABEL: {{^}}test_div_fixup_f32: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] +; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm +define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { + %result = call float @llvm.amdgcn.div.fixup.f32(float %a, float %b, float %c) nounwind readnone + store float %result, float addrspace(1)* 
%out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fixup_f64: +; GCN: v_div_fixup_f64 +define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind { + %result = call double @llvm.amdgcn.div.fixup.f64(double %a, double %b, double %c) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} Index: test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll @@ -0,0 +1,178 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s +; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=VI %s + +; FIXME: Enable for VI. + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone +declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) nounwind readnone +declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind readnone + +; GCN-LABEL: {{^}}test_div_fmas_f32: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] +; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] +; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VB]], [[VA]], [[VC]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm +define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { + %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, 
i1 %d) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_0: +; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { + %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_1: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] +; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] +; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VA]], [[VC]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { + %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_2: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0 +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) 
nounwind { + %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f64: +; GCN: v_div_fmas_f64 +define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind { + %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc: +; SI: v_cmp_eq_i32_e64 vcc, 0, s{{[0-9]+}} +; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind { + %cmp = icmp eq i32 %i, 0 + %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc: +; SI: s_mov_b64 vcc, 0 +; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { + %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc: +; SI: s_mov_b64 vcc, -1 +; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { + %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: 
{{^}}test_div_fmas_f32_logical_cond_to_vcc: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-DAG: v_cmp_eq_i32_e32 [[CMP0:vcc]], 0, v{{[0-9]+}} +; SI-DAG: v_cmp_ne_i32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}} +; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]] +; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]] +; SI: s_endpgm +define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1 + %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 2 + + %a = load float, float addrspace(1)* %gep.a + %b = load float, float addrspace(1)* %gep.b + %c = load float, float addrspace(1)* %gep.c + + %cmp0 = icmp eq i32 %tid, 0 + %cmp1 = icmp ne i32 %d, 0 + %and = and i1 %cmp0, %cmp1 + + %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %and) nounwind readnone + store float %result, float addrspace(1)* %gep.out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc: +; SI: v_cmp_eq_i32_e32 vcc, 0, v{{[0-9]+}} +; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[SAVE]], exec, [[SAVE]] + +; SI: buffer_load_dword [[LOAD:v[0-9]+]] +; SI: v_cmp_ne_i32_e32 vcc, 0, [[LOAD]] +; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc + + +; SI: BB9_2: +; SI: s_or_b64 exec, exec, [[SAVE]] +; SI: v_cmp_ne_i32_e32 vcc, 0, v0 +; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: 
buffer_store_dword +; SI: s_endpgm +define void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind { +entry: + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.out = getelementptr float, float addrspace(1)* %out, i32 2 + %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1 + %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2 + + %a = load float, float addrspace(1)* %gep.a + %b = load float, float addrspace(1)* %gep.b + %c = load float, float addrspace(1)* %gep.c + + %cmp0 = icmp eq i32 %tid, 0 + br i1 %cmp0, label %bb, label %exit + +bb: + %val = load i32, i32 addrspace(1)* %dummy + %cmp1 = icmp ne i32 %val, 0 + br label %exit + +exit: + %cond = phi i1 [false, %entry], [%cmp1, %bb] + %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone + store float %result, float addrspace(1)* %gep.out, align 4 + ret void +} Index: test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll @@ -0,0 +1,364 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone +declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1) nounwind readnone +declare { double, i1 } @llvm.amdgcn.div.scale.f64(double, double, i1) nounwind readnone +declare float @llvm.fabs.f32(float) nounwind readnone + +; SI-LABEL @test_div_scale_f32_1: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: buffer_store_dword 
[[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_2: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f64_1: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 +; SI: v_div_scale_f64 
[[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f64_2: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f32_scalar_num_1: +; SI-DAG:
buffer_load_dword [[B:v[0-9]+]] +; SI-DAG: s_load_dword [[A:s[0-9]+]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr float, float addrspace(1)* %in, i32 %tid + + %b = load float, float addrspace(1)* %gep, align 4 + + %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_scalar_num_2: +; SI-DAG: buffer_load_dword [[B:v[0-9]+]] +; SI-DAG: s_load_dword [[A:s[0-9]+]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr float, float addrspace(1)* %in, i32 %tid + + %b = load float, float addrspace(1)* %gep, align 4 + + %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_scalar_den_1: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]] +; SI-DAG: s_load_dword [[B:s[0-9]+]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() 
nounwind readnone + %gep = getelementptr float, float addrspace(1)* %in, i32 %tid + + %a = load float, float addrspace(1)* %gep, align 4 + + %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_scalar_den_2: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]] +; SI-DAG: s_load_dword [[B:s[0-9]+]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr float, float addrspace(1)* %in, i32 %tid + + %a = load float, float addrspace(1)* %gep, align 4 + + %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f64_scalar_num_1: +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] +; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + + %b = load double, double addrspace(1)* %gep, align 8 + + %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store 
double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f64_scalar_num_2: +; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + + %b = load double, double addrspace(1)* %gep, align 8 + + %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f64_scalar_den_1: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] +; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + + %a = load double, double addrspace(1)* %gep, align 8 + + %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f64_scalar_den_2: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] +; SI-DAG: s_load_dwordx2 
[[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + + %a = load double, double addrspace(1)* %gep, align 8 + + %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f32_all_scalar_1: +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind { + %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_all_scalar_2: +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind { + %result = call { float, i1 } 
@llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f64_all_scalar_1: +; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 v[[VA_LO:[0-9]+]], s[[A_LO]] +; SI-DAG: v_mov_b32_e32 v[[VA_HI:[0-9]+]], s[[A_HI]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}} +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind { + %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f64_all_scalar_2: +; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 v[[VB_LO:[0-9]+]], s[[B_LO]] +; SI-DAG: v_mov_b32_e32 v[[VB_HI:[0-9]+]], s[[B_HI]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind { + %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f32_inline_imm_num: +; SI-DAG: 
buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[A]], 1.0 +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %a = load float, float addrspace(1)* %gep.0, align 4 + + %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 1.0, float %a, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_inline_imm_den: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], 2.0, 2.0, [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %a = load float, float addrspace(1)* %gep.0, align 4 + + %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float 2.0, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_fabs_num: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], |[[A]]| +; SI: buffer_store_dword [[RESULT0]] +; SI: 
s_endpgm +define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone + + %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a.fabs, float %b, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_fabs_den: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], |[[B]]|, |[[B]]|, [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone + + %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b.fabs, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} Index: test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll =================================================================== --- /dev/null +++ 
test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll @@ -0,0 +1,44 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare float @llvm.amdgcn.ldexp.f32(float, i32) nounwind readnone +declare double @llvm.amdgcn.ldexp.f64(double, i32) nounwind readnone + +declare float @llvm.AMDGPU.ldexp.f32(float, i32) nounwind readnone +declare double @llvm.AMDGPU.ldexp.f64(double, i32) nounwind readnone + +; SI-LABEL: {{^}}test_ldexp_f32: +; SI: v_ldexp_f32 +; SI: s_endpgm +define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind { + %result = call float @llvm.amdgcn.ldexp.f32(float %a, i32 %b) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_ldexp_f64: +; SI: v_ldexp_f64 +; SI: s_endpgm +define void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind { + %result = call double @llvm.amdgcn.ldexp.f64(double %a, i32 %b) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}test_legacy_ldexp_f32: +; SI: v_ldexp_f32 +; SI: s_endpgm +define void @test_legacy_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind { + %result = call float @llvm.AMDGPU.ldexp.f32(float %a, i32 %b) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_legacy_ldexp_f64: +; SI: v_ldexp_f64 +; SI: s_endpgm +define void @test_legacy_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind { + %result = call double @llvm.AMDGPU.ldexp.f64(double %a, i32 %b) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} Index: test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll @@ -0,0 +1,73 @@ +; RUN: 
llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; XUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; XUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s + +declare float @llvm.amdgcn.rcp.f32(float) #0 +declare double @llvm.amdgcn.rcp.f64(double) #0 + +declare double @llvm.sqrt.f64(double) #0 +declare float @llvm.sqrt.f32(float) #0 + + +; FUNC-LABEL: {{^}}rcp_f32: +; SI: v_rcp_f32_e32 +define void @rcp_f32(float addrspace(1)* %out, float %src) #1 { + %rcp = call float @llvm.amdgcn.rcp.f32(float %src) #0 + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rcp_pat_f32: + +; SI-SAFE: v_rcp_f32_e32 +; XSI-SAFE-SPDENORM-NOT: v_rcp_f32_e32 +define void @rcp_pat_f32(float addrspace(1)* %out, float %src) #1 { + %rcp = fdiv float 1.0, %src + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rsq_rcp_pat_f32: +; SI-UNSAFE: v_rsq_f32_e32 +; SI-SAFE: v_sqrt_f32_e32 +; SI-SAFE: v_rcp_f32_e32 +define void @rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #1 { + %sqrt = call float @llvm.sqrt.f32(float %src) #0 + %rcp = call float 
@llvm.amdgcn.rcp.f32(float %sqrt) #0 + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rcp_f64: +; SI: v_rcp_f64_e32 +define void @rcp_f64(double addrspace(1)* %out, double %src) #1 { + %rcp = call double @llvm.amdgcn.rcp.f64(double %src) #0 + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}rcp_pat_f64: +; SI: v_rcp_f64_e32 +define void @rcp_pat_f64(double addrspace(1)* %out, double %src) #1 { + %rcp = fdiv double 1.0, %src + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}rsq_rcp_pat_f64: +; SI-UNSAFE: v_rsq_f64_e32 +; SI-SAFE-NOT: v_rsq_f64_e32 +; SI-SAFE: v_sqrt_f64 +; SI-SAFE: v_rcp_f64 +define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 { + %sqrt = call double @llvm.sqrt.f64(double %src) #0 + %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt) #0 + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } Index: test/CodeGen/AMDGPU/llvm.amdgcn.read.workdim.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.read.workdim.ll @@ -0,0 +1,46 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA %s + +; GCN-LABEL: {{^}}read_workdim: +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] +define void @read_workdim(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.amdgcn.read.workdim() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: 
{{^}}read_workdim_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c +; GCN-NOT: 0xff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @read_workdim_known_bits(i32 addrspace(1)* %out) { +entry: + %dim = call i32 @llvm.amdgcn.read.workdim() #0 + %shl = shl i32 %dim, 24 + %shr = lshr i32 %shl, 24 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}legacy_read_workdim: +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] +define void @legacy_read_workdim(i32 addrspace(1)* %out) { +entry: + %dim = call i32 @llvm.AMDGPU.read.workdim() #0 + store i32 %dim, i32 addrspace(1)* %out + ret void +} + +declare i32 @llvm.amdgcn.read.workdim() #0 +declare i32 @llvm.AMDGPU.read.workdim() #0 + +attributes #0 = { nounwind readnone } Index: test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamped.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamped.ll @@ -0,0 +1,23 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s + +declare double @llvm.amdgcn.rsq.clamped.f64(double) nounwind readnone + +; FUNC-LABEL: {{^}}rsq_clamped_f64: +; SI: v_rsq_clamp_f64_e32 + +; VI: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[2:3] +; TODO: this constant should be folded: +; VI: s_mov_b32 s[[ALLBITS:[0-9]+]], -1 +; VI: s_mov_b32 s[[HIGH1:[0-9]+]], 0x7fefffff +; VI: s_mov_b32 s[[LOW1:[0-9]+]], s[[ALLBITS]] +; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] +; VI: s_mov_b32 s[[HIGH2:[0-9]+]], 0xffefffff +; VI: s_mov_b32 s[[LOW2:[0-9]+]], s[[ALLBITS]] +; VI: v_max_f64 v[0:1], 
v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]] + +define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind { + %rsq_clamped = call double @llvm.amdgcn.rsq.clamped.f64(double %src) nounwind readnone + store double %rsq_clamped, double addrspace(1)* %out, align 8 + ret void +} Index: test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll @@ -0,0 +1,60 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare float @llvm.amdgcn.rsq.f32(float) #0 +declare double @llvm.amdgcn.rsq.f64(double) #0 + +; FUNC-LABEL: {{^}}rsq_f32: +; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} +define void @rsq_f32(float addrspace(1)* %out, float %src) #1 { + %rsq = call float @llvm.amdgcn.rsq.f32(float %src) #0 + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} + +; TODO: Really these should be constant folded +; FUNC-LABEL: {{^}}rsq_f32_constant_4.0 +; SI: v_rsq_f32_e32 {{v[0-9]+}}, 4.0 +define void @rsq_f32_constant_4.0(float addrspace(1)* %out) #1 { + %rsq = call float @llvm.amdgcn.rsq.f32(float 4.0) #0 + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rsq_f32_constant_100.0 +; SI: v_rsq_f32_e32 {{v[0-9]+}}, 0x42c80000 +define void @rsq_f32_constant_100.0(float addrspace(1)* %out) #1 { + %rsq = call float @llvm.amdgcn.rsq.f32(float 100.0) #0 + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rsq_f64: +; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @rsq_f64(double addrspace(1)* %out, double %src) #1 { + %rsq = call double @llvm.amdgcn.rsq.f64(double %src) #0 + store double %rsq, double addrspace(1)* %out, align 4 + ret void +} + +; TODO: Really these should be 
constant folded +; FUNC-LABEL: {{^}}rsq_f64_constant_4.0 +; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, 4.0 +define void @rsq_f64_constant_4.0(double addrspace(1)* %out) #1 { + %rsq = call double @llvm.amdgcn.rsq.f64(double 4.0) #0 + store double %rsq, double addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rsq_f64_constant_100.0 +; SI-DAG: s_mov_b32 s{{[0-9]+}}, 0x40590000 +; SI-DAG: s_mov_b32 s{{[0-9]+}}, 0{{$}} +; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @rsq_f64_constant_100.0(double addrspace(1)* %out) #1 { + %rsq = call double @llvm.amdgcn.rsq.f64(double 100.0) #0 + store double %rsq, double addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } Index: test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare double @llvm.amdgcn.trig.preop.f64(double, i32) nounwind readnone + +; SI-LABEL: {{^}}test_trig_preop_f64: +; SI-DAG: buffer_load_dword [[SEG:v[0-9]+]] +; SI-DAG: buffer_load_dwordx2 [[SRC:v\[[0-9]+:[0-9]+\]]], +; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]] +; SI: buffer_store_dwordx2 [[RESULT]], +; SI: s_endpgm +define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %a = load double, double addrspace(1)* %aptr, align 8 + %b = load i32, i32 addrspace(1)* %bptr, align 4 + %result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 %b) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}test_trig_preop_f64_imm_segment: +; SI: buffer_load_dwordx2 
[[SRC:v\[[0-9]+:[0-9]+\]]], +; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7 +; SI: buffer_store_dwordx2 [[RESULT]], +; SI: s_endpgm +define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { + %a = load double, double addrspace(1)* %aptr, align 8 + %result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 7) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} Index: test/CodeGen/AMDGPU/llvm.r600.read.workdim.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.r600.read.workdim.ll @@ -0,0 +1,36 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s + +; EG-LABEL: {{^}}read_workdim: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[2].Z +define void @read_workdim(i32 addrspace(1)* %out) { +entry: + %dim = call i32 @llvm.r600.read.workdim() #0 + store i32 %dim, i32 addrspace(1)* %out + ret void +} + +; EG-LABEL: {{^}}read_workdim_known_bits: +define void @read_workdim_known_bits(i32 addrspace(1)* %out) { +entry: + %dim = call i32 @llvm.r600.read.workdim() #0 + %shl = shl i32 %dim, 24 + %shr = lshr i32 %shl, 24 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +; EG-LABEL: {{^}}legacy_read_workdim: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV * [[VAL]], KC0[2].Z +define void @legacy_read_workdim(i32 addrspace(1)* %out) { +entry: + %dim = call i32 @llvm.AMDGPU.read.workdim() #0 + store i32 %dim, i32 addrspace(1)* %out + ret void +} + +declare i32 @llvm.r600.read.workdim() #0 +declare i32 @llvm.AMDGPU.read.workdim() #0 + +attributes #0 = { nounwind readnone } Index: test/CodeGen/AMDGPU/pv.ll =================================================================== --- test/CodeGen/AMDGPU/pv.ll +++ test/CodeGen/AMDGPU/pv.ll @@ -103,7 +103,7 @@ %95 = insertelement <4 x float> %94, float 0.000000e+00, i32 3 %96 = call float 
@llvm.AMDGPU.dp4(<4 x float> %91, <4 x float> %95) %97 = call float @fabs(float %96) - %98 = call float @llvm.AMDGPU.rsq.f32(float %97) + %98 = call float @llvm.AMDGPU.rsq.clamped.f32(float %97) %99 = fmul float %4, %98 %100 = fmul float %5, %98 %101 = fmul float %6, %98 @@ -119,10 +119,10 @@ %111 = extractelement <4 x float> %110, i32 2 %112 = fmul float %111, %10 %113 = fadd float %112, %22 - %114 = call float @llvm.AMDIL.clamp.(float %105, float 0.000000e+00, float 1.000000e+00) - %115 = call float @llvm.AMDIL.clamp.(float %109, float 0.000000e+00, float 1.000000e+00) - %116 = call float @llvm.AMDIL.clamp.(float %113, float 0.000000e+00, float 1.000000e+00) - %117 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00) + %114 = call float @llvm.AMDGPU.clamp.f32(float %105, float 0.000000e+00, float 1.000000e+00) + %115 = call float @llvm.AMDGPU.clamp.f32(float %109, float 0.000000e+00, float 1.000000e+00) + %116 = call float @llvm.AMDGPU.clamp.f32(float %113, float 0.000000e+00, float 1.000000e+00) + %117 = call float @llvm.AMDGPU.clamp.f32(float %15, float 0.000000e+00, float 1.000000e+00) %118 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) %119 = extractelement <4 x float> %118, i32 0 %120 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) @@ -202,9 +202,9 @@ %194 = fadd float %193, %188 %195 = fmul float %181, %174 %196 = fadd float %195, %190 - %197 = call float @llvm.AMDIL.clamp.(float %192, float 0.000000e+00, float 1.000000e+00) - %198 = call float @llvm.AMDIL.clamp.(float %194, float 0.000000e+00, float 1.000000e+00) - %199 = call float @llvm.AMDIL.clamp.(float %196, float 0.000000e+00, float 1.000000e+00) + %197 = call float @llvm.AMDGPU.clamp.f32(float %192, float 0.000000e+00, float 1.000000e+00) + %198 = call float @llvm.AMDGPU.clamp.f32(float %194, 
float 0.000000e+00, float 1.000000e+00) + %199 = call float @llvm.AMDGPU.clamp.f32(float %196, float 0.000000e+00, float 1.000000e+00) %200 = insertelement <4 x float> undef, float %75, i32 0 %201 = insertelement <4 x float> %200, float %79, i32 1 %202 = insertelement <4 x float> %201, float %83, i32 2 @@ -225,10 +225,10 @@ declare float @fabs(float) #2 ; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq.f32(float) #1 +declare float @llvm.AMDGPU.rsq.clamped.f32(float) #1 ; Function Attrs: readnone -declare float @llvm.AMDIL.clamp.(float, float, float) #1 +declare float @llvm.AMDGPU.clamp.f32(float, float, float) #1 ; Function Attrs: nounwind readonly declare float @llvm.pow.f32(float, float) #3 Index: test/CodeGen/AMDGPU/rcp-pattern.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/rcp-pattern.ll @@ -0,0 +1,11 @@ +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG-SAFE -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FIXME: Evergreen only ever does unsafe fp math. 
+; FUNC-LABEL: {{^}}rcp_pat_f32: +; EG: RECIP_IEEE +define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { + %rcp = fdiv float 1.0, %src + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} Index: test/CodeGen/AMDGPU/sgpr-copy.ll =================================================================== --- test/CodeGen/AMDGPU/sgpr-copy.ll +++ test/CodeGen/AMDGPU/sgpr-copy.ll @@ -71,7 +71,7 @@ %55 = fadd float %54, %53 %56 = fmul float %45, %45 %57 = fadd float %55, %56 - %58 = call float @llvm.AMDGPU.rsq.f32(float %57) + %58 = call float @llvm.amdgcn.rsq.f32(float %57) %59 = fmul float %43, %58 %60 = fmul float %44, %58 %61 = fmul float %45, %58 @@ -213,7 +213,7 @@ declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1 ; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq.f32(float) #3 +declare float @llvm.amdgcn.rsq.f32(float) #3 ; Function Attrs: readnone declare float @llvm.AMDIL.exp.(float) #3 Index: test/CodeGen/AMDGPU/si-sgpr-spill.ll =================================================================== --- test/CodeGen/AMDGPU/si-sgpr-spill.ll +++ test/CodeGen/AMDGPU/si-sgpr-spill.ll @@ -215,7 +215,7 @@ %198 = fadd float %197, %196 %199 = fmul float %97, %97 %200 = fadd float %198, %199 - %201 = call float @llvm.AMDGPU.rsq.f32(float %200) + %201 = call float @llvm.amdgcn.rsq.f32(float %200) %202 = fmul float %95, %201 %203 = fmul float %96, %201 %204 = fmul float %202, %29 @@ -396,7 +396,7 @@ %355 = fadd float %354, %353 %356 = fmul float %352, %352 %357 = fadd float %355, %356 - %358 = call float @llvm.AMDGPU.rsq.f32(float %357) + %358 = call float @llvm.amdgcn.rsq.f32(float %357) %359 = fmul float %350, %358 %360 = fmul float %351, %358 %361 = fmul float %352, %358 @@ -524,7 +524,7 @@ %483 = fadd float %482, %481 %484 = fmul float %109, %109 %485 = fadd float %483, %484 - %486 = call float @llvm.AMDGPU.rsq.f32(float %485) + %486 = call float @llvm.amdgcn.rsq.f32(float %485) %487 = fmul float %107, 
%486 %488 = fmul float %108, %486 %489 = fmul float %109, %486 @@ -553,7 +553,7 @@ %512 = fadd float %511, %510 %513 = fmul float %97, %97 %514 = fadd float %512, %513 - %515 = call float @llvm.AMDGPU.rsq.f32(float %514) + %515 = call float @llvm.amdgcn.rsq.f32(float %514) %516 = fmul float %95, %515 %517 = fmul float %96, %515 %518 = fmul float %97, %515 @@ -670,7 +670,7 @@ declare float @ceil(float) #3 ; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq.f32(float) #2 +declare float @llvm.amdgcn.rsq.f32(float) #2 ; Function Attrs: nounwind readnone declare <4 x float> @llvm.SI.sampled.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32) #1 @@ -899,7 +899,7 @@ %212 = fadd float %211, %210 %213 = fmul float %209, %209 %214 = fadd float %212, %213 - %215 = call float @llvm.AMDGPU.rsq.f32(float %214) + %215 = call float @llvm.amdgcn.rsq.f32(float %214) %216 = fmul float %205, %215 %217 = fmul float %207, %215 %218 = fmul float %209, %215 @@ -1135,7 +1135,7 @@ %434 = fsub float -0.000000e+00, %433 %435 = fadd float 0x3FF00068E0000000, %434 %436 = call float @llvm.AMDIL.clamp.(float %435, float 0.000000e+00, float 1.000000e+00) - %437 = call float @llvm.AMDGPU.rsq.f32(float %436) + %437 = call float @llvm.amdgcn.rsq.f32(float %436) %438 = fmul float %437, %436 %439 = fsub float -0.000000e+00, %436 %440 = call float @llvm.AMDGPU.cndlt(float %439, float %438, float 0.000000e+00) @@ -1159,7 +1159,7 @@ %458 = fadd float %457, %456 %459 = fmul float %455, %455 %460 = fadd float %458, %459 - %461 = call float @llvm.AMDGPU.rsq.f32(float %460) + %461 = call float @llvm.amdgcn.rsq.f32(float %460) %462 = fmul float %451, %461 %463 = fmul float %453, %461 %464 = fmul float %455, %461 @@ -1269,7 +1269,7 @@ %559 = fadd float %558, %557 %560 = fmul float %556, %556 %561 = fadd float %559, %560 - %562 = call float @llvm.AMDGPU.rsq.f32(float %561) + %562 = call float @llvm.amdgcn.rsq.f32(float %561) %563 = fmul float %562, %561 %564 = fsub float -0.000000e+00, %561 %565 = call float 
@llvm.AMDGPU.cndlt(float %564, float %563, float 0.000000e+00) Index: test/Transforms/InstCombine/amdgcn-intrinsics.ll =================================================================== --- /dev/null +++ test/Transforms/InstCombine/amdgcn-intrinsics.ll @@ -0,0 +1,47 @@ +; RUN: opt -instcombine -S < %s | FileCheck %s + +declare float @llvm.amdgcn.rcp.f32(float) nounwind readnone +declare double @llvm.amdgcn.rcp.f64(double) nounwind readnone + +; CHECK-LABEL: @test_constant_fold_rcp_f32_1 +; CHECK-NEXT: ret float 1.000000e+00 +define float @test_constant_fold_rcp_f32_1() nounwind { + %val = call float @llvm.amdgcn.rcp.f32(float 1.0) nounwind readnone + ret float %val +} + +; CHECK-LABEL: @test_constant_fold_rcp_f64_1 +; CHECK-NEXT: ret double 1.000000e+00 +define double @test_constant_fold_rcp_f64_1() nounwind { + %val = call double @llvm.amdgcn.rcp.f64(double 1.0) nounwind readnone + ret double %val +} + +; CHECK-LABEL: @test_constant_fold_rcp_f32_half +; CHECK-NEXT: ret float 2.000000e+00 +define float @test_constant_fold_rcp_f32_half() nounwind { + %val = call float @llvm.amdgcn.rcp.f32(float 0.5) nounwind readnone + ret float %val +} + +; CHECK-LABEL: @test_constant_fold_rcp_f64_half +; CHECK-NEXT: ret double 2.000000e+00 +define double @test_constant_fold_rcp_f64_half() nounwind { + %val = call double @llvm.amdgcn.rcp.f64(double 0.5) nounwind readnone + ret double %val +} + +; CHECK-LABEL: @test_constant_fold_rcp_f32_43 +; CHECK-NEXT: call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) +define float @test_constant_fold_rcp_f32_43() nounwind { + %val = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) nounwind readnone + ret float %val +} + +; CHECK-LABEL: @test_constant_fold_rcp_f64_43 +; CHECK-NEXT: call double @llvm.amdgcn.rcp.f64(double 4.300000e+01) +define double @test_constant_fold_rcp_f64_43() nounwind { + %val = call double @llvm.amdgcn.rcp.f64(double 4.300000e+01) nounwind readnone + ret double %val +} + Index: 
test/Transforms/InstCombine/r600-intrinsics.ll =================================================================== --- test/Transforms/InstCombine/r600-intrinsics.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: opt -instcombine -S < %s | FileCheck %s - -declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone -declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone - -; CHECK-LABEL: @test_constant_fold_rcp_f32_1 -; CHECK-NEXT: ret float 1.000000e+00 -define float @test_constant_fold_rcp_f32_1() nounwind { - %val = call float @llvm.AMDGPU.rcp.f32(float 1.0) nounwind readnone - ret float %val -} - -; CHECK-LABEL: @test_constant_fold_rcp_f64_1 -; CHECK-NEXT: ret double 1.000000e+00 -define double @test_constant_fold_rcp_f64_1() nounwind { - %val = call double @llvm.AMDGPU.rcp.f64(double 1.0) nounwind readnone - ret double %val -} - -; CHECK-LABEL: @test_constant_fold_rcp_f32_half -; CHECK-NEXT: ret float 2.000000e+00 -define float @test_constant_fold_rcp_f32_half() nounwind { - %val = call float @llvm.AMDGPU.rcp.f32(float 0.5) nounwind readnone - ret float %val -} - -; CHECK-LABEL: @test_constant_fold_rcp_f64_half -; CHECK-NEXT: ret double 2.000000e+00 -define double @test_constant_fold_rcp_f64_half() nounwind { - %val = call double @llvm.AMDGPU.rcp.f64(double 0.5) nounwind readnone - ret double %val -} - -; CHECK-LABEL: @test_constant_fold_rcp_f32_43 -; CHECK-NEXT: call float @llvm.AMDGPU.rcp.f32(float 4.300000e+01) -define float @test_constant_fold_rcp_f32_43() nounwind { - %val = call float @llvm.AMDGPU.rcp.f32(float 4.300000e+01) nounwind readnone - ret float %val -} - -; CHECK-LABEL: @test_constant_fold_rcp_f64_43 -; CHECK-NEXT: call double @llvm.AMDGPU.rcp.f64(double 4.300000e+01) -define double @test_constant_fold_rcp_f64_43() nounwind { - %val = call double @llvm.AMDGPU.rcp.f64(double 4.300000e+01) nounwind readnone - ret double %val -} -