Index: lib/Target/R600/SIInstructions.td
===================================================================
--- lib/Target/R600/SIInstructions.td
+++ lib/Target/R600/SIInstructions.td
@@ -1374,12 +1374,12 @@
 >;
 } // End isCommutable = 1
 
+let isCommutable = 1 in {
+
 defm V_MAC_LEGACY_F32 : VOP2Inst <vop2<0x6>, "V_MAC_LEGACY_F32",
   VOP_F32_F32_F32
 >;
 
-let isCommutable = 1 in {
-
 defm V_MUL_LEGACY_F32 : VOP2Inst <vop2<0x7>, "V_MUL_LEGACY_F32",
   VOP_F32_F32_F32, int_AMDGPU_mul
 >;
@@ -1388,7 +1388,6 @@
   VOP_F32_F32_F32, fmul
 >;
 
-
 defm V_MUL_I32_I24 : VOP2Inst <vop2<0x9>, "V_MUL_I32_I24",
   VOP_I32_I32_I32, AMDGPUmul_i24
 >;
@@ -1449,9 +1448,18 @@
 defm V_BFM_B32 : VOP2Inst <vop2<0x1e>, "V_BFM_B32", VOP_I32_I32_I32,
   AMDGPUbfm>;
+
+let isCommutable = 1 in {
 defm V_MAC_F32 : VOP2Inst <vop2<0x1f>, "V_MAC_F32", VOP_F32_F32_F32>;
+} // End isCommutable = 1
+
 defm V_MADMK_F32 : VOP2Inst <vop2<0x20>, "V_MADMK_F32", VOP_F32_F32_F32>;
+
+let isCommutable = 1 in {
 defm V_MADAK_F32 : VOP2Inst <vop2<0x21>, "V_MADAK_F32", VOP_F32_F32_F32>;
+} // End isCommutable = 1
+
+
 defm V_BCNT_U32_B32 : VOP2Inst <vop2<0x22>, "V_BCNT_U32_B32", VOP_I32_I32_I32>;
 defm V_MBCNT_LO_U32_B32 : VOP2Inst <vop2<0x23>, "V_MBCNT_LO_U32_B32",
   VOP_I32_I32_I32
@@ -1503,18 +1511,22 @@
 // VOP3 Instructions
 //===----------------------------------------------------------------------===//
 
+let isCommutable = 1 in {
 defm V_MAD_LEGACY_F32 : VOP3Inst <vop3<0x140>, "V_MAD_LEGACY_F32",
   VOP_F32_F32_F32_F32
 >;
+
 defm V_MAD_F32 : VOP3Inst <vop3<0x141>, "V_MAD_F32",
   VOP_F32_F32_F32_F32, fmad
 >;
+
 defm V_MAD_I32_I24 : VOP3Inst <vop3<0x142>, "V_MAD_I32_I24",
   VOP_I32_I32_I32_I32, AMDGPUmad_i24
 >;
 defm V_MAD_U32_U24 : VOP3Inst <vop3<0x143>, "V_MAD_U32_U24",
   VOP_I32_I32_I32_I32, AMDGPUmad_u24
 >;
+} // End isCommutable = 1
 
 defm V_CUBEID_F32 : VOP3Inst <vop3<0x144>, "V_CUBEID_F32",
   VOP_F32_F32_F32_F32
@@ -1537,12 +1549,16 @@
 defm V_BFI_B32 : VOP3Inst <vop3<0x14a>, "V_BFI_B32",
   VOP_I32_I32_I32_I32, AMDGPUbfi
 >;
+
+let isCommutable = 1 in {
 defm V_FMA_F32 : VOP3Inst <vop3<0x14b>, "V_FMA_F32",
   VOP_F32_F32_F32_F32, fma
 >;
 defm V_FMA_F64 : VOP3Inst <vop3<0x14c>, "V_FMA_F64",
   VOP_F64_F64_F64_F64, fma
 >;
+} // End isCommutable = 1
+
 //def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>;
 defm V_ALIGNBIT_B32 : VOP3Inst <vop3<0x14e>, "V_ALIGNBIT_B32",
   VOP_I32_I32_I32_I32
@@ -1628,12 +1644,15 @@
 // Double precision division pre-scale.
 defm V_DIV_SCALE_F64 : VOP3b_64 <vop3b<0x16e>, "V_DIV_SCALE_F64", []>;
 
+let isCommutable = 1 in {
 defm V_DIV_FMAS_F32 : VOP3Inst <vop3<0x16f>, "V_DIV_FMAS_F32",
   VOP_F32_F32_F32_F32, AMDGPUdiv_fmas
 >;
 defm V_DIV_FMAS_F64 : VOP3Inst <vop3<0x170>, "V_DIV_FMAS_F64",
   VOP_F64_F64_F64_F64, AMDGPUdiv_fmas
 >;
+} // End isCommutable = 1
+
 //def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>;
 //def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>;
 //def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>;
@@ -2848,6 +2867,8 @@
 defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "V_MQSAD_U32_U8",
   VOP_I32_I32_I32
 >;
+
+let isCommutable = 1 in {
 defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "V_MAD_U64_U32",
   VOP_I64_I32_I32_I64
 >;
@@ -2856,6 +2877,7 @@
 defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "V_MAD_I64_I32",
   VOP_I64_I32_I32_I64
 >;
+} // End isCommutable = 1
 
 // Remaining instructions:
 // FLAT_*
Index: test/CodeGen/R600/fma.ll
===================================================================
--- test/CodeGen/R600/fma.ll
+++ test/CodeGen/R600/fma.ll
@@ -5,19 +5,21 @@
 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
 
-; FUNC-LABEL: {{^}}fma_f32:
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: {{^}}fma_f32
 ; SI: V_FMA_F32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
 ; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}},
 ; EG: FMA {{\*? *}}[[RES]]
 define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                      float addrspace(1)* %in2, float addrspace(1)* %in3) {
-   %r0 = load float addrspace(1)* %in1
-   %r1 = load float addrspace(1)* %in2
-   %r2 = load float addrspace(1)* %in3
-   %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2)
-   store float %r3, float addrspace(1)* %out
-   ret void
+  %r0 = load float addrspace(1)* %in1
+  %r1 = load float addrspace(1)* %in2
+  %r2 = load float addrspace(1)* %in3
+  %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2)
+  store float %r3, float addrspace(1)* %out
+  ret void
 }
 
 ; FUNC-LABEL: {{^}}fma_v2f32:
@@ -29,12 +31,12 @@
 ; EG-DAG: FMA {{\*? *}}[[RES]].[[CHHI]]
 define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
                        <2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) {
-   %r0 = load <2 x float> addrspace(1)* %in1
-   %r1 = load <2 x float> addrspace(1)* %in2
-   %r2 = load <2 x float> addrspace(1)* %in3
-   %r3 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2)
-   store <2 x float> %r3, <2 x float> addrspace(1)* %out
-   ret void
+  %r0 = load <2 x float> addrspace(1)* %in1
+  %r1 = load <2 x float> addrspace(1)* %in2
+  %r2 = load <2 x float> addrspace(1)* %in3
+  %r3 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2)
+  store <2 x float> %r3, <2 x float> addrspace(1)* %out
+  ret void
 }
 
 ; FUNC-LABEL: {{^}}fma_v4f32:
@@ -50,10 +52,41 @@
 ; EG-DAG: FMA {{\*? *}}[[RES]].W
 define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
                        <4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) {
-   %r0 = load <4 x float> addrspace(1)* %in1
-   %r1 = load <4 x float> addrspace(1)* %in2
-   %r2 = load <4 x float> addrspace(1)* %in3
-   %r3 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %r0, <4 x float> %r1, <4 x float> %r2)
-   store <4 x float> %r3, <4 x float> addrspace(1)* %out
-   ret void
+  %r0 = load <4 x float> addrspace(1)* %in1
+  %r1 = load <4 x float> addrspace(1)* %in2
+  %r2 = load <4 x float> addrspace(1)* %in3
+  %r3 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %r0, <4 x float> %r1, <4 x float> %r2)
+  store <4 x float> %r3, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @fma_commute_mul_inline_imm_f32
+; SI: V_FMA_F32 {{v[0-9]+}}, 2.0, {{v[0-9]+}}, {{v[0-9]+}}
+define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
+  %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
+  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %a = load float addrspace(1)* %in.a.gep, align 4
+  %b = load float addrspace(1)* %in.b.gep, align 4
+
+  %fma = call float @llvm.fma.f32(float %a, float 2.0, float %b)
+  store float %fma, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; FUNC-LABEL: @fma_commute_mul_s_f32
+define void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
+  %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
+  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %a = load float addrspace(1)* %in.a.gep, align 4
+  %c = load float addrspace(1)* %in.b.gep, align 4
+
+  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
+  store float %fma, float addrspace(1)* %out.gep, align 4
+  ret void
 }
Index: test/CodeGen/R600/fmuladd.ll
===================================================================
--- test/CodeGen/R600/fmuladd.ll
+++ test/CodeGen/R600/fmuladd.ll
@@ -116,7 +116,7 @@
 ; CHECK-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32
 ; CHECK-DAG: BUFFER_LOAD_DWORD [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; CHECK-DAG: BUFFER_LOAD_DWORD [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
-; CHECK: V_MAD_F32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
+; CHECK: V_MAD_F32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
 ; CHECK: BUFFER_STORE_DWORD [[RESULT]]
 define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
@@ -158,7 +158,7 @@
 ; CHECK-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32
 ; CHECK-DAG: BUFFER_LOAD_DWORD [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; CHECK-DAG: BUFFER_LOAD_DWORD [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
-; CHECK: V_MAD_F32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
+; CHECK: V_MAD_F32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
 ; CHECK: BUFFER_STORE_DWORD [[RESULT]]
 define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
Index: test/CodeGen/R600/llvm.AMDGPU.umad24.ll
===================================================================
--- test/CodeGen/R600/llvm.AMDGPU.umad24.ll
+++ test/CodeGen/R600/llvm.AMDGPU.umad24.ll
@@ -5,6 +5,7 @@
 ; XUN: llc -march=r600 -mcpu=rv770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 declare i32 @llvm.AMDGPU.umad24(i32, i32, i32) nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
 ; FUNC-LABEL: {{^}}test_umad24:
 ; SI: V_MAD_U32_U24
@@ -17,3 +18,21 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}commute_umad24:
+; SI-DAG: BUFFER_LOAD_DWORD [[SRC0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: BUFFER_LOAD_DWORD [[SRC2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: V_MAD_U32_U24 [[RESULT:v[0-9]+]], 4, [[SRC0]], [[SRC2]]
+; SI: BUFFER_STORE_DWORD [[RESULT]]
+define void @commute_umad24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %out.gep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %src0.gep = getelementptr i32 addrspace(1)* %in, i32 %tid
+  %src2.gep = getelementptr i32 addrspace(1)* %src0.gep, i32 1
+
+  %src0 = load i32 addrspace(1)* %src0.gep, align 4
+  %src2 = load i32 addrspace(1)* %src2.gep, align 4
+  %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 4, i32 %src2) nounwind readnone
+  store i32 %mad, i32 addrspace(1)* %out.gep, align 4
+  ret void
+}
+
Index: test/CodeGen/R600/use-sgpr-multiple-times.ll
===================================================================
--- test/CodeGen/R600/use-sgpr-multiple-times.ll
+++ test/CodeGen/R600/use-sgpr-multiple-times.ll
@@ -73,7 +73,7 @@
 ; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_imm_a:
 ; SI: S_LOAD_DWORD [[SGPR:s[0-9]+]]
-; SI: V_FMA_F32 [[RESULT:v[0-9]+]], [[SGPR]], 2.0, [[SGPR]]
+; SI: V_FMA_F32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]]
 ; SI: BUFFER_STORE_DWORD [[RESULT]]
 define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1
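
A note on the TableGen idiom this patch relies on in the SIInstructions.td hunks: a top-level "let ... in { ... }" block overrides the named field on every definition it encloses, which is how the patch flips isCommutable on whole groups of V_MAD_*/V_FMA_* definitions at once. Marking an instruction commutable lets the two multiplied operands be swapped, which in turn lets an inline immediate such as 2.0 or -2.0 be folded into the src0 slot, as the updated V_MAD_F32 and V_FMA_F32 CHECK lines reflect. Below is a minimal standalone sketch of the mechanism; ExampleInst, V_EXAMPLE_MAD, and V_EXAMPLE_FMA are hypothetical names for illustration only, not records from the LLVM tree.

// commutable_sketch.td -- check with: llvm-tblgen commutable_sketch.td
class ExampleInst<string name> {
  string Mnemonic = name;
  bit isCommutable = 0;   // class default: operand swap not allowed
}

// Every def inside the block gets isCommutable = 1, overriding the
// class default -- the same mechanism the hunks above use.
let isCommutable = 1 in {
  def V_EXAMPLE_MAD : ExampleInst<"v_example_mad">;
  def V_EXAMPLE_FMA : ExampleInst<"v_example_fma">;
} // End isCommutable = 1

Running llvm-tblgen on the sketch with its default action (--print-records) dumps the final records; both defs show bit isCommutable = 1, confirming the outer let took effect.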