Index: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td +++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td @@ -206,7 +206,7 @@ []>; def int_amdgcn_buffer_load_format : Intrinsic < - [llvm_v4f32_ty], + [llvm_anyfloat_ty], [llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(SGPR/VGPR/imm) Index: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td +++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td @@ -2108,45 +2108,52 @@ //===----------------------------------------------------------------------===// // buffer_load/store_format patterns //===----------------------------------------------------------------------===// -def : Pat< - (int_amdgcn_buffer_load_format v4i32:$rsrc, 0, - (MUBUFIntrinsicOffset i32:$soffset, - i16:$offset), - imm:$glc, imm:$slc), - (BUFFER_LOAD_FORMAT_XYZW_OFFSET $rsrc, $soffset, (as_i16imm $offset), - (as_i1imm $glc), (as_i1imm $slc), 0) ->; -def : Pat< - (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$vindex, - (MUBUFIntrinsicOffset i32:$soffset, - i16:$offset), - imm:$glc, imm:$slc), - (BUFFER_LOAD_FORMAT_XYZW_IDXEN $vindex, $rsrc, $soffset, (as_i16imm $offset), - (as_i1imm $glc), (as_i1imm $slc), 0) ->; +multiclass MUBUF_LoadIntrinsicPat { + def : Pat< + (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, 0, + (MUBUFIntrinsicOffset i32:$soffset, + i16:$offset), + imm:$glc, imm:$slc)), + (!cast(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; -def : Pat< - (int_amdgcn_buffer_load_format v4i32:$rsrc, 0, - (MUBUFIntrinsicVOffset i32:$soffset, - i16:$offset, - i32:$voffset), - imm:$glc, imm:$slc), - (BUFFER_LOAD_FORMAT_XYZW_OFFEN $voffset, $rsrc, $soffset, (as_i16imm $offset), - (as_i1imm $glc), (as_i1imm $slc), 0) ->; + def : Pat< + (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicOffset i32:$soffset, + i16:$offset), + imm:$glc, imm:$slc)), + (!cast(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; -def : Pat< - (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$vindex, - (MUBUFIntrinsicVOffset i32:$soffset, - i16:$offset, - i32:$voffset), - imm:$glc, imm:$slc), - (BUFFER_LOAD_FORMAT_XYZW_BOTHEN - (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), - $rsrc, $soffset, (as_i16imm $offset), - (as_i1imm $glc), (as_i1imm $slc), 0) ->; + def : Pat< + (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, 0, + (MUBUFIntrinsicVOffset i32:$soffset, + i16:$offset, + i32:$voffset), + imm:$glc, imm:$slc)), + (!cast(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicVOffset i32:$soffset, + i16:$offset, + i32:$voffset), + imm:$glc, imm:$slc)), + (!cast(opcode # _BOTHEN) + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; +} + +defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; def : Pat< (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, 0, Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll @@ -8,9 +8,9 @@ ;CHECK: s_waitcnt define {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) #0 { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0) - %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0) - %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 0, i1 0, i1 1) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0) + %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0) + %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 1) %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0 %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1 %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2 @@ -22,7 +22,7 @@ ;CHECK: s_waitcnt define <4 x float> @buffer_load_immoffs(<4 x i32> inreg) #0 { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0) ret <4 x float> %data } @@ -35,9 +35,9 @@ ;CHECK: s_waitcnt define <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) #0 { main_body: - %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 4156, i1 0, i1 0) - %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 36860, i1 0, i1 0) - %d.2 = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 36864, i1 0, i1 0) + %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4156, i1 0, i1 0) + %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36860, i1 0, i1 0) + %d.2 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36864, i1 0, i1 0) %d.3 = fadd <4 x float> %d.0, %d.1 %data = fadd <4 x float> %d.2, %d.3 ret <4 x float> %data @@ -51,8 +51,8 @@ ;CHECK: s_waitcnt define <4 x float> @buffer_load_immoffs_reuse(<4 x i32> inreg) #0 { main_body: - %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 4160, i1 0, i1 0) - %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 4176, i1 0, i1 0) + %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4160, i1 0, i1 0) + %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4176, i1 0, i1 0) %data = fadd <4 x float> %d.0, %d.1 ret <4 x float> %data } @@ -62,7 +62,7 @@ ;CHECK: s_waitcnt define <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) #0 { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0) ret <4 x float> %data } @@ -71,7 +71,7 @@ ;CHECK: s_waitcnt define <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) #0 { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0) ret <4 x float> %data } @@ -81,7 +81,7 @@ define <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) #0 { main_body: %ofs = add i32 %1, 58 - %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0) ret <4 x float> %data } @@ -90,7 +90,7 @@ ;CHECK: s_waitcnt define <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) #0 { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0) ret <4 x float> %data } @@ -100,11 +100,31 @@ ;CHECK: s_waitcnt define <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) #0 { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0) ret <4 x float> %data } -declare <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32>, i32, i32, i1, i1) #1 +;CHECK-LABEL: {{^}}buffer_load_x: +;CHECK: buffer_load_format_x v0, s[0:3], 0 +;CHECK: s_waitcnt +define float @buffer_load_x(<4 x i32> inreg %rsrc) #0 { +main_body: + %data = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) + ret float %data +} + +;CHECK-LABEL: {{^}}buffer_load_xy: +;CHECK: buffer_load_format_xy v[0:1], s[0:3], 0 +;CHECK: s_waitcnt +define <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) #0 { +main_body: + %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) + ret <2 x float> %data +} + +declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #1 +declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1 attributes #0 = { "ShaderType"="0" } attributes #1 = { nounwind readonly } Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll @@ -65,13 +65,13 @@ define void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) #0 { main_body: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0) - %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0) call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i1 0, i1 0) ret void } declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32>, i32, i32, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #2 attributes #0 = { "ShaderType"="0" } attributes #1 = { nounwind }