Index: lib/Target/AMDGPU/SIPeepholeSDWA.cpp
===================================================================
--- lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -698,7 +698,23 @@
       ValSrc = Src0;
     }
 
-    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
+    if (!Imm)
+      break;
+
+    SdwaSel Msk;
+    if (*Imm == 0x0ffff)
+      Msk = WORD_0;
+    else if (*Imm == 0x0ffff0000 || *Imm == -65536)
+      Msk = WORD_1;
+    else if (*Imm == 0x0ff)
+      Msk = BYTE_0;
+    else if (*Imm == 0x0ff00)
+      Msk = BYTE_1;
+    else if (*Imm == 0x0ff0000)
+      Msk = BYTE_2;
+    else if (*Imm == 0x0ff000000 || *Imm == -16777216)
+      Msk = BYTE_3;
+    else
       break;
 
     MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
@@ -707,8 +723,7 @@
         TRI->isPhysicalRegister(Dst->getReg()))
       break;
 
-    return make_unique<SDWASrcOperand>(
-        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
+    return make_unique<SDWASrcOperand>(ValSrc, Dst, Msk);
   }
 
   case AMDGPU::V_OR_B32_e32:
Index: test/CodeGen/AMDGPU/add.v2i16.ll
===================================================================
--- test/CodeGen/AMDGPU/add.v2i16.ll
+++ test/CodeGen/AMDGPU/add.v2i16.ll
@@ -125,10 +125,10 @@
 
 ; VI: flat_load_dword
 ; VI-NOT: v_add_u16
-; VI: v_and_b32_e32 v{{[0-9]+}}, 0xffff0000,
 ; VI: v_add_u16_e32 v{{[0-9]+}}, 32, v{{[0-9]+}}
 ; VI-NOT: v_add_u16
-; VI: v_or_b32_e32
+; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
 define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -260,7 +260,8 @@
 ; GFX9: v_pk_add_u16
 ; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
 
-; VI: v_add_u16_sdwa
+; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+
 ; VI: v_add_u16_e32
 
 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
===================================================================
--- test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -209,12 +209,15 @@
 ; GCN-LABEL: {{^}}v_insertelement_v2i16_0:
 ; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
 
-; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
-; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e7, [[ELT1]]
+; CI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
+; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e7, [[ELT1]]
 
 ; GFX9-DAG: s_movk_i32 [[ELT0:s[0-9]+]], 0x3e7{{$}}
 ; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}}
 ; GFX9: v_bfi_b32 [[RES:v[0-9]+]], [[MASK]], [[ELT0]], [[VEC]]
+
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], v{{[0-9]}}, v{{[0-9]}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
 define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -231,14 +234,16 @@
 ; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
 ; GCN-DAG: s_load_dword [[ELT0:s[0-9]+]]
 
-; CIVI-DAG: s_lshr_b32 [[ELT0_SHIFT:s[0-9]+]], [[ELT0]], 16
-; CIVI-DAG: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
-; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], [[ELT0_SHIFT]], [[ELT1]]
+; CI-DAG: s_lshr_b32 [[ELT0_SHIFT:s[0-9]+]], [[ELT0]], 16
+; CI-DAG: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
+; CI: v_or_b32_e32 [[RES:v[0-9]+]], [[ELT0_SHIFT]], [[ELT1]]
 
 ; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0000{{$}}
 ; GFX9-DAG: v_lshrrev_b32_e64 [[ELT0_SHIFT:v[0-9]+]], 16, [[ELT0]]
 ; GFX9: v_and_or_b32 [[RES:v[0-9]+]], [[VEC]], [[MASK]], [[ELT0_SHIFT]]
 
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], v{{[0-9]}}, v{{[0-9]}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
 define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -256,12 +261,14 @@
 ; GCN-LABEL: {{^}}v_insertelement_v2i16_0_inlineimm:
 ; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
 
-; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
-; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 53, [[ELT1]]
+; CI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
+; CI: v_or_b32_e32 [[RES:v[0-9]+]], 53, [[ELT1]]
 
 ; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}}
 ; GFX9: v_bfi_b32 [[RES:v[0-9]+]], [[MASK]], 53, [[VEC]]
 
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], v{{[0-9]}}, v{{[0-9]}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
 define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -323,13 +330,15 @@
 ; GCN-LABEL: {{^}}v_insertelement_v2f16_0:
 ; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
 
-; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
-; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x4500, [[ELT1]]
+; CI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
+; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x4500, [[ELT1]]
 
 ; GFX9-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 0x4500{{$}}
 ; GFX9-DAG: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VEC]]
 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[ELT1]], 16, [[ELT0]]
 
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], v{{[0-9]}}, v{{[0-9]}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
 define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -345,8 +354,10 @@
 ; GCN-LABEL: {{^}}v_insertelement_v2f16_0_inlineimm:
 ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
 
-; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
-; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 53, [[ELT1]]
+; CI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
+; CI: v_or_b32_e32 [[RES:v[0-9]+]], 53, [[ELT1]]
+
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], v{{[0-9]}}, v{{[0-9]}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 
 ; GFX9: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VEC]]
 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[ELT1]], 16, 53
@@ -480,8 +491,10 @@
 ; GFX9: v_bfi_b32 v[[INS_LO:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[LO]]
 
 ; CIVI: s_and_b32 [[VAL_MASKED:s[0-9]+]], [[VAL]], 0xffff{{$}}
-; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[LO]]
-; CIVI: v_or_b32_e32 v[[INS_LO:[0-9]+]], [[VAL_MASKED]], [[AND]]
+; VI: v_mov_b32_e32 [[VAL_MASKED1:v[0-9]+]], [[VAL_MASKED]]
+; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[LO]]
+; CI: v_or_b32_e32 v[[INS_LO:[0-9]+]], [[VAL_MASKED]], [[AND]]
+; VI: v_or_b32_sdwa v[[INS_LO:[0-9]+]], [[VAL_MASKED1]], v{{[0-9]}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 
 ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[INS_LO]]:[[HI]]{{\]}}
 define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 {
@@ -534,8 +547,11 @@
 ; GFX9: v_bfi_b32 v[[INS_HI:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[HI]]
 
 ; CIVI: s_and_b32 [[VAL_MASKED:s[0-9]+]], [[VAL]], 0xffff{{$}}
-; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[HI]]
-; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL_MASKED]], [[AND]]
+
+; VI: v_mov_b32_e32 [[VAL_MASKED1:v[0-9]+]], [[VAL_MASKED]]
+; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[HI]]
+; CI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL_MASKED]], [[AND]]
+; VI: v_or_b32_sdwa v[[INS_HI:[0-9]+]], [[VAL_MASKED1]], v{{[0-9]}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 
 ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}}
 define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 {
@@ -588,8 +604,10 @@
 ; GFX9: v_bfi_b32 v[[INS_HI:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[HI]]
 
 ; CIVI: s_and_b32 [[VAL_MASKED:s[0-9]+]], [[VAL]], 0xffff{{$}}
-; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[HI]]
-; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL_MASKED]], [[AND]]
+; VI: v_mov_b32_e32 [[VAL_MASKED1:v[0-9]+]], [[VAL_MASKED]]
+; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[HI]]
+; CI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL_MASKED]], [[AND]]
+; VI: v_or_b32_sdwa v[[INS_HI:[0-9]+]], [[VAL_MASKED1]], v{{[0-9]}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 
 ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}}
 define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
Index: test/CodeGen/AMDGPU/load-lo16.ll
===================================================================
--- test/CodeGen/AMDGPU/load-lo16.ll
+++ test/CodeGen/AMDGPU/load-lo16.ll
@@ -268,7 +268,7 @@
 ; GFX9-NEXT: s_setpc_b64
 
 ; VI: flat_load_ushort v{{[0-9]+}}
-; VI: v_or_b32_e32
+; VI: v_or_b32_sdwa
 define void @load_flat_lo_v2i16_reghi_vreg(i16* %in, i32 %reg) #0 {
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
@@ -287,7 +287,7 @@
 ; GFX9-NEXT: s_setpc_b64
 
 ; VI: flat_load_ushort v{{[0-9]+}}
-; VI: v_or_b32_e32
+; VI: v_or_b32_sdwa
 define void @load_flat_lo_v2f16_reghi_vreg(half* %in, i32 %reg) #0 {
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
@@ -310,6 +310,7 @@
 ; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x5040c00
 ; VI: v_perm_b32 [[RES:v[0-9]+]], [[HI]], [[LO]], [[MASK]]
 ; VI: flat_store_dword v[0:1], [[RES]]
+
 define void @load_flat_lo_v2i16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
@@ -329,7 +330,7 @@
 ; GFX9-NEXT: s_setpc_b64
 
 ; VI: flat_load_sbyte v{{[0-9]+}}
-; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
 define void @load_flat_lo_v2i16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 {
 entry:
Index: test/CodeGen/AMDGPU/sdwa-andops.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/sdwa-andops.mir
@@ -0,0 +1,58 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefixes=GFX9,GCN %s
+# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefixes=VI,GCN %s
+
+# GCN-LABEL: {{^}}name: and_sdwa_i32
+
+# GCN: [[SMOV:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 123
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, [[SMOV]], 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit $exec
+# VI: [[VMOV:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 [[SMOV]], implicit $exec
+# VI: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, [[VMOV]], 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit $exec
+
+# GCN: [[SMOV:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 123
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, [[SMOV]], 1, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit $exec
+# VI: [[VMOV:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 [[SMOV]], implicit $exec
+# VI: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, [[VMOV]], 1, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit $exec
+
+# GCN: [[SMOV:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 123
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, [[SMOV]], 1, %{{[0-9]+}}, 0, 6, 0, 6, 3, implicit $exec
+# VI: [[VMOV:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 [[SMOV]], implicit $exec
+# VI: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, [[VMOV]], 1, %{{[0-9]+}}, 0, 6, 0, 6, 3, implicit $exec
+
+# GCN: [[SMOV:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 123
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, [[SMOV]], 0, %{{[0-9]+}}, 0, 6, 0, 6, 3, implicit $exec
+# VI: [[VMOV:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 [[SMOV]], implicit $exec
+# VI: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, [[VMOV]], 0, %{{[0-9]+}}, 0, 6, 0, 6, 3, implicit $exec
+
+---
+name: and_sdwa_i32
+tracksRegLiveness: true
+registers:
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %10:vgpr_32 = FLAT_LOAD_DWORD %0:vreg_64, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    %11:sreg_32_xm0 = S_MOV_B32 123
+    %12:vgpr_32 = V_LSHRREV_B32_e64 16, %10:vgpr_32, implicit $exec
+    %13:vgpr_32 = V_AND_B32_e32 %11:sreg_32_xm0, killed %12:vgpr_32, implicit-def $vcc, implicit $exec
+    FLAT_STORE_DWORD %0, %13:vgpr_32, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+
+    %20:vgpr_32 = FLAT_LOAD_DWORD %0:vreg_64, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    %21:sreg_32_xm0 = S_MOV_B32 123
+    %22:vgpr_32 = V_ASHRREV_I32_e64 16, %20:vgpr_32, implicit $exec
+    %23:vgpr_32 = V_AND_B32_e32 %21:sreg_32_xm0, killed %22:vgpr_32, implicit-def $vcc, implicit $exec
+    FLAT_STORE_DWORD %0, %23:vgpr_32, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+
+    %30:vgpr_32 = FLAT_LOAD_DWORD %0:vreg_64, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    %31:sreg_32_xm0 = S_MOV_B32 123
+    %32:vgpr_32 = V_ASHRREV_I32_e64 24, %30:vgpr_32, implicit $exec
+    %33:vgpr_32 = V_AND_B32_e32 %31:sreg_32_xm0, killed %32:vgpr_32, implicit-def $vcc, implicit $exec
+    FLAT_STORE_DWORD %0, %33:vgpr_32, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+
+    %40:vgpr_32 = FLAT_LOAD_DWORD %0:vreg_64, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    %41:sreg_32_xm0 = S_MOV_B32 123
+    %42:vgpr_32 = V_LSHRREV_B32_e64 24, %40:vgpr_32, implicit $exec
+    %43:vgpr_32 = V_AND_B32_e32 %41:sreg_32_xm0, killed %42:vgpr_32, implicit-def $vcc, implicit $exec
+    FLAT_STORE_DWORD %0, %43:vgpr_32, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+
Index: test/CodeGen/AMDGPU/sdwa-ors.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/sdwa-ors.mir
@@ -0,0 +1,66 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefixes=GFX9,GCN %s
+# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefixes=GFX9,GCN %s
+
+# GCN-LABEL: {{^}}name: sdwa_test
+# GFX9: V_OR_B32_sdwa 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 0, 6, 0, 2, 6, implicit $exec
+# GFX9: V_OR_B32_sdwa 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 0, 6, 0, 1, 6, implicit $exec
+# GFX9: V_OR_B32_sdwa 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 0, 6, 0, 0, 6, implicit $exec
+# GFX9: V_OR_B32_sdwa 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 0, 6, 0, 3, 6, implicit $exec
+# GFX9: V_OR_B32_sdwa 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 0, 6, 0, 5, 6, implicit $exec
+# GFX9: V_OR_B32_sdwa 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 0, 6, 0, 4, 6, implicit $exec
+# GFX9-NOT: V_OR_B32_sdwa
+
+---
+name: sdwa_test
+tracksRegLiveness: true
+registers:
+  - { id: 1, class: sgpr_64, preferred-register: '' }
+liveins:
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    %2:vreg_64 = COPY $sgpr0_sgpr1
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD %2, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1)
+    %4:sreg_32_xm0 = S_MOV_B32 16711680
+    %5:vgpr_32 = V_AND_B32_e64 %3, killed %4, implicit $exec
+    %6:vgpr_32 = V_OR_B32_e64 killed %5, killed %3, implicit $exec
+    GLOBAL_STORE_DWORD %2, %6, 0, 0, 0, implicit $exec :: (volatile store 4, addrspace 1)
+
+    %13:vgpr_32 = GLOBAL_LOAD_DWORD %2, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1)
+    %14:sreg_32_xm0 = S_MOV_B32 65280
+    %15:vgpr_32 = V_AND_B32_e64 %13, killed %14, implicit $exec
+    %16:vgpr_32 = V_OR_B32_e64 killed %15, killed %13, implicit $exec
+    GLOBAL_STORE_DWORD %2, %16, 0, 0, 0, implicit $exec :: (volatile store 4, addrspace 1)
+
+    %33:vgpr_32 = GLOBAL_LOAD_DWORD %2, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1)
+    %34:sreg_32_xm0 = S_MOV_B32 255
+    %35:vgpr_32 = V_AND_B32_e64 %33, killed %34, implicit $exec
+    %36:vgpr_32 = V_OR_B32_e64 killed %35, killed %33, implicit $exec
+    GLOBAL_STORE_DWORD %2, %36, 0, 0, 0, implicit $exec :: (volatile store 4, addrspace 1)
+
+    %43:vgpr_32 = GLOBAL_LOAD_DWORD %2, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1)
+    %44:sreg_32_xm0 = S_MOV_B32 4278190080
+    %45:vgpr_32 = V_AND_B32_e64 %43, killed %44, implicit $exec
+    %46:vgpr_32 = V_OR_B32_e64 killed %45, killed %43, implicit $exec
+    GLOBAL_STORE_DWORD %2, %46, 0, 0, 0, implicit $exec :: (volatile store 4, addrspace 1)
+
+    %53:vgpr_32 = GLOBAL_LOAD_DWORD %2, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1)
+    %54:sreg_32_xm0 = S_MOV_B32 4294901760
+    %55:vgpr_32 = V_AND_B32_e64 %53, killed %54, implicit $exec
+    %56:vgpr_32 = V_OR_B32_e64 killed %55, killed %53, implicit $exec
+    GLOBAL_STORE_DWORD %2, %56, 0, 0, 0, implicit $exec :: (volatile store 4, addrspace 1)
+
+    %63:vgpr_32 = GLOBAL_LOAD_DWORD %2, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1)
+    %64:sreg_32_xm0 = S_MOV_B32 65535
+    %65:vgpr_32 = V_AND_B32_e64 %63, killed %64, implicit $exec
+    %66:vgpr_32 = V_OR_B32_e64 killed %65, killed %63, implicit $exec
+    GLOBAL_STORE_DWORD %2, %66, 0, 0, 0, implicit $exec :: (volatile store 4, addrspace 1)
+
+    %73:vgpr_32 = GLOBAL_LOAD_DWORD %2, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1)
+    %74:sreg_32_xm0 = S_MOV_B32 65536
+    %75:vgpr_32 = V_AND_B32_e64 %73, killed %74, implicit $exec
+    %76:vgpr_32 = V_OR_B32_e64 killed %75, killed %73, implicit $exec
+    GLOBAL_STORE_DWORD %2, %76, 0, 0, 0, implicit $exec :: (volatile store 4, addrspace 1)
+
Index: test/CodeGen/AMDGPU/sdwa-xors-ands-ors.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/sdwa-xors-ands-ors.ll
@@ -0,0 +1,33 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GCN %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GCN %s
+
+; GCN-LABEL: {{^}}sdwa_test:
+; GFX9: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+
+define amdgpu_kernel void @sdwa_test(i32 addrspace(1)* %arg, i32 addrspace(1)* %arg1) local_unnamed_addr #0 {
+bb:
+  %tmp = load volatile i32, i32 addrspace(1)* %arg1, align 4
+  %tmp2 = load volatile i32, i32 addrspace(1)* %arg1, align 4
+  %tmp3 = load volatile i32, i32 addrspace(1)* %arg1, align 4
+  %tmp4 = load volatile i32, i32 addrspace(1)* %arg1, align 4
+  %tmp5 = load volatile i32, i32 addrspace(1)* %arg1, align 4
+  %tmp6 = load volatile i32, i32 addrspace(1)* %arg1, align 4
+  %tmp7 = load volatile i32, i32 addrspace(1)* %arg1, align 4
+  %tmp8 = load volatile i32, i32 addrspace(1)* %arg1, align 4
+  %tmp9 = load volatile i32, i32 addrspace(1)* %arg, align 4
+  %tmp10 = xor i32 %tmp5, %tmp
+  %tmp11 = and i32 %tmp10, -16777216
+  %tmp12 = xor i32 %tmp6, %tmp2
+  %tmp13 = and i32 %tmp12, 16711680
+  %tmp14 = or i32 %tmp13, %tmp11
+  %tmp15 = xor i32 %tmp7, %tmp3
+  %tmp16 = and i32 %tmp15, 65280
+  %tmp17 = or i32 %tmp14, %tmp16
+  %tmp18 = xor i32 %tmp8, %tmp4
+  %tmp19 = and i32 %tmp18, 255
+  %tmp20 = or i32 %tmp17, %tmp19
+  store volatile i32 %tmp20, i32 addrspace(1)* %arg, align 4
+  ret void
+}
+
+attributes #0 = { norecurse nounwind "target-cpu"="gfx900" }
Index: test/CodeGen/AMDGPU/sub.v2i16.ll
===================================================================
--- test/CodeGen/AMDGPU/sub.v2i16.ll
+++ test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -125,9 +125,8 @@
 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, 32{{$}}
 
 ; VI: flat_load_dword [[LOAD:v[0-9]+]]
-; VI-DAG: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, [[LOAD]]
 ; VI-DAG: v_add_u16_e32 [[ADD:v[0-9]+]], 0xffffffe0, [[LOAD]]
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[ADD]], [[AND]]
+; VI: v_or_b32_sdwa v{{[0-9]+}}, [[ADD]], [[LOAD]]
 define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid