diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -341,22 +341,27 @@
 defm V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, fshr>;
 defm V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>;
-let mayRaiseFPException = 0 in { // XXX - Seems suspect but manual doesn't say it does
-defm V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmin3>;
-defm V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmin3>;
-defm V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumin3>;
-defm V_MAX3_F32 : VOP3Inst <"v_max3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmax3>;
-defm V_MAX3_I32 : VOP3Inst <"v_max3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmax3>;
-defm V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumax3>;
-defm V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>;
-defm V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmed3>;
-defm V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumed3>;
+// XXX - No FPException seems suspect but manual doesn't say it does
+let mayRaiseFPException = 0 in {
+  let isCommutable = 1 in {
+    defm V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmin3>;
+    defm V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumin3>;
+    defm V_MAX3_I32 : VOP3Inst <"v_max3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmax3>;
+    defm V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumax3>;
+    defm V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmed3>;
+    defm V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumed3>;
+  } // End isCommutable = 1
+  defm V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmin3>;
+  defm V_MAX3_F32 : VOP3Inst <"v_max3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmax3>;
+  defm V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>;
 } // End mayRaiseFPException = 0
-defm V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-defm V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-defm V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-defm V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+let isCommutable = 1 in {
+  defm V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+  defm V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+  defm V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+  defm V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+} // End isCommutable = 1
 defm V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>;
 defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUdiv_fixup>;
@@ -596,16 +601,16 @@
 }
 
 let SubtargetPredicate = isGFX9Plus in {
-defm V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
-defm V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-defm V_ADD_LSHL_U32 : VOP3Inst <"v_add_lshl_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-defm V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-defm V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-defm V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-
-defm V_XAD_U32 : VOP3Inst <"v_xad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-
+let isCommutable = 1 in {
+  defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+  defm V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+  defm V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+  defm V_XAD_U32 : VOP3Inst <"v_xad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+  defm V_ADD_I32 : VOP3Inst <"v_add_i32", VOP3_Profile<VOP_I32_I32_I32_ARITH>>;
+  defm V_ADD_LSHL_U32 : VOP3Inst <"v_add_lshl_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+} // End isCommutable = 1
+// TODO src0 contains the opsel bit for dst, so if we commute, need to mask and swap this
+// to the new src0.
 defm V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmed3>;
 defm V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmed3>;
 defm V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumed3>;
@@ -627,8 +632,11 @@
 defm V_CVT_PKNORM_I16_F16 : VOP3Inst <"v_cvt_pknorm_i16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
 defm V_CVT_PKNORM_U16_F16 : VOP3Inst <"v_cvt_pknorm_u16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
-defm V_ADD_I32 : VOP3Inst <"v_add_i32", VOP3_Profile<VOP_I32_I32_I32_ARITH>>;
 defm V_SUB_I32 : VOP3Inst <"v_sub_i32", VOP3_Profile<VOP_I32_I32_I32_ARITH>>;
+defm V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
+defm V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+defm V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+
 class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat <
@@ -707,7 +715,9 @@
 let SubtargetPredicate = isGFX10Plus in {
-  defm V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+  let isCommutable = 1 in {
+    defm V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+  } // End isCommutable = 1
   def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32_e64>;
 
 let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll
@@ -112,7 +112,7 @@
 ;
 ; GFX10-LABEL: add_shl_vgpr_const_inline_const:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: v_add_lshl_u32 v0, v0, 0x3f4, 9
+; GFX10-NEXT: v_add_lshl_u32 v0, 0x3f4, v0, 9
 ; GFX10-NEXT: ; return to shader part epilog
 %x = add i32 %a, 1012
 %result = shl i32 %x, 9
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -642,7 +642,7 @@
 ; GFX10-NEXT: v_div_fixup_f16 v0, v4, v1, v0
 ; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5
 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %fdiv = fdiv <2 x half> %a, %b
 ret <2 x half> %fdiv
@@ -701,7 +701,7 @@
 ; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
 ; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_and_or_b32 v0, v2, 0xffff, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %fdiv = fdiv afn <2 x half> %a, %b
 ret <2 x half> %fdiv
@@ -854,7 +854,7 @@
 ; GFX10-NEXT: v_div_fixup_f16 v0, v4, v1, v0
 ; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5
 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %fdiv = fdiv <2 x half> %a, %b, !fpmath !0
 ret <2 x half> %fdiv
@@ -999,7 +999,7 @@
 ; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
 ; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
 ret <2 x half> %fdiv
@@ -1144,7 +1144,7 @@
 ; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
 ; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %fdiv = fdiv arcp <2 x half> <half 1.0, half 1.0>, %x
 ret <2 x half> %fdiv
@@ -1195,7 +1195,7 @@
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_rcp_f16_e32 v1, v0
 ; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v0, v1, 0xffff, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %fdiv = fdiv arcp afn <2 x half> <half 1.0, half 1.0>, %x
 ret <2 x half> %fdiv
@@ -1308,7 +1308,7 @@
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_rcp_f16_e32 v1, v0
 ; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v0, v1, 0xffff, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x, !fpmath !0
 ret <2 x half> %fdiv
@@ -1367,7 +1367,7 @@
 ; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
 ; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_and_or_b32 v0, v2, 0xffff, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0
 ret <2 x half> %fdiv
@@ -1520,7 +1520,7 @@
 ; GFX10-NEXT: v_div_fixup_f16 v0, v4, v1, v0
 ; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5
 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %fdiv = fdiv arcp <2 x half> %a, %b, !fpmath !0
 ret <2 x half> %fdiv
@@ -1579,7 +1579,7 @@
 ; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
 ; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_and_or_b32 v0, v2, 0xffff, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %fdiv = fdiv afn arcp <2 x half> %a, %b, !fpmath !0
 ret <2 x half> %fdiv
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
@@ -218,7 +218,7 @@
 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
 ; GFX10-NEXT: v_exp_f16_e32 v1, v1
 ; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX10-NEXT: v_and_or_b32 v0, v1, 0xffff, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y)
 ret <2 x half> %pow
@@ -307,7 +307,7 @@
 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
 ; GFX10-NEXT: v_exp_f16_e32 v1, v1
 ; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX10-NEXT: v_and_or_b32 v0, v1, 0xffff, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %x.fneg = fneg <2 x half> %x
 %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y)
@@ -397,7 +397,7 @@
 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
 ; GFX10-NEXT: v_exp_f16_e32 v1, v1
 ; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX10-NEXT: v_and_or_b32 v0, v1, 0xffff, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %y.fneg = fneg <2 x half> %y
 %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg)
@@ -495,7 +495,7 @@
 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
 ; GFX10-NEXT: v_exp_f16_e32 v1, v1
 ; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX10-NEXT: v_and_or_b32 v0, v1, 0xffff, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %x.fneg = fneg <2 x half> %x
 %y.fneg = fneg <2 x half> %y
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -3555,7 +3555,7 @@
 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v3
 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
-; GFX10-NEXT: v_and_or_b32 v2, v3, 0xffff, v2
+; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v2
 ; GFX10-NEXT: v_pk_sub_i16 v3, 16, v2 op_sel_hi:[0,1]
 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0
 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v3, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -3455,7 +3455,7 @@
 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v3
 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
-; GFX10-NEXT: v_and_or_b32 v2, v3, 0xffff, v2
+; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v2
 ; GFX10-NEXT: v_pk_sub_i16 v3, 16, v2 op_sel_hi:[0,1]
 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v2, v1
 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v3, v0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll
@@ -469,7 +469,7 @@
 ; GFX10-NEXT: s_mov_b32 s1, s3
 ; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
-; GFX10-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
 ; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
 ; GFX10-NEXT: s_mov_b32 s6, s8
@@ -596,7 +596,7 @@
 ; GFX10-NEXT: s_mov_b32 s1, s3
 ; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
-; GFX10-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
 ; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
 ; GFX10-NEXT: s_mov_b32 s6, s8
@@ -1243,7 +1243,7 @@
 ; GFX10-NEXT: s_mov_b32 s1, s3
 ; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
-; GFX10-NEXT: v_and_or_b32 v2, v2, 0xffff, v3
+; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
 ; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
 ; GFX10-NEXT: s_mov_b32 s6, s8
@@ -1370,7 +1370,7 @@
 ; GFX10-NEXT: s_mov_b32 s1, s3
 ; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
-; GFX10-NEXT: v_and_or_b32 v2, v2, 0xffff, v3
+; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
 ; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
 ; GFX10-NEXT: s_mov_b32 s6, s8
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
@@ -44,7 +44,7 @@
 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
-; GFX10NSA-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
+; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@@ -210,7 +210,7 @@
 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
-; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
+; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@@ -376,7 +376,7 @@
 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
-; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
+; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@@ -428,7 +428,7 @@
 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
-; GFX10NSA-NEXT: v_and_or_b32 v2, v2, 0xffff, v3
+; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@@ -683,7 +683,7 @@
 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
-; GFX10NSA-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
+; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
@@ -729,7 +729,7 @@
 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
-; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
+; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.a16.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.a16.ll
@@ -72,7 +72,7 @@
 ; GFX10-NEXT: s_mov_b32 s1, s3
 ; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
-; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
 ; GFX10-NEXT: s_mov_b32 s6, s8
@@ -121,7 +121,7 @@
 ; GFX10-NEXT: s_mov_b32 s10, s12
 ; GFX10-NEXT: s_lshl_b32 s12, s0, 16
 ; GFX10-NEXT: s_mov_b32 s1, s3
-; GFX10-NEXT: v_and_or_b32 v1, v1, 0xffff, s12
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
 ; GFX10-NEXT: s_mov_b32 s3, s5
 ; GFX10-NEXT: s_mov_b32 s5, s7
 ; GFX10-NEXT: s_mov_b32 s7, s9
@@ -164,7 +164,7 @@
 ; GFX10-NEXT: s_mov_b32 s1, s3
 ; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
-; GFX10-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
 ; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
 ; GFX10-NEXT: s_mov_b32 s6, s8
@@ -213,7 +213,7 @@
 ; GFX10-NEXT: s_mov_b32 s10, s12
 ; GFX10-NEXT: s_lshl_b32 s12, s0, 16
 ; GFX10-NEXT: s_mov_b32 s1, s3
-; GFX10-NEXT: v_and_or_b32 v1, v1, 0xffff, s12
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
 ; GFX10-NEXT: s_mov_b32 s3, s5
 ; GFX10-NEXT: s_mov_b32 s5, s7
 ; GFX10-NEXT: s_mov_b32 s7, s9
@@ -256,7 +256,7 @@
 ; GFX10-NEXT: s_mov_b32 s1, s3
 ; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
-; GFX10-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
 ; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
 ; GFX10-NEXT: s_mov_b32 s6, s8
@@ -305,7 +305,7 @@
 ; GFX10-NEXT: s_mov_b32 s10, s12
 ; GFX10-NEXT: s_lshl_b32 s12, s0, 16
 ; GFX10-NEXT: s_mov_b32 s1, s3
-; GFX10-NEXT: v_and_or_b32 v2, v2, 0xffff, s12
+; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12
 ; GFX10-NEXT: s_mov_b32 s3, s5
 ; GFX10-NEXT: s_mov_b32 s5, s7
 ; GFX10-NEXT: s_mov_b32 s7, s9
@@ -348,7 +348,7 @@
 ; GFX10-NEXT: s_mov_b32 s1, s3
 ; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
-; GFX10-NEXT: v_and_or_b32 v2, v2, 0xffff, v3
+; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
 ; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
 ; GFX10-NEXT: s_mov_b32 s6, s8
@@ -394,7 +394,7 @@
 ; GFX10-NEXT: s_mov_b32 s1, s3
 ; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
-; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
 ; GFX10-NEXT: s_mov_b32 s6, s8
@@ -440,7 +440,7 @@
 ; GFX10-NEXT: s_mov_b32 s1, s3
 ; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
-; GFX10-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
 ; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
 ; GFX10-NEXT: s_mov_b32 s6, s8
@@ -486,7 +486,7 @@
 ; GFX10-NEXT: s_mov_b32 s1, s3
 ; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
-; GFX10-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
 ; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
 ; GFX10-NEXT: s_mov_b32 s6, s8
@@ -532,7 +532,7 @@
 ; GFX10-NEXT: s_mov_b32 s1, s3
 ; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
-; GFX10-NEXT: v_and_or_b32 v2, v2, 0xffff, v3
+; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
 ; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
 ; GFX10-NEXT: s_mov_b32 s6, s8
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
@@ -272,7 +272,7 @@
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_rndne_f16_e32 v1, v0
 ; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v0, v1, 0xffff, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %roundeven = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %x)
 ret <2 x half> %roundeven
@@ -341,7 +341,7 @@
 ; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX10-NEXT: v_rndne_f16_e32 v1, v0
 ; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v0, v1, 0xffff, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %x.fneg = fneg <2 x half> %x
 %roundeven = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %x.fneg)
diff --git a/llvm/test/CodeGen/AMDGPU/commute-vop3.mir b/llvm/test/CodeGen/AMDGPU/commute-vop3.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/commute-vop3.mir
@@ -0,0 +1,57 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -run-pass=machine-cse -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -run-pass=machine-cse -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
+
+---
+
+name: commute_vop3
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX9-LABEL: name: commute_vop3
+    ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9: [[V_XOR3_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR3_B32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec
+    ; GFX9: [[V_MED3_F16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9: [[V_MED3_F16_e64_1:%[0-9]+]]:vgpr_32 = V_MED3_F16_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9: [[V_MAX3_I32_e64_:%[0-9]+]]:vgpr_32 = V_MAX3_I32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec
+    ; GFX9: [[V_SAD_HI_U8_e64_:%[0-9]+]]:vgpr_32 = V_SAD_HI_U8_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX9: [[V_XAD_U32_e64_:%[0-9]+]]:vgpr_32 = V_XAD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
+    ; GFX9: [[V_SUB_I32_e64_:%[0-9]+]]:vgpr_32 = V_SUB_I32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
+    ; GFX9: [[V_SUB_I32_e64_1:%[0-9]+]]:vgpr_32 = V_SUB_I32_e64 [[COPY1]], [[COPY]], 0, implicit $exec
+    ; GFX10-LABEL: name: commute_vop3
+    ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX10: [[V_XOR3_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR3_B32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec
+    ; GFX10: [[V_MED3_F16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+    ; GFX10: [[V_MED3_F16_e64_1:%[0-9]+]]:vgpr_32 = V_MED3_F16_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+    ; GFX10: [[V_MAX3_I32_e64_:%[0-9]+]]:vgpr_32 = V_MAX3_I32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec
+    ; GFX10: [[V_SAD_HI_U8_e64_:%[0-9]+]]:vgpr_32 = V_SAD_HI_U8_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX10: [[V_XAD_U32_e64_:%[0-9]+]]:vgpr_32 = V_XAD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
+    ; GFX10: [[V_SUB_I32_e64_:%[0-9]+]]:vgpr_32 = V_SUB_I32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
+    ; GFX10: [[V_SUB_I32_e64_1:%[0-9]+]]:vgpr_32 = V_SUB_I32_e64 [[COPY1]], [[COPY]], 0, implicit $exec
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = COPY $vgpr2
+    %3:vgpr_32 = V_XOR3_B32_e64 %0, %1, %2, implicit $exec
+    %4:vgpr_32 = V_XOR3_B32_e64 %1, %0, %2, implicit $exec
+    ; Insts with MayRaiseFPException do not get CSE'd
+    %5:vgpr_32 = V_MED3_F16_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec
+    %6:vgpr_32 = V_MED3_F16_e64 0, %1, 0, %0, 0, %2, 0, 0, implicit $mode, implicit $exec
+
+    %7:vgpr_32 = V_MAX3_I32_e64 %0, %1, %2, implicit $exec
+    %8:vgpr_32 = V_MAX3_I32_e64 %1, %0, %2, implicit $exec
+    %11:vgpr_32 = V_SAD_HI_U8_e64 %0, %1, %2, 0, implicit $exec
+    %12:vgpr_32 = V_SAD_HI_U8_e64 %1, %0, %2, 0, implicit $exec
+    %13:vgpr_32 = V_XAD_U32_e64 %0, %1, 0, implicit $exec
+    %14:vgpr_32 = V_XAD_U32_e64 %1, %0, 0, implicit $exec
+    ; Sub should not be commuted
+    %15:vgpr_32 = V_SUB_I32_e64 %0, %1, 0, implicit $exec
+    %16:vgpr_32 = V_SUB_I32_e64 %1, %0, 0, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -171,7 +171,7 @@
 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_bfi_b32 v1, 0xffff, 0, v0
-; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1
 %p.0 = load i16, i16 addrspace(1)* %p, align 1
@@ -286,7 +286,7 @@
 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_bfi_b32 v1, 0xffff, 0, v0
-; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1
 %p.0 = load i16, i16 addrspace(1)* %p, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
@@ -209,7 +209,7 @@
 ; GFX10-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_bfi_b32 v1, 0xffff, 0, v0
-; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLASTSCR-LABEL: private_load_2xi16_align1:
@@ -219,7 +219,7 @@
 ; GFX10-FLASTSCR-NEXT: scratch_load_dword v0, v0, off
 ; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-FLASTSCR-NEXT: v_bfi_b32 v1, 0xffff, 0, v0
-; GFX10-FLASTSCR-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
+; GFX10-FLASTSCR-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
 %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
 %p.0 = load i16, i16 addrspace(5)* %p, align 1
@@ -348,7 +348,7 @@
 ; GFX10-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_bfi_b32 v1, 0xffff, 0, v0
-; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLASTSCR-LABEL: private_load_2xi16_align4:
@@ -358,7 +358,7 @@
 ; GFX10-FLASTSCR-NEXT: scratch_load_dword v0, v0, off
 ; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-FLASTSCR-NEXT: v_bfi_b32 v1, 0xffff, 0, v0
-; GFX10-FLASTSCR-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
+; GFX10-FLASTSCR-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
 %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
 %p.0 = load i16, i16 addrspace(5)* %p, align 4