Index: llvm/lib/Target/AMDGPU/VOP1Instructions.td =================================================================== --- llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -100,8 +100,13 @@ multiclass VOP1Inst { - def _e32 : VOP1_Pseudo ; - def _e64 : VOP3_Pseudo .ret>; + // We only want to set this on the basic, non-SDWA or DPP forms. + defvar should_mov_imm = !eq(opName, "v_mov_b32"); + + let isMoveImm = should_mov_imm in { + def _e32 : VOP1_Pseudo ; + def _e64 : VOP3_Pseudo .ret>; + } foreach _ = BoolToList.ret in def _sdwa : VOP1_SDWA_Pseudo ; @@ -144,7 +149,7 @@ defm V_NOP : VOP1Inst <"v_nop", VOP_NONE>; } -let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in { +let isReMaterializable = 1, isAsCheapAsAMove = 1 in { defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOP_I32_I32>; } // End isMoveImm = 1 Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -494,8 +494,8 @@ ; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 ; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 @@ -733,8 +733,8 @@ ; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_readlane_b32 s3, 
v2, 31 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 ; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 @@ -972,8 +972,8 @@ ; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 ; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 @@ -2045,8 +2045,8 @@ ; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 ; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 @@ -2780,8 +2780,8 @@ ; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064-NEXT: v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 ; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 @@ -3018,8 +3018,8 @@ ; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064-NEXT: v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 ; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 
@@ -3257,8 +3257,8 @@ ; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064-NEXT: v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 ; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 @@ -3493,8 +3493,8 @@ ; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 ; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 @@ -3913,8 +3913,8 @@ ; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 ; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 @@ -4336,8 +4336,8 @@ ; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064-NEXT: v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 ; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 @@ -4754,8 +4754,8 @@ ; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 
v3, s2 ; GFX1064-NEXT: v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 ; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -347,8 +347,8 @@ ; GFX1064-NEXT: v_readlane_b32 s12, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 v3, s12 ; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s12, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s12, v2, 15 ; GFX1064-NEXT: v_readlane_b32 s13, v2, 31 ; GFX1064-NEXT: v_writelane_b32 v1, s12, 16 ; GFX1064-NEXT: v_readlane_b32 s12, v2, 63 @@ -406,8 +406,8 @@ ; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 ; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032-NEXT: v_readlane_b32 s10, v2, 31 -; GFX1032-NEXT: v_readlane_b32 s11, v2, 15 ; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_readlane_b32 s11, v2, 15 ; GFX1032-NEXT: v_writelane_b32 v1, s11, 16 ; GFX1032-NEXT: s_mov_b32 exec_lo, s9 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 Index: llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir +++ llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir @@ -965,3 +965,33 @@ S_ENDPGM 0, implicit %3 ... 
+ +# This used to incorrectly interpret V_MOV_B32_sdwa as being a move +# immediate, and to treat the src0_modifiers field as a +# materialized immediate. + +--- +# GCN-LABEL: name: no_fold_sdwa_mov_imm +# GCN: %2:vgpr_32 = V_MOV_B32_sdwa 0, %0, 0, 5, 2, 4, implicit $exec, implicit %0(tied-def 0) +# GCN-NEXT: [[SHIFT:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec +# GCN-NEXT: S_ENDPGM 0, implicit [[SHIFT]] + +name: no_fold_sdwa_mov_imm +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + %2:vgpr_32 = V_MOV_B32_sdwa 0, %0:vgpr_32, 0, 5, 2, 4, implicit $exec, implicit %0:vgpr_32(tied-def 0) + %3:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + %4:vgpr_32 = V_LSHRREV_B32_e64 %3:vgpr_32, %2:vgpr_32, implicit $exec + S_ENDPGM 0, implicit %4 + +...