diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13503,6 +13503,16 @@
     return N0.getOperand(0);
   }
 
+  // fold (truncate (build_pair x, y)) -> (truncate x) or x
+  if (N0.getOpcode() == ISD::BUILD_PAIR) {
+    // If we match the low element of the pair, just return it.
+    if (N0.getOperand(0).getValueType() == VT)
+      return N0.getOperand(0);
+    // Otherwise, if the low part is still too large, apply the truncate.
+    if (N0.getOperand(0).getValueType().bitsGT(VT))
+      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
+  }
+
   // Try to narrow a truncate-of-sext_in_reg to the destination type:
   // trunc (sign_ext_inreg X, iM) to iN --> sign_ext_inreg (trunc X to iN), iM
   if (!LegalTypes && N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -8,20 +8,20 @@
 ; SI-LABEL: umulo_i64_v_v:
 ; SI: ; %bb.0: ; %bb
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_mul_hi_u32 v4, v1, v2
-; SI-NEXT: v_mul_lo_u32 v5, v1, v2
-; SI-NEXT: v_mul_hi_u32 v6, v0, v3
-; SI-NEXT: v_mul_lo_u32 v7, v0, v3
-; SI-NEXT: v_mul_hi_u32 v8, v0, v2
-; SI-NEXT: v_mul_hi_u32 v9, v1, v3
+; SI-NEXT: v_mul_hi_u32 v4, v1, v3
+; SI-NEXT: v_mul_hi_u32 v5, v1, v2
+; SI-NEXT: v_mul_lo_u32 v6, v1, v2
+; SI-NEXT: v_mul_hi_u32 v7, v0, v3
+; SI-NEXT: v_mul_lo_u32 v8, v0, v3
+; SI-NEXT: v_mul_hi_u32 v9, v0, v2
 ; SI-NEXT: v_mul_lo_u32 v3, v1, v3
 ; SI-NEXT: v_mul_lo_u32 v0, v0, v2
-; SI-NEXT: v_add_i32_e32 v1, vcc, v8, v7
-; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc
-; SI-NEXT: v_add_i32_e32 v6, vcc, v1, v5
-; SI-NEXT: v_add_i32_e64 v1, s[4:5], v1, v5
-; SI-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v9, vcc
+; SI-NEXT: v_add_i32_e32 v1, vcc, v9, v8
+; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v7, vcc
+; SI-NEXT: v_add_i32_e32 v7, vcc, v1, v6
+; SI-NEXT: v_add_i32_e64 v1, s[4:5], v1, v6
+; SI-NEXT: v_addc_u32_e32 v2, vcc, v2, v5, vcc
+; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
 ; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v3
 ; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; SI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
@@ -34,14 +34,14 @@
 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
 ; GFX9-NEXT: v_mov_b32_e32 v4, v1
 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
-; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, 0
-; GFX9-NEXT: v_mov_b32_e32 v10, v1
-; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v6
-; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v7, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v3, 0
+; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v12, v1
 ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v3, 0
-; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8
-; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v12, v8
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10
+; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v11, vcc
 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
 ; GFX9-NEXT: v_mul_lo_u32 v4, v4, v2
 ; GFX9-NEXT: v_mul_lo_u32 v5, v5, v3
@@ -114,21 +114,21 @@
 ; SI-LABEL: smulo_i64_v_v:
 ; SI: ; %bb.0: ; %bb
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mul_hi_i32 v7, v1, v3
 ; SI-NEXT: v_mul_hi_u32 v6, v1, v2
 ; SI-NEXT: v_mul_lo_u32 v5, v1, v2
-; SI-NEXT: v_mul_hi_u32 v7, v0, v3
-; SI-NEXT: v_mul_lo_u32 v8, v0, v3
-; SI-NEXT: v_mul_hi_u32 v9, v0, v2
-; SI-NEXT: v_mul_hi_i32 v10, v1, v3
+; SI-NEXT: v_mul_hi_u32 v8, v0, v3
+; SI-NEXT: v_mul_lo_u32 v9, v0, v3
+; SI-NEXT: v_mul_hi_u32 v10, v0, v2
 ; SI-NEXT: v_mul_lo_u32 v11, v1, v3
 ; SI-NEXT: v_mul_lo_u32 v4, v0, v2
-; SI-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; SI-NEXT: v_add_i32_e32 v9, vcc, v8, v5
-; SI-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5
-; SI-NEXT: v_addc_u32_e32 v8, vcc, v7, v6, vcc
+; SI-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc
+; SI-NEXT: v_add_i32_e32 v10, vcc, v9, v5
+; SI-NEXT: v_add_i32_e64 v5, s[4:5], v9, v5
+; SI-NEXT: v_addc_u32_e32 v8, vcc, v8, v6, vcc
 ; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5
-; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc
+; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v7, vcc
 ; SI-NEXT: v_mov_b32_e32 v7, v6
 ; SI-NEXT: v_add_i32_e32 v8, vcc, v8, v11
 ; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
@@ -154,14 +154,14 @@
 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
 ; GFX9-NEXT: v_mov_b32_e32 v4, v1
 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
-; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, 0
-; GFX9-NEXT: v_mov_b32_e32 v10, v1
-; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v6
-; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v7, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v3, 0
+; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v12, v1
 ; GFX9-NEXT: v_mad_i64_i32 v[6:7], s[4:5], v4, v3, 0
-; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8
-; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v12, v8
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10
+; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v11, vcc
 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v8, v6
 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -455,10 +455,10 @@
 ; CI-LABEL: mad_u64_u32_bitops_lhs_mask_small:
 ; CI: ; %bb.0:
 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_and_b32_e32 v3, 1, v1
+; CI-NEXT: v_and_b32_e32 v1, 1, v1
+; CI-NEXT: v_mul_lo_u32 v3, v1, v2
 ; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
-; CI-NEXT: v_mul_lo_u32 v2, v3, v2
-; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CI-NEXT: v_add_i32_e32 v1, vcc, v3, v1
 ; CI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: mad_u64_u32_bitops_lhs_mask_small:
@@ -506,11 +506,10 @@
 ; CI-LABEL: mad_u64_u32_bitops_rhs_mask_small:
 ; CI: ; %bb.0:
 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v6, v0
-; CI-NEXT: v_and_b32_e32 v3, 1, v3
-; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, v[4:5]
-; CI-NEXT: v_mul_lo_u32 v2, v6, v3
-; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CI-NEXT: v_and_b32_e32 v1, 1, v3
+; CI-NEXT: v_mul_lo_u32 v3, v0, v1
+; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
+; CI-NEXT: v_add_i32_e32 v1, vcc, v3, v1
 ; CI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: mad_u64_u32_bitops_rhs_mask_small:
@@ -886,20 +885,19 @@
 ; CI-LABEL: mad_i48_i48:
 ; CI: ; %bb.0:
 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v6, v1
-; CI-NEXT: v_mov_b32_e32 v7, v0
-; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v2, v[4:5]
-; CI-NEXT: v_mul_lo_u32 v2, v6, v2
-; CI-NEXT: v_mul_lo_u32 v3, v7, v3
+; CI-NEXT: v_mov_b32_e32 v6, v0
+; CI-NEXT: v_mul_lo_u32 v7, v1, v2
+; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, v[4:5]
+; CI-NEXT: v_mul_lo_u32 v2, v6, v3
+; CI-NEXT: v_add_i32_e32 v1, vcc, v7, v1
 ; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1
-; CI-NEXT: v_add_i32_e32 v1, vcc, v3, v1
 ; CI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: mad_i48_i48:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_mul_lo_u32 v3, v0, v3
 ; SI-NEXT: v_mul_hi_u32 v6, v0, v2
+; SI-NEXT: v_mul_lo_u32 v3, v0, v3
 ; SI-NEXT: v_mul_lo_u32 v1, v1, v2
 ; SI-NEXT: v_mul_lo_u32 v0, v0, v2
 ; SI-NEXT: v_add_i32_e32 v3, vcc, v6, v3
@@ -911,25 +909,22 @@
 ; GFX9-LABEL: mad_i48_i48:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v2, v[4:5]
-; GFX9-NEXT: v_mul_lo_u32 v3, v7, v3
-; GFX9-NEXT: v_mul_lo_u32 v2, v6, v2
-; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_mul_lo_u32 v3, v0, v3
+; GFX9-NEXT: v_mul_lo_u32 v6, v1, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
+; GFX9-NEXT: v_add3_u32 v1, v6, v1, v3
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mad_i48_i48:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v7, v2, v[4:5]
-; GFX11-NEXT: v_mul_lo_u32 v3, v7, v3
-; GFX11-NEXT: v_mul_lo_u32 v2, v6, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX11-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-NEXT: v_mul_lo_u32 v7, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_lo_u32 v3, v6, v3
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5]
+; GFX11-NEXT: v_add3_u32 v1, v7, v1, v3
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
   %m = mul i48 %arg0, %arg1
   %a = add i48 %m, %arg2
diff --git a/llvm/test/CodeGen/AMDGPU/select-undef.ll b/llvm/test/CodeGen/AMDGPU/select-undef.ll
--- a/llvm/test/CodeGen/AMDGPU/select-undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-undef.ll
@@ -245,9 +245,9 @@
 ; undef vector resulting in a combiner loop.
 ; GCN-LABEL: {{^}}inf_loop_undef_vector:
 ; GCN: s_waitcnt
-; GCN-NEXT: v_mad_u64_u32
 ; GCN-NEXT: v_mul_lo_u32
 ; GCN-NEXT: v_mul_lo_u32
+; GCN-NEXT: v_mad_u64_u32
 ; GCN-NEXT: v_add3_u32
 ; GCN-NEXT: global_store_dwordx2
 define void @inf_loop_undef_vector(<6 x float> %arg, float %arg1, i64 %arg2) {
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -5,13 +5,13 @@
 ; GCN-LABEL: v_shl_i128_vv:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sub_i32_e32 v7, vcc, 64, v4
-; GCN-NEXT: v_lshl_b64 v[5:6], v[2:3], v4
-; GCN-NEXT: v_lshr_b64 v[7:8], v[0:1], v7
+; GCN-NEXT: v_sub_i32_e32 v5, vcc, 64, v4
+; GCN-NEXT: v_lshr_b64 v[5:6], v[0:1], v5
+; GCN-NEXT: v_lshl_b64 v[7:8], v[2:3], v4
 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GCN-NEXT: v_or_b32_e32 v7, v5, v7
+; GCN-NEXT: v_or_b32_e32 v7, v7, v5
 ; GCN-NEXT: v_subrev_i32_e32 v5, vcc, 64, v4
-; GCN-NEXT: v_or_b32_e32 v8, v6, v8
+; GCN-NEXT: v_or_b32_e32 v8, v8, v6
 ; GCN-NEXT: v_lshl_b64 v[5:6], v[0:1], v5
 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
 ; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
@@ -30,13 +30,13 @@
 ; GCN-LABEL: v_lshr_i128_vv:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sub_i32_e32 v7, vcc, 64, v4
-; GCN-NEXT: v_lshr_b64 v[5:6], v[0:1], v4
-; GCN-NEXT: v_lshl_b64 v[7:8], v[2:3], v7
+; GCN-NEXT: v_sub_i32_e32 v5, vcc, 64, v4
+; GCN-NEXT: v_lshl_b64 v[5:6], v[2:3], v5
+; GCN-NEXT: v_lshr_b64 v[7:8], v[0:1], v4
 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GCN-NEXT: v_or_b32_e32 v7, v5, v7
+; GCN-NEXT: v_or_b32_e32 v7, v7, v5
 ; GCN-NEXT: v_subrev_i32_e32 v5, vcc, 64, v4
-; GCN-NEXT: v_or_b32_e32 v8, v6, v8
+; GCN-NEXT: v_or_b32_e32 v8, v8, v6
 ; GCN-NEXT: v_lshr_b64 v[5:6], v[2:3], v5
 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
 ; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
@@ -56,23 +56,25 @@
 ; GCN-LABEL: v_ashr_i128_vv:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sub_i32_e32 v7, vcc, 64, v4
-; GCN-NEXT: v_lshr_b64 v[5:6], v[0:1], v4
+; GCN-NEXT: v_sub_i32_e64 v7, s[4:5], 64, v4
 ; GCN-NEXT: v_lshl_b64 v[7:8], v[2:3], v7
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GCN-NEXT: v_or_b32_e32 v7, v5, v7
-; GCN-NEXT: v_subrev_i32_e32 v5, vcc, 64, v4
-; GCN-NEXT: v_or_b32_e32 v8, v6, v8
-; GCN-NEXT: v_ashr_i64 v[5:6], v[2:3], v5
+; GCN-NEXT: v_lshr_b64 v[9:10], v[0:1], v4
+; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v3
+; GCN-NEXT: v_or_b32_e32 v8, v10, v8
+; GCN-NEXT: v_subrev_i32_e64 v10, s[4:5], 64, v4
+; GCN-NEXT: v_ashr_i64 v[5:6], v[2:3], v4
+; GCN-NEXT: v_ashr_i64 v[2:3], v[2:3], v10
 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
-; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5]
-; GCN-NEXT: v_ashr_i64 v[4:5], v[2:3], v4
-; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v3
-; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GCN-NEXT: v_or_b32_e32 v7, v9, v7
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
+; GCN-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v2, v5
+; GCN-NEXT: v_mov_b32_e32 v3, v6
 ; GCN-NEXT: s_setpc_b64 s[30:31]
   %shl = ashr i128 %lhs, %rhs
   ret i128 %shl
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -10,8 +10,8 @@
 ; CIVI: ; %bb.0:
 ; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIVI-NEXT: s_mov_b32 m0, -1
-; CIVI-NEXT: ds_write_b16 v0, v2 offset:4
 ; CIVI-NEXT: ds_write_b32 v0, v1
+; CIVI-NEXT: ds_write_b16 v0, v2 offset:4
 ; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
 ; CIVI-NEXT: ds_write_b8 v0, v1 offset:6
 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
@@ -20,9 +20,9 @@
 ; GFX9-LABEL: local_store_i56:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_write_b32 v0, v1
 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6
 ; GFX9-NEXT: ds_write_b16 v0, v2 offset:4
-; GFX9-NEXT: ds_write_b32 v0, v1
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -30,9 +30,9 @@
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: ds_write_b32 v0, v1
 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:6
 ; GFX10-NEXT: ds_write_b16 v0, v2 offset:4
-; GFX10-NEXT: ds_write_b32 v0, v1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -40,9 +40,9 @@
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: ds_store_b32 v0, v1
 ; GFX11-NEXT: ds_store_b8_d16_hi v0, v2 offset:6
 ; GFX11-NEXT: ds_store_b16 v0, v2 offset:4
-; GFX11-NEXT: ds_store_b32 v0, v1
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
   store i56 %arg, i56 addrspace(3)* %ptr, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -431,29 +431,27 @@
 ; GFX9-O0-LABEL: strict_wwm_called_i64:
 ; GFX9-O0: ; %bb.0:
 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34
 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec
-; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2
+; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: v_add_co_u32_e64 v4, s[34:35], v4, v5
-; GFX9-O0-NEXT: v_addc_co_u32_e64 v0, s[34:35], v0, v1, s[34:35]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
+; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
+; GFX9-O0-NEXT: v_add_co_u32_e64 v4, s[34:35], v4, v6
+; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[34:35], v2, v5, s[34:35]
 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2
 ; GFX9-O0-NEXT: s_mov_b32 s34, 32
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
 ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s34, v[0:1]
 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
 ; GFX9-O0-NEXT: v_mul_lo_u32 v1, v0, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
 ; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s34, v[4:5]
 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
 ; GFX9-O0-NEXT: v_mul_lo_u32 v2, v2, v3
@@ -499,10 +497,10 @@
 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v0, v0
 ; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v1, vcc
-; GFX9-O3-NEXT: v_mul_lo_u32 v4, v3, v0
-; GFX9-O3-NEXT: v_mul_lo_u32 v5, v2, v1
+; GFX9-O3-NEXT: v_mul_lo_u32 v4, v2, v1
+; GFX9-O3-NEXT: v_mul_lo_u32 v5, v3, v0
 ; GFX9-O3-NEXT: v_mad_u64_u32 v[0:1], s[34:35], v2, v0, 0
-; GFX9-O3-NEXT: v_add3_u32 v1, v1, v5, v4
+; GFX9-O3-NEXT: v_add3_u32 v1, v1, v4, v5
 ; GFX9-O3-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-O3-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-O3-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/Hexagon/isel-simplify-trunc-buildpair.ll b/llvm/test/CodeGen/Hexagon/isel-simplify-trunc-buildpair.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/isel-simplify-trunc-buildpair.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+define <32 x i32> @f0(<32 x i32> %a0) #0 {
+; CHECK-LABEL: f0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0: // %b0
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = #1
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.w = vasl(v0.w,r0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: }
+b0:
+  %v0 = call <32 x i32> @llvm.fshr.v32i32(<32 x i32> %a0, <32 x i32> poison, <32 x i32> )
+  %v1 = shufflevector <32 x i32> %v0, <32 x i32> poison, <64 x i32> 
+  %v2 = bitcast <64 x i32> %v1 to <32 x i64>
+  %v3 = trunc <32 x i64> %v2 to <32 x i32>
+  ret <32 x i32> %v3
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare <32 x i32> @llvm.fshr.v32i32(<32 x i32>, <32 x i32>, <32 x i32>) #1
+
+attributes #0 = { "target-features"="+v62,+hvxv62,+hvx-length128b,-packets" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
diff --git a/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll b/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll
--- a/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll
+++ b/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll
@@ -308,8 +308,8 @@
 ; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NOBMI2-NEXT: movl %eax, %ebx
 ; X32-NOBMI2-NEXT: addl $32, %ebx
 ; X32-NOBMI2-NEXT: adcl $0, %edi
@@ -339,8 +339,8 @@
 ; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ebx
 ; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ebx
 ; X32-BMI2-NEXT: movl %ebx, %edi
 ; X32-BMI2-NEXT: addl $32, %edi
 ; X32-BMI2-NEXT: adcl $0, %esi
diff --git a/llvm/test/CodeGen/X86/combine-bswap.ll b/llvm/test/CodeGen/X86/combine-bswap.ll
--- a/llvm/test/CodeGen/X86/combine-bswap.ll
+++ b/llvm/test/CodeGen/X86/combine-bswap.ll
@@ -220,7 +220,7 @@
 define i64 @test_bswap64_shift48(i64 %a0) {
 ; X86-LABEL: test_bswap64_shift48:
 ; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: rolw $8, %ax
 ; X86-NEXT: movzwl %ax, %eax
 ; X86-NEXT: xorl %edx, %edx
diff --git a/llvm/test/CodeGen/X86/pr49451.ll b/llvm/test/CodeGen/X86/pr49451.ll
--- a/llvm/test/CodeGen/X86/pr49451.ll
+++ b/llvm/test/CodeGen/X86/pr49451.ll
@@ -10,7 +10,7 @@
 ; X86: # %bb.0: # %entry
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl $-1, %ecx
 ; X86-NEXT: xorl %edx, %edx
 ; X86-NEXT: xorl %ebx, %ebx
diff --git a/llvm/test/CodeGen/X86/test-shrink.ll b/llvm/test/CodeGen/X86/test-shrink.ll
--- a/llvm/test/CodeGen/X86/test-shrink.ll
+++ b/llvm/test/CodeGen/X86/test-shrink.ll
@@ -822,8 +822,8 @@
 ;
 ; CHECK-X86-LABEL: and64_trunc_16_sign:
 ; CHECK-X86: # %bb.0:
-; CHECK-X86-NEXT: movl $32768, %eax # imm = 0x8000
-; CHECK-X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT: andl $32768, %eax # imm = 0x8000
 ; CHECK-X86-NEXT: testw %ax, %ax
 ; CHECK-X86-NEXT: js .LBB18_2
 ; CHECK-X86-NEXT: # %bb.1: # %yes
@@ -867,7 +867,8 @@
 ;
 ; CHECK-X86-LABEL: and64_trunc_16_sign_minsize:
 ; CHECK-X86: # %bb.0:
-; CHECK-X86-NEXT: testw $-32768, {{[0-9]+}}(%esp) # imm = 0x8000
+; CHECK-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT: testw $-32768, %ax # imm = 0x8000
 ; CHECK-X86-NEXT: js .LBB19_2
 ; CHECK-X86-NEXT: # %bb.1: # %yes
 ; CHECK-X86-NEXT: calll bar@PLT
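
For reference, the shape of the new visitTRUNCATE fold, written in the same shorthand the surrounding DAGCombiner comments use (an illustrative summary of the code added above, not compiler output; iM and iN are placeholder types):

// trunc (build_pair x:iM, y:iM) to iM --> x
// trunc (build_pair x:iM, y:iM) to iN --> (trunc x to iN)   when iN is narrower than iM
// when iN is wider than iM, neither case matches and the node is left for other combines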