Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8210,18 +8210,20 @@
   if (N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
       (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::SHL, VT)) &&
       TLI.isTypeDesirableForOp(ISD::SHL, VT)) {
-    if (const ConstantSDNode *CAmt = isConstOrConstSplat(N0.getOperand(1))) {
-      uint64_t Amt = CAmt->getZExtValue();
-      unsigned Size = VT.getScalarSizeInBits();
-
-      if (Amt < Size) {
-        SDLoc SL(N);
-        EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
-
-        SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
-        return DAG.getNode(ISD::SHL, SL, VT, Trunc,
-                           DAG.getConstant(Amt, SL, AmtVT));
+    SDValue Amt = N0.getOperand(1);
+    KnownBits Known;
+    DAG.computeKnownBits(Amt, Known);
+    unsigned Size = VT.getScalarSizeInBits();
+    if (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size)) {
+      SDLoc SL(N);
+      EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+
+      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
+      if (AmtVT != Amt.getValueType()) {
+        Amt = DAG.getZExtOrTrunc(Amt, SL, AmtVT);
+        AddToWorklist(Amt.getNode());
       }
+      return DAG.getNode(ISD::SHL, SL, VT, Trunc, Amt);
     }
   }

Index: llvm/trunk/test/CodeGen/AMDGPU/alignbit-pat.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/alignbit-pat.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/alignbit-pat.ll
@@ -16,23 +16,6 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}alignbit_shl_pat:
-; GCN-DAG: s_load_dword s[[SHL:[0-9]+]]
-; GCN-DAG: load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
-; GCN-DAG: s_sub_i32 s[[SHR:[0-9]+]], 32, s[[SHL]]
-; GCN: v_alignbit_b32 v{{[0-9]+}}, v[[HI]], v[[LO]], s[[SHR]]
-
-define amdgpu_kernel void @alignbit_shl_pat(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
-bb:
-  %tmp = load i64, i64 addrspace(1)* %arg, align 8
-  %tmp3 = and i32 %arg2, 31
-  %tmp4 = zext i32 %tmp3 to i64
-  %tmp5 = shl i64 %tmp, %tmp4
-  %tmp6 = trunc i64 %tmp5 to i32
-  store i32 %tmp6, i32 addrspace(1)* %arg1, align 4
-  ret void
-}
-
 ; GCN-LABEL: {{^}}alignbit_shr_pat_v:
 ; GCN-DAG: load_dword v[[SHR:[0-9]+]],
 ; GCN-DAG: load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
@@ -53,27 +36,6 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}alignbit_shl_pat_v:
-; GCN-DAG: load_dword v[[SHL:[0-9]+]],
-; GCN-DAG: load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
-; GCN-DAG: v_sub_i32_e32 v[[SHR:[0-9]+]], {{[^,]+}}, 32, v[[SHL]]
-; GCN: v_alignbit_b32 v{{[0-9]+}}, v[[HI]], v[[LO]], v[[SHR]]
-
-define amdgpu_kernel void @alignbit_shl_pat_v(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1) {
-bb:
-  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tid
-  %tmp = load i64, i64 addrspace(1)* %gep1, align 8
-  %gep2 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tid
-  %amt = load i32, i32 addrspace(1)* %gep2, align 4
-  %tmp3 = and i32 %amt, 31
-  %tmp4 = zext i32 %tmp3 to i64
-  %tmp5 = shl i64 %tmp, %tmp4
-  %tmp6 = trunc i64 %tmp5 to i32
-  store i32 %tmp6, i32 addrspace(1)* %gep2, align 4
-  ret void
-}
-
 ; GCN-LABEL: {{^}}alignbit_shr_pat_wrong_and30:
 ; Negative test, wrong constant
 ; GCN: v_lshr_b64
@@ -90,22 +52,6 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}alignbit_shl_pat_wrong_and30:
-; Negative test, wrong constant
-; GCN: v_lshl_b64
-; GCN-NOT: v_alignbit_b32
-
-define amdgpu_kernel void @alignbit_shl_pat_wrong_and30(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
-bb:
-  %tmp = load i64, i64 addrspace(1)* %arg, align 8
-  %tmp3 = and i32 %arg2, 30
-  %tmp4 = zext i32 %tmp3 to i64
-  %tmp5 = shl i64 %tmp, %tmp4
-  %tmp6 = trunc i64 %tmp5 to i32
-  store i32 %tmp6, i32 addrspace(1)* %arg1, align 4
-  ret void
-}
-
 ; GCN-LABEL: {{^}}alignbit_shr_pat_wrong_and63:
 ; Negative test, wrong constant
 ; GCN: v_lshr_b64
@@ -122,21 +68,6 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}alignbit_shl_pat_wrong_and63:
-; Negative test, wrong constant
-; GCN: v_lshl_b64
-; GCN-NOT: v_alignbit_b32
-
-define amdgpu_kernel void @alignbit_shl_pat_wrong_and63(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
-bb:
-  %tmp = load i64, i64 addrspace(1)* %arg, align 8
-  %tmp3 = and i32 %arg2, 63
-  %tmp4 = zext i32 %tmp3 to i64
-  %tmp5 = shl i64 %tmp, %tmp4
-  %tmp6 = trunc i64 %tmp5 to i32
-  store i32 %tmp6, i32 addrspace(1)* %arg1, align 4
-  ret void
-}
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 attributes #0 = { nounwind readnone speculatable }

Index: llvm/trunk/test/CodeGen/AMDGPU/shift-i64-opts.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/shift-i64-opts.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/shift-i64-opts.ll
@@ -243,3 +243,77 @@
   store volatile i64 %shl, i64 addrspace(1)* %in
   ret void
 }
+
+; GCN-LABEL: {{^}}trunc_shl_and31:
+; GCN: s_and_b32 s[[AMT:[0-9]+]], s{{[0-9]+}}, 31
+; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, s[[AMT]], v{{[0-9]+}}
+; GCN-NOT: v_lshl_b64
+; GCN-NOT: v_lshlrev_b64
+define amdgpu_kernel void @trunc_shl_and31(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
+bb:
+  %tmp = load i64, i64 addrspace(1)* %arg, align 8
+  %tmp3 = and i32 %arg2, 31
+  %tmp4 = zext i32 %tmp3 to i64
+  %tmp5 = shl i64 %tmp, %tmp4
+  %tmp6 = trunc i64 %tmp5 to i32
+  store i32 %tmp6, i32 addrspace(1)* %arg1, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_and30:
+; GCN: s_and_b32 s[[AMT:[0-9]+]], s{{[0-9]+}}, 30
+; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, s[[AMT]], v{{[0-9]+}}
+; GCN-NOT: v_lshl_b64
+; GCN-NOT: v_lshlrev_b64
+define amdgpu_kernel void @trunc_shl_and30(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
+bb:
+  %tmp = load i64, i64 addrspace(1)* %arg, align 8
+  %tmp3 = and i32 %arg2, 30
+  %tmp4 = zext i32 %tmp3 to i64
+  %tmp5 = shl i64 %tmp, %tmp4
+  %tmp6 = trunc i64 %tmp5 to i32
+  store i32 %tmp6, i32 addrspace(1)* %arg1, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_wrong_and63:
+; Negative test, wrong constant
+; GCN: v_lshl_b64
+define amdgpu_kernel void @trunc_shl_wrong_and63(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
+bb:
+  %tmp = load i64, i64 addrspace(1)* %arg, align 8
+  %tmp3 = and i32 %arg2, 63
+  %tmp4 = zext i32 %tmp3 to i64
+  %tmp5 = shl i64 %tmp, %tmp4
+  %tmp6 = trunc i64 %tmp5 to i32
+  store i32 %tmp6, i32 addrspace(1)* %arg1, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_no_and:
+; Negative test, shift can be full 64 bit
+; GCN: v_lshl_b64
+define amdgpu_kernel void @trunc_shl_no_and(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
+bb:
+  %tmp = load i64, i64 addrspace(1)* %arg, align 8
+  %tmp4 = zext i32 %arg2 to i64
+  %tmp5 = shl i64 %tmp, %tmp4
+  %tmp6 = trunc i64 %tmp5 to i32
+  store i32 %tmp6, i32 addrspace(1)* %arg1, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_vec_vec:
+; GCN-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}
+; GCN-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 4, v{{[0-9]+}}
+; GCN-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
+; GCN-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 6, v{{[0-9]+}}
+; GCN-NOT: v_lshl_b64
+; GCN-NOT: v_lshlrev_b64
+define amdgpu_kernel void @trunc_shl_vec_vec(<4 x i64> addrspace(1)* %arg) {
+bb:
+  %v = load <4 x i64>, <4 x i64> addrspace(1)* %arg, align 32
+  %shl = shl <4 x i64> %v, <i64 35, i64 36, i64 37, i64 38>
+  store <4 x i64> %shl, <4 x i64> addrspace(1)* %arg, align 32
+  ret void
+}
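
Note (not part of the patch): the rewritten DAGCombiner check generalizes the old constant-only fold trunc (shl x, K) -> shl (trunc x), K to any shift amount whose known bits prove it is below the narrow type's bit width. The condition Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size) bounds the amount's active bits by Log2_32(Size), i.e. at most 5 bits and therefore at most 31 for the i64-to-i32 cases in the tests above. A minimal standalone C++ sketch of the underlying equivalence, assuming the i64 source / i32 result shapes used in the tests:

#include <cassert>
#include <cstdint>

// Original pattern: shift in 64 bits, then truncate the result to 32 bits.
static uint32_t trunc_of_shl(uint64_t X, uint32_t Amt) {
  return static_cast<uint32_t>(X << Amt);
}

// Combined pattern: truncate first, then shift in 32 bits.
static uint32_t shl_of_trunc(uint64_t X, uint32_t Amt) {
  return static_cast<uint32_t>(X) << Amt;
}

int main() {
  const uint64_t X = 0x123456789abcdef0ull;
  // The two forms agree for every amount below 32, which is exactly the
  // range that the known-bits condition (or the 'and i32 %arg2, 31' in the
  // trunc_shl_and31 test) guarantees.
  for (uint32_t Amt = 0; Amt < 32; ++Amt)
    assert(trunc_of_shl(X, Amt) == shl_of_trunc(X, Amt));
  return 0;
}

Without such a bound (as in trunc_shl_no_and above), an amount of 32 or more would shift the low half entirely out of the wide value while the narrow 32-bit shift would be undefined, so the combine must not fire; that is what the negative tests check.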