Index: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2857,11 +2857,6 @@
          "the size of the current value type");
   EVT ShiftTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout());
 
-  SDValue ResultLL = Result[0];
-  SDValue ResultLH = Result[1];
-  SDValue ResultHL = Result[2];
-  SDValue ResultHH = Result[3];
-
   // After getting the multiplication result in 4 parts, we need to perform a
   // shift right by the amount of the scale to get the result in that scale.
   //
@@ -2876,50 +2871,22 @@
   //
   //                             |NVTSize-|
   //
-  // The resulting Lo and Hi will only need to be one of these 32-bit parts
-  // after shifting.
-  if (Scale < NVTSize) {
-    // If the scale is less than the size of the VT we expand to, the Hi and
-    // Lo of the result will be in the first 2 parts of the result after
-    // shifting right. This only requires shifting by the scale as far as the
-    // third part in the result (ResultHL).
-    SDValue SRLAmnt = DAG.getConstant(Scale, dl, ShiftTy);
-    SDValue SHLAmnt = DAG.getConstant(NVTSize - Scale, dl, ShiftTy);
-    Lo = DAG.getNode(ISD::SRL, dl, NVT, ResultLL, SRLAmnt);
-    Lo = DAG.getNode(ISD::OR, dl, NVT, Lo,
-                     DAG.getNode(ISD::SHL, dl, NVT, ResultLH, SHLAmnt));
-    Hi = DAG.getNode(ISD::SRL, dl, NVT, ResultLH, SRLAmnt);
-    Hi = DAG.getNode(ISD::OR, dl, NVT, Hi,
-                     DAG.getNode(ISD::SHL, dl, NVT, ResultHL, SHLAmnt));
-  } else if (Scale == NVTSize) {
-    // If the scales are equal, Lo and Hi are ResultLH and ResultHL,
-    // respectively. Avoid shifting to prevent undefined behavior.
-    Lo = ResultLH;
-    Hi = ResultHL;
-  } else if (Scale < VTSize) {
-    // If the scale is instead less than the old VT size, but greater than or
-    // equal to the expanded VT size, the first part of the result (ResultLL) is
-    // no longer a part of Lo because it would be scaled out anyway. Instead we
-    // can start shifting right from the fourth part (ResultHH) to the second
-    // part (ResultLH), and ResultLH will be the new Lo.
-    SDValue SRLAmnt = DAG.getConstant(Scale - NVTSize, dl, ShiftTy);
-    SDValue SHLAmnt = DAG.getConstant(VTSize - Scale, dl, ShiftTy);
-    Lo = DAG.getNode(ISD::SRL, dl, NVT, ResultLH, SRLAmnt);
-    Lo = DAG.getNode(ISD::OR, dl, NVT, Lo,
-                     DAG.getNode(ISD::SHL, dl, NVT, ResultHL, SHLAmnt));
-    Hi = DAG.getNode(ISD::SRL, dl, NVT, ResultHL, SRLAmnt);
-    Hi = DAG.getNode(ISD::OR, dl, NVT, Hi,
-                     DAG.getNode(ISD::SHL, dl, NVT, ResultHH, SHLAmnt));
-  } else if (Scale == VTSize) {
-    assert(
-        !Signed &&
-        "Only unsigned types can have a scale equal to the operand bit width");
-
-    Lo = ResultHL;
-    Hi = ResultHH;
-  } else
-    llvm_unreachable("Expected the scale to be less than or equal to the width "
-                     "of the operands");
+  // The resulting Lo and Hi would normally be in LL and LH after the shift.
+  // But to avoid unnecessary shifting of all 4 parts, we can adjust the shift
+  // amount and get Lo and Hi using two funnel shifts. Or, for the special case
+  // when Scale is a multiple of NVTSize, we can just pick the result without
+  // shifting.
+  uint64_t Part0 = Scale / NVTSize; // Part holding lowest bit needed.
+  if (Scale % NVTSize) {
+    SDValue ShiftAmount = DAG.getConstant(Scale % NVTSize, dl, ShiftTy);
+    Lo = DAG.getNode(ISD::FSHR, dl, NVT, Result[Part0 + 1], Result[Part0],
+                     ShiftAmount);
+    Hi = DAG.getNode(ISD::FSHR, dl, NVT, Result[Part0 + 2], Result[Part0 + 1],
+                     ShiftAmount);
+  } else {
+    Lo = Result[Part0];
+    Hi = Result[Part0 + 1];
+  }
 
   // Unless saturation is requested we are done. The result is in <Hi,Lo>.
   if (!Saturating)
@@ -2934,6 +2901,9 @@
   // highest bit of HH determines saturation direction in the event of
   // saturation.
 
+  SDValue ResultHL = Result[2];
+  SDValue ResultHH = Result[3];
+
   SDValue SatMax, SatMin;
   SDValue NVTZero = DAG.getConstant(0, dl, NVT);
   SDValue NVTNeg1 = DAG.getConstant(-1, dl, NVT);
Index: llvm/trunk/test/CodeGen/RISCV/addcarry.ll
===================================================================
--- llvm/trunk/test/CodeGen/RISCV/addcarry.ll
+++ llvm/trunk/test/CodeGen/RISCV/addcarry.ll
@@ -34,10 +34,10 @@
 ; RISCV32-NEXT:    mul a0, a0, a2
 ; RISCV32-NEXT:    srli a0, a0, 2
 ; RISCV32-NEXT:    slli a1, a6, 30
-; RISCV32-NEXT:    or a0, a0, a1
+; RISCV32-NEXT:    or a0, a1, a0
 ; RISCV32-NEXT:    srli a1, a6, 2
 ; RISCV32-NEXT:    slli a2, a5, 30
-; RISCV32-NEXT:    or a1, a1, a2
+; RISCV32-NEXT:    or a1, a2, a1
 ; RISCV32-NEXT:    ret
   %tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 2);
   ret i64 %tmp;
Index: llvm/trunk/test/CodeGen/X86/smul_fix.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/smul_fix.ll
+++ llvm/trunk/test/CodeGen/X86/smul_fix.ll
@@ -374,26 +374,25 @@
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    addl %edx, %ebx
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    movl %ebp, %esi
 ; X86-NEXT:    sbbl $0, %esi
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovnsl %ebx, %esi
+; X86-NEXT:    cmovnsl %ebp, %esi
 ; X86-NEXT:    cmovnsl %edx, %ecx
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
Index: llvm/trunk/test/CodeGen/X86/smul_fix_sat.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/smul_fix_sat.ll
+++ llvm/trunk/test/CodeGen/X86/smul_fix_sat.ll
@@ -60,7 +60,6 @@
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %ecx, %eax
@@ -69,64 +68,61 @@
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    imull %esi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    sbbl $0, %edi
-; X86-NEXT:    testl %ebx, %ebx
-; X86-NEXT:    cmovnsl %ecx, %edi
-; X86-NEXT:    cmovnsl %edx, %esi
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    addl %ebx, %edx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    sbbl $0, %ebx
+; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovnsl %esi, %ebx
+; X86-NEXT:    cmovnsl %edx, %edi
 ; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    sbbl $0, %ebp
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    sbbl $0, %esi
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovnsl %ebx, %esi
 ; X86-NEXT:    cmovnsl %edi, %ebp
-; X86-NEXT:    cmovnsl %esi, %ecx
-; X86-NEXT:    testl %ebp, %ebp
-; X86-NEXT:    setg %bh
-; X86-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    cmpl $1, %ecx
-; X86-NEXT:    seta %bl
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    shldl $30, %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl $30, %esi, %eax
-; X86-NEXT:    andb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
-; X86-NEXT:    orb %bh, %bl
-; X86-NEXT:    testb %bl, %bl
-; X86-NEXT:    movl $2147483647, %esi # imm = 0x7FFFFFFF
-; X86-NEXT:    cmovnel %esi, %edx
-; X86-NEXT:    movl $-1, %esi
-; X86-NEXT:    cmovnel %esi, %eax
-; X86-NEXT:    cmpl $-1, %ebp
-; X86-NEXT:    setl %bl
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    setg %bl
 ; X86-NEXT:    sete %bh
-; X86-NEXT:    cmpl $-2, %ecx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    andb %bh, %cl
-; X86-NEXT:    xorl %esi, %esi
-; X86-NEXT:    orb %bl, %cl
-; X86-NEXT:    cmovnel %esi, %eax
-; X86-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
-; X86-NEXT:    cmovnel %ecx, %edx
-; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    cmpl $1, %ebp
+; X86-NEXT:    seta %dl
+; X86-NEXT:    andb %bh, %dl
+; X86-NEXT:    orb %bl, %dl
+; X86-NEXT:    shrdl $2, %eax, %ecx
+; X86-NEXT:    shrdl $2, %ebp, %eax
+; X86-NEXT:    testb %dl, %dl
+; X86-NEXT:    movl $2147483647, %edi # imm = 0x7FFFFFFF
+; X86-NEXT:    cmovel %eax, %edi
+; X86-NEXT:    movl $-1, %eax
+; X86-NEXT:    cmovnel %eax, %ecx
+; X86-NEXT:    cmpl $-1, %esi
+; X86-NEXT:    setl %al
+; X86-NEXT:    sete %dl
+; X86-NEXT:    cmpl $-2, %ebp
+; X86-NEXT:    setb %ah
+; X86-NEXT:    andb %dl, %ah
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    orb %al, %ah
+; X86-NEXT:    cmovnel %edx, %ecx
+; X86-NEXT:    movl $-2147483648, %edx # imm = 0x80000000
+; X86-NEXT:    cmovel %edi, %edx
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -688,44 +684,42 @@
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    addl %edx, %ebx
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    movl %ebp, %esi
 ; X86-NEXT:    sbbl $0, %esi
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovnsl %ebx, %esi
+; X86-NEXT:    cmovnsl %ebp, %esi
 ; X86-NEXT:    cmovnsl %edx, %ecx
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    sbbl $0, %ebx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl $0, %edi
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovnsl %esi, %ebx
-; X86-NEXT:    cmovnsl %ecx, %edi
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    shldl $1, %edi, %edx
-; X86-NEXT:    shrdl $31, %edi, %eax
-; X86-NEXT:    cmpl $1073741823, %ebx # imm = 0x3FFFFFFF
+; X86-NEXT:    cmovnsl %esi, %edi
+; X86-NEXT:    cmovnsl %ecx, %edx
+; X86-NEXT:    shrdl $31, %edx, %eax
+; X86-NEXT:    shrdl $31, %edi, %edx
+; X86-NEXT:    cmpl $1073741823, %edi # imm = 0x3FFFFFFF
 ; X86-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
 ; X86-NEXT:    cmovgl %ecx, %edx
 ; X86-NEXT:    movl $-1, %ecx
 ; X86-NEXT:    cmovgl %ecx, %eax
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    cmpl $-1073741824, %ebx # imm = 0xC0000000
+; X86-NEXT:    cmpl $-1073741824, %edi # imm = 0xC0000000
 ; X86-NEXT:    cmovll %ecx, %eax
 ; X86-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
 ; X86-NEXT:    cmovll %ecx, %edx
Index: llvm/trunk/test/CodeGen/X86/umul_fix.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/umul_fix.ll
+++ llvm/trunk/test/CodeGen/X86/umul_fix.ll
@@ -60,9 +60,10 @@
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    adcl %edi, %edx
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    shldl $30, %eax, %edx
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    shldl $30, %eax, %esi
 ; X86-NEXT:    shldl $30, %ecx, %eax
+; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -318,23 +319,22 @@
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    addl %edx, %ebx
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    addl %ebp, %edx
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    shldl $1, %edx, %ebx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    shldl $1, %edx, %ecx
 ; X86-NEXT:    shrdl $31, %edx, %eax
-; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
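
For reference, ISD::FSHR concatenates its first operand (the high part) with its second (the low part) and shifts the double-width value right by the given amount, keeping the low part. Below is a minimal standalone C++ model of the extraction logic in the patch, assuming NVT is i32 so that the 128-bit product is held in four 32-bit parts with Result[0] least significant; the helper names fshr32 and extractScaledResult are hypothetical names for this sketch, not LLVM APIs.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Model of ISD::FSHR on 32-bit parts (hypothetical helper): treat Hi:Lo as
// one 64-bit value, shift it right by Amt, and keep the low 32 bits.
static uint32_t fshr32(uint32_t Hi, uint32_t Lo, unsigned Amt) {
  assert(Amt > 0 && Amt < 32 && "degenerate amounts take the no-shift path");
  return (Hi << (32 - Amt)) | (Lo >> Amt);
}

// Mirror of the extraction in the patch: Part0 is the part holding the
// lowest bit of the scaled result; two funnel shifts produce Lo and Hi, or
// the parts are picked directly when Scale is a multiple of the part width.
static void extractScaledResult(const uint32_t Result[4], unsigned Scale,
                                uint32_t &Lo, uint32_t &Hi) {
  assert(Scale <= 64 && "scale cannot exceed the operand bit width");
  unsigned Part0 = Scale / 32; // Part holding lowest bit needed.
  if (unsigned Amt = Scale % 32) {
    Lo = fshr32(Result[Part0 + 1], Result[Part0], Amt);
    Hi = fshr32(Result[Part0 + 2], Result[Part0 + 1], Amt);
  } else {
    Lo = Result[Part0];
    Hi = Result[Part0 + 1];
  }
}

int main() {
  // As in the RISCV test above: llvm.smul.fix.i64 with scale 2, 6 * 8.
  // The raw 128-bit product 48 is stored little endian across the parts.
  uint32_t Result[4] = {48, 0, 0, 0};
  uint32_t Lo, Hi;
  extractScaledResult(Result, 2, Lo, Hi);
  printf("Hi:Lo = 0x%08x:0x%08x\n", Hi, Lo); // 0x00000000:0x0000000c, i.e. 12
  return 0;
}

When Scale equals the full operand width (legal only for unsigned multiplication, per the assertion the old code carried), Scale % NVTSize is 0 and the model picks Result[2] and Result[3] directly, matching the removed Scale == VTSize branch. Collapsing the four-way case analysis into one funnel-shift path is also what the updated tests reflect: x86 lowers the FSHR nodes to the shrdl $2 and shrdl $31 pairs seen in the new checks.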