This is an archive of the discontinued LLVM Phabricator instance.

[X86] Prefer rotate by 1 over rotate by imm
ClosedPublic

Authored by zvi on Oct 8 2016, 12:08 PM.

Download Raw Diff

Details

Reviewers

spatel
RKSimon
delena
craig.topper
igorb

Commits

rG2a21f125bd07: [X86] Prefer rotate by 1 over rotate by imm
rL283758: [X86] Prefer rotate by 1 over rotate by imm

Summary

Rotate by 1 is translated to 1 micro-op, while rotate with imm8 is translated to 2 micro-ops.

Fixes pr30644.

Diff Detail

Repository: rL LLVM

Event Timeline

zvi updated this revision to Diff 74042.Oct 8 2016, 12:08 PM

zvi retitled this revision from to [X86] Prefer rotate by 1 over rotate by imm.

zvi updated this object.

zvi added reviewers: delena, igorb, craig.topper, spatel, RKSimon.

zvi set the repository for this revision to rL LLVM.

craig.topper added inline comments.Oct 8 2016, 12:33 PM

lib/Target/X86/X86InstrShiftRotate.td
612	Does rotr by 1 never get emitted or should we really have two patterns here?

zvi added inline comments.Oct 8 2016, 12:40 PM

lib/Target/X86/X86InstrShiftRotate.td
612	Note that for 8-bit operands, rotate right by 1 is the same as rotate left by 7. It seems that the canonical form prefers the latter while the target prefers the former. Does this answer your question, Craig?

RKSimon added inline comments.Oct 8 2016, 12:43 PM

test/CodeGen/X86/rotate.ll
2	We have a 64-bit triple and a 32-bit arch - this needs fixing and if possible test on both 32 and 64 bit targets. If possible add i64 tests as well?

zvi added inline comments.Oct 8 2016, 12:46 PM

test/CodeGen/X86/rotate.ll
2	Good catch! Will add the i64 tests.

Rebased on top of trunk after r283695 was committed.

zvi marked 2 inline comments as done.Oct 9 2016, 6:29 AM

zvi added inline comments.

test/CodeGen/X86/rotate.ll
2	Fixed in r283695

Please can you confirm that the _lrotl/_lrotr intrinsics (I think they are only on clang-cl) can't create the removed rotr #1 pattern

In D25399#565597, @RKSimon wrote:

Please can you confirm that the _lrotl/_lrotr intrinsics (I think they are only on clang-cl) can't create the removed rotr #1 pattern

Seems like for 64-bit operand intrinsics we are not matching any rotate instructions:

unsigned long ltestl(unsigned long _Value) {
   return _lrotl(_Value, 1);
}
      leaq    (%rdi,%rdi), %rax
      shrq    $31, %rdi
      orq     %rdi, %rax
      retq


unsigned long ltestr(unsigned long _Value) {
  return _lrotr(_Value, 1);
}
      movq    %rdi, %rax
      shrq    %rax
      shlq    $31, %rdi
      orq     %rax, %rdi
      movq    %rdi, %rax
      retq

But the 32-bit operand flavors are matching rotate instructions (these are without this patch):

unsigned long testl(unsigned long _Value) {
  return _rotl(_Value, 1);
}
      roll    %edi
      movq    %rdi, %rax
      retq


unsigned long testr(unsigned long _Value) {
  return _rotr(_Value, 1);
}
      roll    $31, %edi
      movq    %rdi, %rax
      retq

The 64-bit operand issue seems to be out of scope for this patch, since the matching to ISD::ROTR/ROTL nodes happens in the target-independent phase of DagCombine. I will open a Bugzilla for this and will investigate it further.

Actually, the relevant 64-bit operand intrinsics are _rotl64 and _rotr64 for which the generated code without this patch is:

test_rotl64(unsigned long):                       # @test_rotl64(unsigned long)
      rolq    %rdi
      movq    %rdi, %rax
      retq

test_rotr64(unsigned long):                       # @test_rotr64(unsigned long)
      rolq    $63, %rdi
      movq    %rdi, %rax
      retq

Thanks for checking, LGTM

This revision is now accepted and ready to land.Oct 10 2016, 7:44 AM

Committed in rL283758.

Revision Contents

Path

Size

lib/

Target/

X86/

X86InstrShiftRotate.td

8 lines

test/

CodeGen/

X86/

rotate.ll

6 lines

Diff 74042

lib/Target/X86/X86InstrShiftRotate.td

Show First 20 Lines • Show All 603 Lines • ▼ Show 20 Lines	def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst),
(ins GR64:$src1, u8imm:$src2),		(ins GR64:$src1, u8imm:$src2),
"ror{q}\t{$src2, $dst\|$dst, $src2}",		"ror{q}\t{$src2, $dst\|$dst, $src2}",
[(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))],		[(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))],
IIC_SR>;		IIC_SR>;

// Rotate by 1		// Rotate by 1
def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),		def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
"ror{b}\t$dst",		"ror{b}\t$dst",
[(set GR8:$dst, (rotr GR8:$src1, (i8 1)))],		[(set GR8:$dst, (rotl GR8:$src1, (i8 7)))],
craig.topperUnsubmitted Not Done Reply Inline Actions Does rotr by 1 never get emitted or should we really have two patterns here? craig.topper: Does rotr by 1 never get emitted or should we really have two patterns here?
zviAuthorUnsubmitted Not Done Reply Inline Actions Note that for 8-bit operands: rotate right by 1 is same as rotate left by 7. It seems that the canonical form prefers the latter while the target prefers the former. Does this answer your question, Craig? zvi: Note that for 8-bit operands: rotate right by 1 is same as rotate left by 7. It seems that the…
IIC_SR>;		IIC_SR>;
def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1),		def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
"ror{w}\t$dst",		"ror{w}\t$dst",
[(set GR16:$dst, (rotr GR16:$src1, (i8 1)))],		[(set GR16:$dst, (rotl GR16:$src1, (i8 15)))],
IIC_SR>, OpSize16;		IIC_SR>, OpSize16;
def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1),		def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
"ror{l}\t$dst",		"ror{l}\t$dst",
[(set GR32:$dst, (rotr GR32:$src1, (i8 1)))],		[(set GR32:$dst, (rotl GR32:$src1, (i8 31)))],
IIC_SR>, OpSize32;		IIC_SR>, OpSize32;
def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),		def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
"ror{q}\t$dst",		"ror{q}\t$dst",
[(set GR64:$dst, (rotr GR64:$src1, (i8 1)))],		[(set GR64:$dst, (rotl GR64:$src1, (i8 63)))],
IIC_SR>;		IIC_SR>;
} // Constraints = "$src = $dst", SchedRW		} // Constraints = "$src = $dst", SchedRW

let SchedRW = [WriteShiftLd, WriteRMW] in {		let SchedRW = [WriteShiftLd, WriteRMW] in {
let Uses = [CL] in {		let Uses = [CL] in {
def ROR8mCL : I<0xD2, MRM1m, (outs), (ins i8mem :$dst),		def ROR8mCL : I<0xD2, MRM1m, (outs), (ins i8mem :$dst),
"ror{b}\t{%cl, $dst\|$dst, cl}",		"ror{b}\t{%cl, $dst\|$dst, cl}",
[(store (rotr (loadi8 addr:$dst), CL), addr:$dst)],		[(store (rotr (loadi8 addr:$dst), CL), addr:$dst)],
▲ Show 20 Lines • Show All 337 Lines • Show Last 20 Lines

test/CodeGen/X86/rotate.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux -march=x86 \| FileCheck %s		; RUN: llc < %s -mtriple=x86_64-unknown-linux -march=x86 \| FileCheck %s
		RKSimonUnsubmitted Done Reply Inline Actions We have a 64-bit triple and a 32-bit arch - this needs fixing and if possible test on both 32 and 64 bit targets. If possible add i64 tests as well? RKSimon: We have a 64-bit triple and a 32-bit arch - this needs fixing and if possible test on both 32…
		zviAuthorUnsubmitted Done Reply Inline Actions Good catch! Will add the i64 tests. zvi: Good catch! Will add the i64 tests.
		zviAuthorUnsubmitted Not Done Reply Inline Actions Fixed in r283695 zvi: Fixed in r283695

define i32 @rotl32(i32 %A, i8 %Amt) {		define i32 @rotl32(i32 %A, i8 %Amt) {
; CHECK-LABEL: rotl32:		; CHECK-LABEL: rotl32:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl		; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax		; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: roll %cl, %eax		; CHECK-NEXT: roll %cl, %eax
; CHECK-NEXT: retl		; CHECK-NEXT: retl
▲ Show 20 Lines • Show All 57 Lines • ▼ Show 20 Lines	; CHECK-NEXT: retl
%D = or i32 %B, %C ; <i32> [#uses=1]		%D = or i32 %B, %C ; <i32> [#uses=1]
ret i32 %D		ret i32 %D
}		}

define i32 @rotr1_32(i32 %A) {		define i32 @rotr1_32(i32 %A) {
; CHECK-LABEL: rotr1_32:		; CHECK-LABEL: rotr1_32:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax		; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: roll $31, %eax		; CHECK-NEXT: rorl %eax
; CHECK-NEXT: retl		; CHECK-NEXT: retl
%B = shl i32 %A, 31 ; <i32> [#uses=1]		%B = shl i32 %A, 31 ; <i32> [#uses=1]
%C = lshr i32 %A, 1 ; <i32> [#uses=1]		%C = lshr i32 %A, 1 ; <i32> [#uses=1]
%D = or i32 %B, %C ; <i32> [#uses=1]		%D = or i32 %B, %C ; <i32> [#uses=1]
ret i32 %D		ret i32 %D
}		}

define i16 @rotl16(i16 %A, i8 %Amt) {		define i16 @rotl16(i16 %A, i8 %Amt) {
▲ Show 20 Lines • Show All 63 Lines • ▼ Show 20 Lines	; CHECK-NEXT: retl
%D = or i16 %B, %C ; <i16> [#uses=1]		%D = or i16 %B, %C ; <i16> [#uses=1]
ret i16 %D		ret i16 %D
}		}

define i16 @rotr1_16(i16 %A) {		define i16 @rotr1_16(i16 %A) {
; CHECK-LABEL: rotr1_16:		; CHECK-LABEL: rotr1_16:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax		; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: rolw $15, %ax		; CHECK-NEXT: rorw %ax
; CHECK-NEXT: retl		; CHECK-NEXT: retl
%B = lshr i16 %A, 1 ; <i16> [#uses=1]		%B = lshr i16 %A, 1 ; <i16> [#uses=1]
%C = shl i16 %A, 15 ; <i16> [#uses=1]		%C = shl i16 %A, 15 ; <i16> [#uses=1]
%D = or i16 %B, %C ; <i16> [#uses=1]		%D = or i16 %B, %C ; <i16> [#uses=1]
ret i16 %D		ret i16 %D
}		}

define i8 @rotl8(i8 %A, i8 %Amt) {		define i8 @rotl8(i8 %A, i8 %Amt) {
▲ Show 20 Lines • Show All 59 Lines • ▼ Show 20 Lines	; CHECK-NEXT: retl
%D = or i8 %B, %C ; <i8> [#uses=1]		%D = or i8 %B, %C ; <i8> [#uses=1]
ret i8 %D		ret i8 %D
}		}

define i8 @rotr1_8(i8 %A) {		define i8 @rotr1_8(i8 %A) {
; CHECK-LABEL: rotr1_8:		; CHECK-LABEL: rotr1_8:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al		; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al
; CHECK-NEXT: rolb $7, %al		; CHECK-NEXT: rorb %al
; CHECK-NEXT: retl		; CHECK-NEXT: retl
%B = lshr i8 %A, 1 ; <i8> [#uses=1]		%B = lshr i8 %A, 1 ; <i8> [#uses=1]
%C = shl i8 %A, 7 ; <i8> [#uses=1]		%C = shl i8 %A, 7 ; <i8> [#uses=1]
%D = or i8 %B, %C ; <i8> [#uses=1]		%D = or i8 %B, %C ; <i8> [#uses=1]
ret i8 %D		ret i8 %D
}		}