This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
lib/Target/X86/
-
Target/
-
X86/
4/4
X86ISelDAGToDAG.cpp
-
test/CodeGen/X86/
-
CodeGen/
-
X86/
2/2
extract-bits.ll

Differential D56052

X86DAGToDAGISel::matchBitExtract() with truncation (PR36419)
ClosedPublic

Authored by lebedev.ri on Dec 22 2018, 10:28 AM.

Download Raw Diff

Details

Reviewers

RKSimon
craig.topper
andreadb
spatel

Commits

rGfb4eed381d2d: X86DAGToDAGISel::matchBitExtract() with truncation (PR36419)
rL351253: X86DAGToDAGISel::matchBitExtract() with truncation (PR36419)

Summary

Previously in D54095 i have added support for extraction of lshr from X if we are to produce BEXTR.
That was good, but the fix was partial, there was still PR36419.

That pattern can also appear, roughly, when you have a large (64-bit) storage, and then consume bits from it.
It will not be unexpected if you will be doing further computations in 32-bit width.
And then the current code breaks, as the tests show.

The basic idea/pattern here is the following:

We have i64 input
We perform i64 right-shift on it.
We truncate that shifted value
We do all further work (masking) in i32

Since we see truncation and not lshr, we give up, and stop trying to extract that right-shift.
BUT. The mask is i32, therefore we can extend both of the operands of the masking (and) to i64
and truncate the result after masking: https://rise4fun.com/Alive/K4B

Name: @bextr64_32_b1 -> @bextr64_32_b0  
  %shiftedval = lshr i64 %val, %numskipbits
  %truncshiftedval = trunc i64 %shiftedval to i32
  %widenumlowbits1 = zext i8 %numlowbits to i32
  %notmask1 = shl nsw i32 -1, %widenumlowbits1
  %mask1 = xor i32 %notmask1, -1
  %res = and i32 %truncshiftedval, %mask1
=>
  %shiftedval = lshr i64 %val, %numskipbits
  %widenumlowbits = zext i8 %numlowbits to i64
  %notmask = shl nsw i64 -1, %widenumlowbits
  %mask = xor i64 %notmask, -1
  %wideres = and i64 %shiftedval, %mask
  %res = trunc i64 %wideres to i32

Thus, we are again able to extract that lshr into BEXTR's control.

Now, the perf (via llvm-exegesis) of the snippet suggests that it is not a good idea:

$ cat /tmp/old.s 
# bextr64_32_b1
# LLVM-EXEGESIS-LIVEIN RSI
# LLVM-EXEGESIS-LIVEIN EDX
# LLVM-EXEGESIS-LIVEIN RDI
movq %rsi, %rcx
shrq %cl, %rdi
shll $8, %edx
bextrl %edx, %edi, %eax
$ cat /tmp/old.s | ./bin/llvm-exegesis -mode=latency -snippets-file=-
Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-1e0082.o
---
mode:            latency
key:             
  instructions:    
    - 'MOV64rr RCX RSI'
    - 'SHR64rCL RDI RDI'
    - 'SHL32ri EDX EDX i_0x8'
    - 'BEXTR32rr EAX EDI EDX'
  config:          ''
  register_initial_values: []
cpu_name:        bdver2
llvm_triple:     x86_64-unknown-linux-gnu
num_repetitions: 10000
measurements:    
  - { key: latency, value: 0.6638, per_snippet_value: 2.6552 }
error:           ''
info:            ''
assembled_snippet: 4889F148D3EFC1E208C4E268F7C74889F148D3EFC1E208C4E268F7C74889F148D3EFC1E208C4E268F7C74889F148D3EFC1E208C4E268F7C7C3
...
$ cat /tmp/old.s | ./bin/llvm-exegesis -mode=uops -snippets-file=-
Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-43e346.o
---
mode:            uops
key:             
  instructions:    
    - 'MOV64rr RCX RSI'
    - 'SHR64rCL RDI RDI'
    - 'SHL32ri EDX EDX i_0x8'
    - 'BEXTR32rr EAX EDI EDX'
  config:          ''
  register_initial_values: []
cpu_name:        bdver2
llvm_triple:     x86_64-unknown-linux-gnu
num_repetitions: 10000
measurements:    
  - { key: PdFPU0, value: 0, per_snippet_value: 0 }
  - { key: PdFPU1, value: 0, per_snippet_value: 0 }
  - { key: PdFPU2, value: 0, per_snippet_value: 0 }
  - { key: PdFPU3, value: 0, per_snippet_value: 0 }
  - { key: NumMicroOps, value: 1.2571, per_snippet_value: 5.0284 }
error:           ''
info:            ''
assembled_snippet: 4889F148D3EFC1E208C4E268F7C74889F148D3EFC1E208C4E268F7C74889F148D3EFC1E208C4E268F7C74889F148D3EFC1E208C4E268F7C7C3
...

$ cat /tmp/new.s 
# bextr64_32_b1
# LLVM-EXEGESIS-LIVEIN RDX
# LLVM-EXEGESIS-LIVEIN SIL
# LLVM-EXEGESIS-LIVEIN RDI
shlq $8, %rdx
movzbl %sil, %eax
orq %rdx, %rax
bextrq %rax, %rdi, %rax
$ cat /tmp/new.s | ./bin/llvm-exegesis -mode=latency -snippets-file=-
Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-8944f1.o
---
mode:            latency
key:             
  instructions:    
    - 'SHL64ri RDX RDX i_0x8'
    - 'MOVZX32rr8 EAX SIL'
    - 'OR64rr RAX RAX RDX'
    - 'BEXTR64rr RAX RDI RAX'
  config:          ''
  register_initial_values: []
cpu_name:        bdver2
llvm_triple:     x86_64-unknown-linux-gnu
num_repetitions: 10000
measurements:    
  - { key: latency, value: 0.7454, per_snippet_value: 2.9816 }
error:           ''
info:            ''
assembled_snippet: 48C1E208400FB6C64809D0C4E2F8F7C748C1E208400FB6C64809D0C4E2F8F7C748C1E208400FB6C64809D0C4E2F8F7C748C1E208400FB6C64809D0C4E2F8F7C7C3
...
$ cat /tmp/new.s | ./bin/llvm-exegesis -mode=uops -snippets-file=-
Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-da403c.o
---
mode:            uops
key:             
  instructions:    
    - 'SHL64ri RDX RDX i_0x8'
    - 'MOVZX32rr8 EAX SIL'
    - 'OR64rr RAX RAX RDX'
    - 'BEXTR64rr RAX RDI RAX'
  config:          ''
  register_initial_values: []
cpu_name:        bdver2
llvm_triple:     x86_64-unknown-linux-gnu
num_repetitions: 10000
measurements:    
  - { key: PdFPU0, value: 0, per_snippet_value: 0 }
  - { key: PdFPU1, value: 0, per_snippet_value: 0 }
  - { key: PdFPU2, value: 0, per_snippet_value: 0 }
  - { key: PdFPU3, value: 0, per_snippet_value: 0 }
  - { key: NumMicroOps, value: 1.2571, per_snippet_value: 5.0284 }
error:           ''
info:            ''
assembled_snippet: 48C1E208400FB6C64809D0C4E2F8F7C748C1E208400FB6C64809D0C4E2F8F7C748C1E208400FB6C64809D0C4E2F8F7C748C1E208400FB6C64809D0C4E2F8F7C7C3
...

^ latency increased (worse).

Except maybe not really.
Like with all synthetic benchmarks, they may be misleading.

Let's take a look at an actual real-world hot path.
In this case it's 'my' RawSpeed's BitStream<>::peekBitsNoFill(), in GoPro VC5 decompressor:

raw.pixls.us-unique/GoPro/HERO6 Black$ /usr/src/googlebenchmark/tools/compare.py -a benchmarks ~/rawspeed/build-clangs1-{old,new}/src/utilities/rsbench/rsbench --benchmark_counters_tabular=true --benchmark_min_time=0.00000001 --benchmark_repetitions=128 GOPR9172.GPR 
RUNNING: /home/lebedevri/rawspeed/build-clangs1-old/src/utilities/rsbench/rsbench --benchmark_counters_tabular=true --benchmark_min_time=0.00000001 --benchmark_repetitions=128 GOPR9172.GPR --benchmark_display_aggregates_only=true --benchmark_out=/tmp/tmplwbKEM
2018-12-22 21:23:03
Running /home/lebedevri/rawspeed/build-clangs1-old/src/utilities/rsbench/rsbench
Run on (8 X 4012.81 MHz CPU s)
CPU Caches:
  L1 Data 16K (x8)
  L1 Instruction 64K (x4)
  L2 Unified 2048K (x4)
  L3 Unified 8192K (x1)
Load Average: 3.41, 2.41, 2.03
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Benchmark                                        Time           CPU Iterations  CPUTime,s CPUTime/WallTime     Pixels Pixels/CPUTime Pixels/WallTime Raws/CPUTime Raws/WallTime WallTime,s
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
GOPR9172.GPR/threads:8/real_time_mean           40 ms         40 ms        128   0.322244          7.96974        12M       37.4457M        298.534M      3.12047       24.8778   0.040465
GOPR9172.GPR/threads:8/real_time_median         39 ms         39 ms        128   0.312606          7.99155        12M        38.387M        306.788M      3.19891       25.5656   0.039115
GOPR9172.GPR/threads:8/real_time_stddev          4 ms          3 ms        128  0.0271557         0.130575          0        2.4941M        21.3909M     0.207842       1.78257   3.81081m
RUNNING: /home/lebedevri/rawspeed/build-clangs1-new/src/utilities/rsbench/rsbench --benchmark_counters_tabular=true --benchmark_min_time=0.00000001 --benchmark_repetitions=128 GOPR9172.GPR --benchmark_display_aggregates_only=true --benchmark_out=/tmp/tmpWAkan9
2018-12-22 21:23:08
Running /home/lebedevri/rawspeed/build-clangs1-new/src/utilities/rsbench/rsbench
Run on (8 X 4013.1 MHz CPU s)
CPU Caches:
  L1 Data 16K (x8)
  L1 Instruction 64K (x4)
  L2 Unified 2048K (x4)
  L3 Unified 8192K (x1)
Load Average: 3.78, 2.50, 2.06
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Benchmark                                        Time           CPU Iterations  CPUTime,s CPUTime/WallTime     Pixels Pixels/CPUTime Pixels/WallTime Raws/CPUTime Raws/WallTime WallTime,s
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
GOPR9172.GPR/threads:8/real_time_mean           39 ms         39 ms        128   0.311533          7.97323        12M       38.6828M        308.471M      3.22356        25.706  0.0390928
GOPR9172.GPR/threads:8/real_time_median         38 ms         38 ms        128   0.304231          7.99005        12M       39.4437M        315.527M      3.28698        26.294  0.0380316
GOPR9172.GPR/threads:8/real_time_stddev          3 ms          3 ms        128  0.0229149         0.133814          0       2.26225M        19.1421M     0.188521       1.59517   3.13671m
Comparing /home/lebedevri/rawspeed/build-clangs1-old/src/utilities/rsbench/rsbench to /home/lebedevri/rawspeed/build-clangs1-new/src/utilities/rsbench/rsbench
Benchmark                                                 Time             CPU      Time Old      Time New       CPU Old       CPU New
--------------------------------------------------------------------------------------------------------------------------------------
GOPR9172.GPR/threads:8/real_time_pvalue                 0.0000          0.0000      U Test, Repetitions: 128 vs 128
GOPR9172.GPR/threads:8/real_time_mean                  -0.0339         -0.0316            40            39            40            39
GOPR9172.GPR/threads:8/real_time_median                -0.0277         -0.0274            39            38            39            38
GOPR9172.GPR/threads:8/real_time_stddev                -0.1769         -0.1267             4             3             3             3

I.e. this results in a roughly 3% improvement in perf (run time is ~3% lower).

While this will help PR36419, it won't address it fully.

Diff Detail

Repository: rL LLVM

Event Timeline

lebedev.ri created this revision.Dec 22 2018, 10:28 AM

Herald added a subscriber: courbet. · View Herald TranscriptDec 22 2018, 10:28 AM

low-key ping

Hm, ping.
I'm guessing everyone is busy with funnel-related stuff, on the eve of the 8.0
Or would it be easier if i dropped all the TLDR in the description?

craig.topper added inline comments.Jan 14 2019, 2:12 PM

lib/Target/X86/X86ISelDAGToDAG.cpp
2822	Should we ensure the truncate input has only one use?

lebedev.ri marked 2 inline comments as done.Jan 14 2019, 2:23 PM

lebedev.ri added inline comments.

lib/Target/X86/X86ISelDAGToDAG.cpp
2822	Hm, since we only take that branch in BMI1's case, it would be consistent with the rest of the one-use checks, so i guess we should, yes. Will do.
test/CodeGen/X86/extract-bits.ll
1832	One more thing is that 'control' is being calculated as i64, while we only care about the low 32 bits. Will take a look, but not in this patch.

Diffusion mentioned this in rL351182: [NFC][X86] extract-bits.ll: add test with truncation with extra-use..Jan 15 2019, 2:40 AM

Rebased on top of the patch with the extra-use trunc.
Did not add the use check yet..

Only skip one-use truncations.

lebedev.ri mentioned this in D56715: X86DAGToDAGISel::matchBitExtract(): prepare 'control' in 32 bits.Jan 15 2019, 4:53 AM

lebedev.ri added a child revision: D56715: X86DAGToDAGISel::matchBitExtract(): prepare 'control' in 32 bits.

craig.topper added inline comments.Jan 15 2019, 11:31 AM

lib/Target/X86/X86ISelDAGToDAG.cpp
2824	Just use X.hasOneUse()? There's a special case in checkOneUse for hasBMI2, but that doesn't apply here. So let's just write the most direct thing

Address review feedback - use X.hasOneUse() directly.

lib/Target/X86/X86ISelDAGToDAG.cpp
2824	Right. I was going for consistency so one would not wonder why this would be using `hasOneUse()`, while others would be using those helper lambdas.

LGTM

This revision is now accepted and ready to land.Jan 15 2019, 11:45 AM

In D56052#1358320, @craig.topper wrote:

LGTM

Thank you for the review!

test/CodeGen/X86/extract-bits.ll
1832	Submitted as D56715.

Closed by commit rL351253: X86DAGToDAGISel::matchBitExtract() with truncation (PR36419) (authored by lebedevri). · Explain WhyJan 15 2019, 1:35 PM

This revision was automatically updated to reflect the committed changes.

Diffusion mentioned this in rL353073: [X86] X86DAGToDAGISel::matchBitExtract(): prepare 'control' in 32 bits.Feb 4 2019, 11:04 AM

lebedev.ri mentioned this in rGb7ecc9b6241b: [X86] X86DAGToDAGISel::matchBitExtract(): prepare 'control' in 32 bits.Feb 4 2019, 11:05 AM

Revision Contents

Path

Size

lib/

Target/

X86/

X86ISelDAGToDAG.cpp

36 lines

test/

CodeGen/

X86/

extract-bits.ll

77 lines

Diff 179429

lib/Target/X86/X86ISelDAGToDAG.cpp

Show First 20 Lines • Show All 2,810 Lines • ▼ Show 20 Lines	if (matchLowBitMask(Mask)) {
if (!matchLowBitMask(Mask))		if (!matchLowBitMask(Mask))
return false;		return false;
}		}
} else if (!matchPatternD(Node))		} else if (!matchPatternD(Node))
return false;		return false;

SDLoc DL(Node);		SDLoc DL(Node);

		// If we do NOT have BMI2, let's find out if the if the 'X' is logically
		// shifted (potentially with trunc inbetween), and if so look past truncation.
		MVT XVT = NVT;
		if (!Subtarget->hasBMI2() && X.getOpcode() == ISD::TRUNCATE &&
		craig.topperUnsubmitted Done Reply Inline Actions Should we ensure the truncate input has only one use? craig.topper: Should we ensure the truncate input has only one use?
		lebedev.riAuthorUnsubmitted Done Reply Inline Actions Hm, since we only take that branch in BMI1's case, it would be consistent with the rest of the one-use checks, so i guess we should, yes. Will do. lebedev.ri: Hm, since we only take that branch in BMI1's case, it would be consistent with the rest of the…
		X.getOperand(0).getOpcode() == ISD::SRL) {
		assert(NVT == MVT::i32 && "Expected target valuetype to be i32");
		craig.topperUnsubmitted Done Reply Inline Actions Just use X.hasOneUse()? There's a special case in checkOneUse for hasBMI2, but that doesnt' apply here. So let's just write the most direct thing craig.topper: Just use X.hasOneUse()? There's a special case in checkOneUse for hasBMI2, but that doesnt'…
		lebedev.riAuthorUnsubmitted Done Reply Inline Actions Right. I was going for consistency so one would not wonder why this would be using `hasOneUse()`, while others would be using those helper lambdas. lebedev.ri: Right. I was going for consistency so one would not wonder why this would be using `hasOneUse…
		X = X.getOperand(0);
		XVT = X.getSimpleValueType();
		assert(XVT == MVT::i64 && "Expected truncation from i64");
		}

SDValue OrigNBits = NBits;		SDValue OrigNBits = NBits;
if (NBits.getValueType() != NVT) {		if (NBits.getValueType() != XVT) {
// Truncate the shift amount.		// Truncate the shift amount.
NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);		NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
insertDAGNode(*CurDAG, OrigNBits, NBits);		insertDAGNode(*CurDAG, OrigNBits, NBits);

// Insert 8-bit NBits into lowest 8 bits of NVT-sized (32 or 64-bit)		// Insert 8-bit NBits into lowest 8 bits of XVT-sized (32 or 64-bit)
// register. All the other bits are undefined, we do not care about them.		// register. All the other bits are undefined, we do not care about them.
SDValue ImplDef =		SDValue ImplDef =
SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, NVT), 0);		SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, XVT), 0);
insertDAGNode(*CurDAG, OrigNBits, ImplDef);		insertDAGNode(*CurDAG, OrigNBits, ImplDef);
NBits =		NBits =
CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, NVT, ImplDef, NBits);		CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, XVT, ImplDef, NBits);
insertDAGNode(*CurDAG, OrigNBits, NBits);		insertDAGNode(*CurDAG, OrigNBits, NBits);
}		}

if (Subtarget->hasBMI2()) {		if (Subtarget->hasBMI2()) {
// Great, just emit the the BZHI..		// Great, just emit the the BZHI..
SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);		SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, XVT, X, NBits);
ReplaceNode(Node, Extract.getNode());		ReplaceNode(Node, Extract.getNode());
SelectCode(Extract.getNode());		SelectCode(Extract.getNode());
return true;		return true;
}		}

// Else, emitting BEXTR requires one more step.		// Else, emitting BEXTR requires one more step.
// The 'control' of BEXTR has the pattern of:		// The 'control' of BEXTR has the pattern of:
// [15...8 bit][ 7...0 bit] location		// [15...8 bit][ 7...0 bit] location
// [ bit count][ shift] name		// [ bit count][ shift] name
// I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11		// I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11

// Shift NBits left by 8 bits, thus producing 'control'.		// Shift NBits left by 8 bits, thus producing 'control'.
// This makes the low 8 bits to be zero.		// This makes the low 8 bits to be zero.
SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);		SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
SDValue Control = CurDAG->getNode(ISD::SHL, DL, NVT, NBits, C8);		SDValue Control = CurDAG->getNode(ISD::SHL, DL, XVT, NBits, C8);
insertDAGNode(*CurDAG, OrigNBits, Control);		insertDAGNode(*CurDAG, OrigNBits, Control);

// If the 'X' is logically shifted, we can fold that shift into 'control'.		// If the 'X' is logically shifted, we can fold that shift into 'control'.
if (X.getOpcode() == ISD::SRL) {		if (X.getOpcode() == ISD::SRL) {
SDValue ShiftAmt = X.getOperand(1);		SDValue ShiftAmt = X.getOperand(1);
X = X.getOperand(0);		X = X.getOperand(0);

assert(ShiftAmt.getValueType() == MVT::i8 &&		assert(ShiftAmt.getValueType() == MVT::i8 &&
"Expected shift amount to be i8");		"Expected shift amount to be i8");

// Now, zero-extend the shift amount. The bits 8...15 must be zero!		// Now, zero-extend the shift amount. The bits 8...15 must be zero!
SDValue OrigShiftAmt = ShiftAmt;		SDValue OrigShiftAmt = ShiftAmt;
ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, NVT, ShiftAmt);		ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, XVT, ShiftAmt);
insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);		insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);

// And now 'or' these low 8 bits of shift amount into the 'control'.		// And now 'or' these low 8 bits of shift amount into the 'control'.
Control = CurDAG->getNode(ISD::OR, DL, NVT, Control, ShiftAmt);		Control = CurDAG->getNode(ISD::OR, DL, XVT, Control, ShiftAmt);
insertDAGNode(*CurDAG, OrigNBits, Control);		insertDAGNode(*CurDAG, OrigNBits, Control);
}		}

// And finally, form the BEXTR itself.		// And finally, form the BEXTR itself.
SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, NVT, X, Control);		SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);

		// The 'X' was originally truncated. Do that now.
		if (XVT != NVT) {
		insertDAGNode(*CurDAG, OrigNBits, Extract);
		Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
		}

ReplaceNode(Node, Extract.getNode());		ReplaceNode(Node, Extract.getNode());
SelectCode(Extract.getNode());		SelectCode(Extract.getNode());

return true;		return true;
}		}

// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.		// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
MachineSDNode X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode Node) {		MachineSDNode X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode Node) {
▲ Show 20 Lines • Show All 1,001 Lines • Show Last 20 Lines

test/CodeGen/X86/extract-bits.ll

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 1,822 Lines • ▼ Show 20 Lines
	; X64-NOBMI-NEXT: movl %edx, %ecx			; X64-NOBMI-NEXT: movl %edx, %ecx
	; X64-NOBMI-NEXT: shll %cl, %eax			; X64-NOBMI-NEXT: shll %cl, %eax
	; X64-NOBMI-NEXT: decl %eax			; X64-NOBMI-NEXT: decl %eax
	; X64-NOBMI-NEXT: andl %edi, %eax			; X64-NOBMI-NEXT: andl %edi, %eax
	; X64-NOBMI-NEXT: retq			; X64-NOBMI-NEXT: retq
	;			;
	; X64-BMI1NOTBM-LABEL: bextr64_32_a1:			; X64-BMI1NOTBM-LABEL: bextr64_32_a1:
	; X64-BMI1NOTBM: # %bb.0:			; X64-BMI1NOTBM: # %bb.0:
	; X64-BMI1NOTBM-NEXT: movq %rsi, %rcx			; X64-BMI1NOTBM-NEXT: # kill: def $edx killed $edx def $rdx
	; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx			; X64-BMI1NOTBM-NEXT: shlq $8, %rdx
				lebedev.riAuthorUnsubmitted Done Reply Inline Actions One more thing is that 'control' being calculated as i64, while we only care about low 32 bits. Will take a look, but not in this patch. lebedev.ri: One more thing is that 'control' being calculated as i64, while we only care about low 32 bits.
				lebedev.riAuthorUnsubmitted Done Reply Inline Actions Submitted as D56715. lebedev.ri: Submitted as D56715.
	; X64-BMI1NOTBM-NEXT: shrq %cl, %rdi			; X64-BMI1NOTBM-NEXT: movzbl %sil, %eax
	; X64-BMI1NOTBM-NEXT: shll $8, %edx			; X64-BMI1NOTBM-NEXT: orq %rdx, %rax
	; X64-BMI1NOTBM-NEXT: bextrl %edx, %edi, %eax			; X64-BMI1NOTBM-NEXT: bextrq %rax, %rdi, %rax
				; X64-BMI1NOTBM-NEXT: # kill: def $eax killed $eax killed $rax
	; X64-BMI1NOTBM-NEXT: retq			; X64-BMI1NOTBM-NEXT: retq
	;			;
	; X64-BMI1BMI2-LABEL: bextr64_32_a1:			; X64-BMI1BMI2-LABEL: bextr64_32_a1:
	; X64-BMI1BMI2: # %bb.0:			; X64-BMI1BMI2: # %bb.0:
	; X64-BMI1BMI2-NEXT: shrxq %rsi, %rdi, %rax			; X64-BMI1BMI2-NEXT: shrxq %rsi, %rdi, %rax
	; X64-BMI1BMI2-NEXT: bzhil %edx, %eax, %eax			; X64-BMI1BMI2-NEXT: bzhil %edx, %eax, %eax
	; X64-BMI1BMI2-NEXT: retq			; X64-BMI1BMI2-NEXT: retq
	%shifted = lshr i64 %val, %numskipbits			%shifted = lshr i64 %val, %numskipbits
	▲ Show 20 Lines • Show All 80 Lines • ▼ Show 20 Lines
	; X64-NOBMI-NEXT: movl %edx, %ecx			; X64-NOBMI-NEXT: movl %edx, %ecx
	; X64-NOBMI-NEXT: shll %cl, %eax			; X64-NOBMI-NEXT: shll %cl, %eax
	; X64-NOBMI-NEXT: decl %eax			; X64-NOBMI-NEXT: decl %eax
	; X64-NOBMI-NEXT: andl %edi, %eax			; X64-NOBMI-NEXT: andl %edi, %eax
	; X64-NOBMI-NEXT: retq			; X64-NOBMI-NEXT: retq
	;			;
	; X64-BMI1NOTBM-LABEL: bextr64_32_a2:			; X64-BMI1NOTBM-LABEL: bextr64_32_a2:
	; X64-BMI1NOTBM: # %bb.0:			; X64-BMI1NOTBM: # %bb.0:
	; X64-BMI1NOTBM-NEXT: movq %rsi, %rcx			; X64-BMI1NOTBM-NEXT: # kill: def $edx killed $edx def $rdx
	; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx			; X64-BMI1NOTBM-NEXT: shlq $8, %rdx
	; X64-BMI1NOTBM-NEXT: shrq %cl, %rdi			; X64-BMI1NOTBM-NEXT: movzbl %sil, %eax
	; X64-BMI1NOTBM-NEXT: shll $8, %edx			; X64-BMI1NOTBM-NEXT: orq %rdx, %rax
	; X64-BMI1NOTBM-NEXT: bextrl %edx, %edi, %eax			; X64-BMI1NOTBM-NEXT: bextrq %rax, %rdi, %rax
				; X64-BMI1NOTBM-NEXT: # kill: def $eax killed $eax killed $rax
	; X64-BMI1NOTBM-NEXT: retq			; X64-BMI1NOTBM-NEXT: retq
	;			;
	; X64-BMI1BMI2-LABEL: bextr64_32_a2:			; X64-BMI1BMI2-LABEL: bextr64_32_a2:
	; X64-BMI1BMI2: # %bb.0:			; X64-BMI1BMI2: # %bb.0:
	; X64-BMI1BMI2-NEXT: shrxq %rsi, %rdi, %rax			; X64-BMI1BMI2-NEXT: shrxq %rsi, %rdi, %rax
	; X64-BMI1BMI2-NEXT: bzhil %edx, %eax, %eax			; X64-BMI1BMI2-NEXT: bzhil %edx, %eax, %eax
	; X64-BMI1BMI2-NEXT: retq			; X64-BMI1BMI2-NEXT: retq
	%shifted = lshr i64 %val, %numskipbits			%shifted = lshr i64 %val, %numskipbits
	▲ Show 20 Lines • Show All 1,565 Lines • ▼ Show 20 Lines
	; X64-NOBMI-NEXT: movl %edx, %ecx			; X64-NOBMI-NEXT: movl %edx, %ecx
	; X64-NOBMI-NEXT: shll %cl, %eax			; X64-NOBMI-NEXT: shll %cl, %eax
	; X64-NOBMI-NEXT: notl %eax			; X64-NOBMI-NEXT: notl %eax
	; X64-NOBMI-NEXT: andl %edi, %eax			; X64-NOBMI-NEXT: andl %edi, %eax
	; X64-NOBMI-NEXT: retq			; X64-NOBMI-NEXT: retq
	;			;
	; X64-BMI1NOTBM-LABEL: bextr64_32_b1:			; X64-BMI1NOTBM-LABEL: bextr64_32_b1:
	; X64-BMI1NOTBM: # %bb.0:			; X64-BMI1NOTBM: # %bb.0:
	; X64-BMI1NOTBM-NEXT: movq %rsi, %rcx			; X64-BMI1NOTBM-NEXT: # kill: def $edx killed $edx def $rdx
	; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx			; X64-BMI1NOTBM-NEXT: shlq $8, %rdx
	; X64-BMI1NOTBM-NEXT: shrq %cl, %rdi			; X64-BMI1NOTBM-NEXT: movzbl %sil, %eax
	; X64-BMI1NOTBM-NEXT: shll $8, %edx			; X64-BMI1NOTBM-NEXT: orq %rdx, %rax
	; X64-BMI1NOTBM-NEXT: bextrl %edx, %edi, %eax			; X64-BMI1NOTBM-NEXT: bextrq %rax, %rdi, %rax
				; X64-BMI1NOTBM-NEXT: # kill: def $eax killed $eax killed $rax
	; X64-BMI1NOTBM-NEXT: retq			; X64-BMI1NOTBM-NEXT: retq
	;			;
	; X64-BMI1BMI2-LABEL: bextr64_32_b1:			; X64-BMI1BMI2-LABEL: bextr64_32_b1:
	; X64-BMI1BMI2: # %bb.0:			; X64-BMI1BMI2: # %bb.0:
	; X64-BMI1BMI2-NEXT: shrxq %rsi, %rdi, %rax			; X64-BMI1BMI2-NEXT: shrxq %rsi, %rdi, %rax
	; X64-BMI1BMI2-NEXT: bzhil %edx, %eax, %eax			; X64-BMI1BMI2-NEXT: bzhil %edx, %eax, %eax
	; X64-BMI1BMI2-NEXT: retq			; X64-BMI1BMI2-NEXT: retq
	%shiftedval = lshr i64 %val, %numskipbits			%shiftedval = lshr i64 %val, %numskipbits
	▲ Show 20 Lines • Show All 81 Lines • ▼ Show 20 Lines
	; X64-NOBMI-NEXT: movl %edx, %ecx			; X64-NOBMI-NEXT: movl %edx, %ecx
	; X64-NOBMI-NEXT: shll %cl, %eax			; X64-NOBMI-NEXT: shll %cl, %eax
	; X64-NOBMI-NEXT: notl %eax			; X64-NOBMI-NEXT: notl %eax
	; X64-NOBMI-NEXT: andl %edi, %eax			; X64-NOBMI-NEXT: andl %edi, %eax
	; X64-NOBMI-NEXT: retq			; X64-NOBMI-NEXT: retq
	;			;
	; X64-BMI1NOTBM-LABEL: bextr64_32_b2:			; X64-BMI1NOTBM-LABEL: bextr64_32_b2:
	; X64-BMI1NOTBM: # %bb.0:			; X64-BMI1NOTBM: # %bb.0:
	; X64-BMI1NOTBM-NEXT: movq %rsi, %rcx			; X64-BMI1NOTBM-NEXT: # kill: def $edx killed $edx def $rdx
	; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx			; X64-BMI1NOTBM-NEXT: shlq $8, %rdx
	; X64-BMI1NOTBM-NEXT: shrq %cl, %rdi			; X64-BMI1NOTBM-NEXT: movzbl %sil, %eax
	; X64-BMI1NOTBM-NEXT: shll $8, %edx			; X64-BMI1NOTBM-NEXT: orq %rdx, %rax
	; X64-BMI1NOTBM-NEXT: bextrl %edx, %edi, %eax			; X64-BMI1NOTBM-NEXT: bextrq %rax, %rdi, %rax
				; X64-BMI1NOTBM-NEXT: # kill: def $eax killed $eax killed $rax
	; X64-BMI1NOTBM-NEXT: retq			; X64-BMI1NOTBM-NEXT: retq
	;			;
	; X64-BMI1BMI2-LABEL: bextr64_32_b2:			; X64-BMI1BMI2-LABEL: bextr64_32_b2:
	; X64-BMI1BMI2: # %bb.0:			; X64-BMI1BMI2: # %bb.0:
	; X64-BMI1BMI2-NEXT: shrxq %rsi, %rdi, %rax			; X64-BMI1BMI2-NEXT: shrxq %rsi, %rdi, %rax
	; X64-BMI1BMI2-NEXT: bzhil %edx, %eax, %eax			; X64-BMI1BMI2-NEXT: bzhil %edx, %eax, %eax
	; X64-BMI1BMI2-NEXT: retq			; X64-BMI1BMI2-NEXT: retq
	%shiftedval = lshr i64 %val, %numskipbits			%shiftedval = lshr i64 %val, %numskipbits
	▲ Show 20 Lines • Show All 2,348 Lines • ▼ Show 20 Lines
	; X64-NOBMI-NEXT: movl %edx, %ecx			; X64-NOBMI-NEXT: movl %edx, %ecx
	; X64-NOBMI-NEXT: shll %cl, %eax			; X64-NOBMI-NEXT: shll %cl, %eax
	; X64-NOBMI-NEXT: shrl %cl, %eax			; X64-NOBMI-NEXT: shrl %cl, %eax
	; X64-NOBMI-NEXT: # kill: def $eax killed $eax killed $rax			; X64-NOBMI-NEXT: # kill: def $eax killed $eax killed $rax
	; X64-NOBMI-NEXT: retq			; X64-NOBMI-NEXT: retq
	;			;
	; X64-BMI1NOTBM-LABEL: bextr64_32_c1:			; X64-BMI1NOTBM-LABEL: bextr64_32_c1:
	; X64-BMI1NOTBM: # %bb.0:			; X64-BMI1NOTBM: # %bb.0:
	; X64-BMI1NOTBM-NEXT: movq %rsi, %rcx			; X64-BMI1NOTBM-NEXT: # kill: def $edx killed $edx def $rdx
	; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx			; X64-BMI1NOTBM-NEXT: shlq $8, %rdx
	; X64-BMI1NOTBM-NEXT: shrq %cl, %rdi			; X64-BMI1NOTBM-NEXT: movzbl %sil, %eax
	; X64-BMI1NOTBM-NEXT: shll $8, %edx			; X64-BMI1NOTBM-NEXT: orq %rdx, %rax
	; X64-BMI1NOTBM-NEXT: bextrl %edx, %edi, %eax			; X64-BMI1NOTBM-NEXT: bextrq %rax, %rdi, %rax
				; X64-BMI1NOTBM-NEXT: # kill: def $eax killed $eax killed $rax
	; X64-BMI1NOTBM-NEXT: retq			; X64-BMI1NOTBM-NEXT: retq
	;			;
	; X64-BMI1BMI2-LABEL: bextr64_32_c1:			; X64-BMI1BMI2-LABEL: bextr64_32_c1:
	; X64-BMI1BMI2: # %bb.0:			; X64-BMI1BMI2: # %bb.0:
	; X64-BMI1BMI2-NEXT: shrxq %rsi, %rdi, %rax			; X64-BMI1BMI2-NEXT: shrxq %rsi, %rdi, %rax
	; X64-BMI1BMI2-NEXT: bzhil %edx, %eax, %eax			; X64-BMI1BMI2-NEXT: bzhil %edx, %eax, %eax
	; X64-BMI1BMI2-NEXT: retq			; X64-BMI1BMI2-NEXT: retq
	%shifted = lshr i64 %val, %numskipbits			%shifted = lshr i64 %val, %numskipbits
	▲ Show 20 Lines • Show All 78 Lines • ▼ Show 20 Lines
	; X64-NOBMI-NEXT: movl %edx, %ecx			; X64-NOBMI-NEXT: movl %edx, %ecx
	; X64-NOBMI-NEXT: shll %cl, %eax			; X64-NOBMI-NEXT: shll %cl, %eax
	; X64-NOBMI-NEXT: shrl %cl, %eax			; X64-NOBMI-NEXT: shrl %cl, %eax
	; X64-NOBMI-NEXT: # kill: def $eax killed $eax killed $rax			; X64-NOBMI-NEXT: # kill: def $eax killed $eax killed $rax
	; X64-NOBMI-NEXT: retq			; X64-NOBMI-NEXT: retq
	;			;
	; X64-BMI1NOTBM-LABEL: bextr64_32_c2:			; X64-BMI1NOTBM-LABEL: bextr64_32_c2:
	; X64-BMI1NOTBM: # %bb.0:			; X64-BMI1NOTBM: # %bb.0:
	; X64-BMI1NOTBM-NEXT: movq %rsi, %rcx			; X64-BMI1NOTBM-NEXT: # kill: def $edx killed $edx def $rdx
	; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx			; X64-BMI1NOTBM-NEXT: shlq $8, %rdx
	; X64-BMI1NOTBM-NEXT: shrq %cl, %rdi			; X64-BMI1NOTBM-NEXT: movzbl %sil, %eax
	; X64-BMI1NOTBM-NEXT: shll $8, %edx			; X64-BMI1NOTBM-NEXT: orq %rdx, %rax
	; X64-BMI1NOTBM-NEXT: bextrl %edx, %edi, %eax			; X64-BMI1NOTBM-NEXT: bextrq %rax, %rdi, %rax
				; X64-BMI1NOTBM-NEXT: # kill: def $eax killed $eax killed $rax
	; X64-BMI1NOTBM-NEXT: retq			; X64-BMI1NOTBM-NEXT: retq
	;			;
	; X64-BMI1BMI2-LABEL: bextr64_32_c2:			; X64-BMI1BMI2-LABEL: bextr64_32_c2:
	; X64-BMI1BMI2: # %bb.0:			; X64-BMI1BMI2: # %bb.0:
	; X64-BMI1BMI2-NEXT: shrxq %rsi, %rdi, %rax			; X64-BMI1BMI2-NEXT: shrxq %rsi, %rdi, %rax
	; X64-BMI1BMI2-NEXT: bzhil %edx, %eax, %eax			; X64-BMI1BMI2-NEXT: bzhil %edx, %eax, %eax
	; X64-BMI1BMI2-NEXT: retq			; X64-BMI1BMI2-NEXT: retq
	%shifted = lshr i64 %val, %numskipbits			%shifted = lshr i64 %val, %numskipbits
	▲ Show 20 Lines • Show All 1,505 Lines • ▼ Show 20 Lines
	; X64-NOBMI-NEXT: movl %edx, %ecx			; X64-NOBMI-NEXT: movl %edx, %ecx
	; X64-NOBMI-NEXT: shll %cl, %eax			; X64-NOBMI-NEXT: shll %cl, %eax
	; X64-NOBMI-NEXT: shrl %cl, %eax			; X64-NOBMI-NEXT: shrl %cl, %eax
	; X64-NOBMI-NEXT: # kill: def $eax killed $eax killed $rax			; X64-NOBMI-NEXT: # kill: def $eax killed $eax killed $rax
	; X64-NOBMI-NEXT: retq			; X64-NOBMI-NEXT: retq
	;			;
	; X64-BMI1NOTBM-LABEL: bextr64_32_d1:			; X64-BMI1NOTBM-LABEL: bextr64_32_d1:
	; X64-BMI1NOTBM: # %bb.0:			; X64-BMI1NOTBM: # %bb.0:
	; X64-BMI1NOTBM-NEXT: movq %rsi, %rcx			; X64-BMI1NOTBM-NEXT: # kill: def $edx killed $edx def $rdx
	; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx			; X64-BMI1NOTBM-NEXT: shlq $8, %rdx
	; X64-BMI1NOTBM-NEXT: shrq %cl, %rdi			; X64-BMI1NOTBM-NEXT: movzbl %sil, %eax
	; X64-BMI1NOTBM-NEXT: shll $8, %edx			; X64-BMI1NOTBM-NEXT: orq %rdx, %rax
	; X64-BMI1NOTBM-NEXT: bextrl %edx, %edi, %eax			; X64-BMI1NOTBM-NEXT: bextrq %rax, %rdi, %rax
				; X64-BMI1NOTBM-NEXT: # kill: def $eax killed $eax killed $rax
	; X64-BMI1NOTBM-NEXT: retq			; X64-BMI1NOTBM-NEXT: retq
	;			;
	; X64-BMI1BMI2-LABEL: bextr64_32_d1:			; X64-BMI1BMI2-LABEL: bextr64_32_d1:
	; X64-BMI1BMI2: # %bb.0:			; X64-BMI1BMI2: # %bb.0:
	; X64-BMI1BMI2-NEXT: shrxq %rsi, %rdi, %rax			; X64-BMI1BMI2-NEXT: shrxq %rsi, %rdi, %rax
	; X64-BMI1BMI2-NEXT: bzhil %edx, %eax, %eax			; X64-BMI1BMI2-NEXT: bzhil %edx, %eax, %eax
	; X64-BMI1BMI2-NEXT: retq			; X64-BMI1BMI2-NEXT: retq
	%shifted = lshr i64 %val, %numskipbits			%shifted = lshr i64 %val, %numskipbits
	▲ Show 20 Lines • Show All 653 Lines • Show Last 20 Lines