This is an archive of the discontinued LLVM Phabricator instance.

Improved X86-FMA3 mem-folding & coalescing
ClosedPublic

Authored by v_klochkov on Sep 29 2015, 1:50 PM.

Download Raw Diff

Details

Reviewers

qcolombet
ab

Commits

rG4731bea3e5e4: Improved the operands commute transformation for X86-FMA3 instructions. All 3…
rL252335: Improved the operands commute transformation for X86-FMA3 instructions.

Summary

`
This change-set was initially included into a bigger change-set http://reviews.llvm.org/D11370
but X86 FMA3 specific changes were removed from D11370 to simplify that change-set.

The changes proposed here implement optimal form selection (213/312/231)
for X86 FMA3 instructions, and help to improve Memory-operand folding and Coalescing
optimizations performed for X86 FMA instructions.

Better Memory-folding and Coalescing optimizations help to reduce
register pressure. The improvement from the changes can be shown on such
an example:

    for (int i = 0; i < N; i += 1) {
        val1 = _mm_and_pd(val1, val5);
        val2 = _mm_and_pd(val2, val6);
        val3 = _mm_and_pd(val3, val7);
        val4 = _mm_and_pd(val4, val8);
        val5 = _mm_xor_pd(val1, val5);
        val6 = _mm_xor_pd(val2, val6);
        val7 = _mm_xor_pd(val3, val7);
        val8 = _mm_xor_pd(val4, val8);

        v_accu1 = _mm_fmadd_pd(v_accu1, x1_arr[i], val1);
        v_accu2 = _mm_fmadd_pd(v_accu2, x2_arr[i], val2);
        v_accu3 = _mm_fmadd_pd(v_accu3, x3_arr[i], val3);
        v_accu4 = _mm_fmadd_pd(v_accu4, x4_arr[i], val4);
        v_accu5 = _mm_fmadd_pd(v_accu5, x5_arr[i], val5);
        v_accu6 = _mm_fmadd_pd(v_accu6, x6_arr[i], val6);
        v_accu7 = _mm_fmadd_pd(v_accu7, x7_arr[i], val7);
        v_accu8 = _mm_fmadd_pd(v_accu8, x8_arr[i], val8);
    }


    ASM code BEFORE the changes:
        .LBB1_2:                                # %for.body.6
                                        #   Parent Loop BB1_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
        vmovapd %xmm0, -56(%rsp)        # 16-byte Spill
        vandpd  %xmm7, %xmm3, %xmm7
        vandpd  %xmm5, %xmm12, %xmm5
        vandpd  %xmm6, %xmm9, %xmm6
        vmovapd -40(%rsp), %xmm10       # 16-byte Reload
        vandpd  %xmm10, %xmm13, %xmm10
        vmovapd %xmm10, -40(%rsp)       # 16-byte Spill
        vxorpd  %xmm7, %xmm3, %xmm3
        vxorpd  %xmm5, %xmm12, %xmm12
        vxorpd  %xmm6, %xmm9, %xmm9
        vxorpd  %xmm10, %xmm13, %xmm13
        vmovapd %xmm8, %xmm0
        vmovapd x1_arr+8192(%rcx), %xmm8
        vmovapd -24(%rsp), %xmm1        # 16-byte Reload
        vfmadd213pd     %xmm7, %xmm8, %xmm1
        vmovapd %xmm1, -24(%rsp)        # 16-byte Spill
        vmovapd %xmm0, %xmm8
        vmovapd x2_arr+8192(%rcx), %xmm1
        vfmadd213pd     %xmm5, %xmm1, %xmm4
        vmovapd x3_arr+8192(%rcx), %xmm1
        vfmadd213pd     %xmm6, %xmm1, %xmm8
        vmovapd x4_arr+8192(%rcx), %xmm1
        vfmadd213pd     %xmm10, %xmm1, %xmm11
        vmovapd -56(%rsp), %xmm0        # 16-byte Reload
        vmovapd x5_arr+8192(%rcx), %xmm1
        vfmadd213pd     %xmm3, %xmm1, %xmm15
        vmovapd x6_arr+8192(%rcx), %xmm1
        vfmadd213pd     %xmm12, %xmm1, %xmm0
        vmovapd x7_arr+8192(%rcx), %xmm1
        vfmadd213pd     %xmm9, %xmm1, %xmm2
        vmovapd x8_arr+8192(%rcx), %xmm1
        vfmadd213pd     %xmm13, %xmm1, %xmm14
        addq    $16, %rcx
        jne     .LBB1_2

        ASM code WITH the new changes (about 30% faster):
        .LBB1_2:                                # %for.body.6
                                        #   Parent Loop BB1_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
        vandpd  %xmm7, %xmm3, %xmm7
        vandpd  %xmm5, %xmm2, %xmm5
        vandpd  %xmm6, %xmm0, %xmm6
        vandpd  %xmm1, %xmm4, %xmm1
        vxorpd  %xmm7, %xmm3, %xmm3
        vxorpd  %xmm5, %xmm2, %xmm2
        vxorpd  %xmm6, %xmm0, %xmm0
        vfmadd132pd     x1_arr+8192(%rcx), %xmm7, %xmm15
        vfmadd132pd     x2_arr+8192(%rcx), %xmm5, %xmm8
        vfmadd132pd     x3_arr+8192(%rcx), %xmm6, %xmm9
        vfmadd132pd     x4_arr+8192(%rcx), %xmm1, %xmm10
        vfmadd132pd     x5_arr+8192(%rcx), %xmm3, %xmm14
        vfmadd132pd     x6_arr+8192(%rcx), %xmm2, %xmm11
        vfmadd132pd     x7_arr+8192(%rcx), %xmm0, %xmm12
        vxorpd  %xmm1, %xmm4, %xmm4
        vfmadd132pd     x8_arr+8192(%rcx), %xmm4, %xmm13
        addq    $16, %rcx
        jne     .LBB1_2

This change-set also fixed an existing correctness problem caused
by commuting 1st and 2nd operands of scalar FMAs generated for intrinsics.

   
   For FMA intrinsic call:

       __m128d foo(__m128d a, __m128d b, __m128d c) {
         // must return XMM0={b[127:64], a[63:0]*b[63:0]+c[63:0]}
         // but currently returned value is XMM0={a[127:64], a[63:0]*b[63:0]+c[63:0]}
         return _mm_fmadd_sd(b, a, c);
       }

The Coalescer/TwoAddressInstructionPass swapped the 1st and 2nd operands
of SCALAR FMA and invalidated the higher bits of the result returned
from foo().
The change-set fixes that and prohibits swapping 1st and 2nd operands
of scalar FMAs.

Swapping 1st and 2nd operands of scalar FMAs may be possible and legal,
but only after special analysis of FMA users. Such optimization/analysis
can be implemented separately.
Another way is to separate FMA opcodes generated for FP operations
and FMA opcodes generated for FMA intrinsics as it is done now for ADD operations,
e.g. ADDSSrr vs ADDSSrr_Int. *_Int opcodes are handled more conservatively.
Being more conservative in commuting 1st and 2nd operands of scalar FMAs
right now seems better choice as stability/correctness has higher priority.

With regards to performance these changes are very good for vector/packed FMAs
(all source operands became commutable),
and neutral for scalar FMAs:
a) prohibit/disable commuting 1st and 2nd operands,
b) enable commuting 2nd and 3rd operands.
`

Diff Detail

Event Timeline

v_klochkov updated this revision to Diff 36032.Sep 29 2015, 1:50 PM

v_klochkov retitled this revision from to Improved X86-FMA3 mem-folding & coalescing.

v_klochkov updated this object.

v_klochkov added a reviewer: qcolombet.

v_klochkov added a subscriber: llvm-commits.

Another way is to separate FMA opcodes generated for FP operations
and FMA opcodes generated for FMA intrinsics as it is done now for ADD operations,
e.g. ADDSSrr vs ADDSSrr_Int. *_Int opcodes are handled more conservatively.
Being more conservative in commuting 1st and 2nd operands of scalar FMAs
right now seems better choice as stability/correctness has higher priority.

You're right, _Int would work (and is intended for exactly this situation), but I disagree that we can avoid fixing that here. I'm probably the one who hates _Int the most, but currently, the fma scalar intrinsic patterns seem just plain wrong, and working around that here isn't proper, IMHO. You should add the _Int instructions before landing this patch.

As for getting rid of _Int in the long term, we have https://llvm.org/bugs/show_bug.cgi?id=23449 !

This revision now requires changes to proceed.Sep 30 2015, 2:36 PM

Hi Slava,

Thanks for working on this.

Two main things:

Could you explain the structure you used to describe the dependencies between the FMA opcodes?

I do not want to reverse engineer it to review the patch!

The bug you fix here for the operand we shouldn’t commute for the intrinsic lowering is, IMO, separated from improving the lowering for the commutable operand.

I.e., please address that in a separate patch. As for direction, you could use the *_Int approach, or better, but more involved, model correctly the intrinsic lowering by adding a subregister for FP32 from the VR128 and use insert_subreg.

Cheers,
-Quentin

llvm/lib/Target/X86/X86InstrInfo.cpp
3030	return false;
3032	llvm_unreachable("Opcode not handled by the switch")
3272	I would rewrite the check to make the CommuteAnyOperandIndex the first discrimination: if (SrcOpIdx1 != CommuteAnyOperandIndex && (SrcOpIdx1 < 1 \|\| SrcOpIdx1 > RegOpsNum)) return false; Same for SrcOpIdx2.
3276	Add a comment saying that we look for two registers operands, those are the ones that can be commuted regardless of the FMA opcode. We will adjust the opcode later.
3284	the last register operand of the instruction.
3311	Add a comment along the line: // Check if we can adjust the opcode so that the registers we change preserve the semantic.
3321	Please explain the structure you are using here. In particular, what are those dependencies and how do you represent them.
3425	Get rid of this check. This is a hack to workaround a bug. The bug should be fixed independently of the improvement for the lowering for commutable operands.
3428	Canonicalize SrcOpIdx1 and SrcOpIdx2 to avoid these duplicated checks.
llvm/lib/Target/X86/X86InstrInfo.h
270	\p SrcOpIdx1 and \p SrcOpIdx2
llvm/test/CodeGen/X86/fma_patterns.ll
180	We shouldn’t regress those.

cameron.mcinally added a subscriber: cameron.mcinally.Sep 30 2015, 8:43 PM

Ahmed, Quentin,
Thank you for the quick code-review.

I am ok with having the correctness fix for FMAs to be arranged as a separate change-set.
The correctness fix is removed from this change-set.

Also, I did some additional changes + renaming + documenting in
getFMA3OpcodeToCommuteOperands() to make the code look simpler.

I would like to land this fix and then to work on the correctness problem
that exists for scalar FMA intrinsics.

The simplest way is to add *_Int opcodes.

I am not sure I understood this idea

("adding a subregister for FP32 from the VR128 and use insert_subreg")

If there is a precedent (i.e. some similar scalar SIMD instruction) where that approach is used,
then I can try using that approach for FMAs.

Thank you,
Slava

I also made additional changes according to the reviewers' recommendations.

llvm/lib/Target/X86/X86InstrInfo.cpp
3030	Fixed.
3032	Fixed.
3272	I agree, your version looks a bit more clear. Fixed.
3276	Ok, I added a comment.
3284	It is interesting that I added the word "register" here when made changes for your previous comment and only then noticed this comment asking me to do exactly the same change. Fixed.
3311	Ok, added a comment.
3321	This is just an array after I removed IsScalar property. I changed the comment to make it more clear.
3425	I removed this check from this change-set and updated the OpcodeAlts struct. BTW, I would not call this check a hack. It was rather a pessimistic correctness check.
3428	Ok, Fixed. SrcOpIdx1 has the lowest index now to simplify the next checks.
llvm/lib/Target/X86/X86InstrInfo.h
270	Fixed. I did not know about \p. Thank you for letting me know.
llvm/test/CodeGen/X86/fma_patterns.ll
180	Ok, Fixed.

Hi Slava,

Almost there. I think we would benefit from a bit of refactoring.
Tell me if you disagree.

Thanks,
-Quentin

llvm/lib/Target/X86/X86InstrInfo.cpp
3327	Excellent!
3402	Maybe put three and adapt the comparison.
3405	Initialize FormIndex to LastFormIndex.
3406	Use for (GroupIndex = 0; GroupIndex < OpcodeGroupsNum && FormIndex == LastFormIndex; GroupIndex++)
3407	Use < LastFormIndex
3413	Remove this block.
3417	Use ==
3423	No need for {}
3430	I like the comments! Keep them, but I believe we can refactor the code a bit. What you basically have is: given the indices, you map each FormIndex to a new FormIndex and you have three choices each time. It looks to me like we could save this table statically, like: static const unsigned Mapping[][3] = { // SrcOpIdx1 == 1 && SrcOpIdx2 == 2 // FMA132 A, C, b; ==> FMA231 C, A, b; // FMA213 B, A, c; ==> FMA213 A, B, c; // FMA231 C, A, b; ==> FMA132 A, C, b; { Form231Index, Form213Index, Form132Index }, // (SrcOpIdx1 == 1 && SrcOpIdx2 == 3) // etc. // (SrcOpIdx1 == 2 && SrcOpIdx2 == 3) }; unsigned Case; if (SrcOpIdx1 == 1 && SrcOpIdx2 == 2) Case = 0; else if (SrcOpIdx1 == 1 && SrcOpIdx2 == 3) Case = 1; else if (SrcOpIdx1 == 2 && SrcOpIdx2 == 3) Case = 2; else assert("Invalid commutable indices for FMA"); // or return 0; RetOpc = Mapping[Case][FormIndex];

Additional minor changes + code-restructuring in getFMA3OpcodeToCommuteOperands().

Hi Quentin,

Thank you for the very useful comments.
I updated the change-set. Please check the additional changes.

Thank you,
Slava

llvm/lib/Target/X86/X86InstrInfo.cpp
3402	Ok, but using 3 instead of 2 required renaming LastFormIndex to FormsNum.
3406	Ok. BTW, this change required a fix for GroupIndex after the loop (do GroupIndex-- once).
3430	This is a great idea! I did the corresponding changes.

Hi Slava,

LGTM.

Regarding fixing the commute bug, forget about the sub register thing I was talking about. There is no precedent, and on second thought I am not sure it is the right thing to do. In other words, stick to the *_Int approach when you get to it.

Thanks,
-Quentin

RKSimon added a subscriber: RKSimon.Oct 10 2015, 7:15 AM

RKSimon added inline comments.

llvm/test/CodeGen/X86/fma-commute-x86.ll
5	Please can you regenerate with update_llc_test_checks.py to ensure that no extra mov or other instructions are being included.

Hi Quentin,

I do not have permissions for committing changes to LLVM trunk.
Andy (akaylor), who committed the changes for me previously, reasonably refused to commit
this change-set as it is now because of Ahmed Bougacha’s comment:

“IMHO, implement FMA*_Int instructions before landing this change-set.”

That is probably a good move, as the current fix can potentially produce regressions on tests where operands 1 and 3 of scalar FMA intrinsics should not be commuted but can be commuted by the new changes.
Also, my senior colleagues recommended implementing FMA*_Int instructions first too.

I am ready to submit a new code-review tracker for FMA*_Int instructions.
The corresponding changes are already implemented and tested locally.

Adding FMA*_Int instructions will cause conflicts in X86InstrFMA.td file.

Do you agree with the following plan?

To add FMA*_Int instructions first;
To update this change-set (D13269), i.e. to resolve conflicts in X86InstrFMA.td, and ask for additional review for changes in that file.

In order to minimize the patch for (1), the commute optimization for FMA*_Int should be implemented as a separate change-set.

I am asking you as Ahmed stopped answering in this code-review tracker and also did not answer
to e-mails sent to him separately.

Thank you,
Slava

Slava,

That plan sounds good to me; I think that's what both Quentin and myself imagined in our original replies.

Please go ahead and submit the patches for _Int FMA instructions.
As for this review, I'll second Simon's comment regarding the tests: please use the utils/update_llc_test_checks.py script. It helps generate CHECKS lines in a way that's both strict and maintainable.

Thanks for working on this, and sorry for the delay!
-Ahmed

v_klochkov mentioned this in D13710: New X86 FMA3*_Int opcodes for scalar FMA intrinsics..Oct 13 2015, 4:00 PM

Hi Ahmed,

Thank you for the answer.

I have submitted a new patch (FMA**_Int opcodes) for code-review.
http://reviews.llvm.org/D13710
Please see (D13710) for details.

So, this (D13269) patch is now waiting for code-review/approval/check-in of the changes for (D13710).

Thank you,
Slava

Resolved the conflicts in X86InstrFMA.td and updated the fma-commute-x86.ll test.

Hi,

I created the FMA*_Int opcodes in the patch for ( D13710 ) and it has been committed to LLVM trunk.
That patch conflicted with the changes I did here in X86InstrFMA.td.
Thus, I had to update my local workspace and upload the new RE-BASED patch this time.

Please review the updated changes in X86InstrFMA.td.
I would like to comment some additional changes I did to resolve the conflicts and to simplify the opcode definitions:

The parameters 'IsRVariantCommutable' and 'IsMVariantCommutable' were just removed because otherwise, they would be always set to 1.
Moved some comments from fma3{p,s}_forms multiclasses to fma3{p,s}_rm multiclasses. (the multiclasses fma3{p,s}_forms stopped mentioning the commute features, so having those comments there seemed not quite appropriate).

Also, reviewers asked me to use update_llc_test_checks.py for the new test fma-commute-x86.ll.
For some unknown reasons that tool did not work for me, it printed error for any input test.
So, I just added the better checks to the test manually.

This patch does not implement commute transformations for FMA*_Int opcodes.
That can/should be done in a separate patch.

Thank you,
Slava

qcolombet added inline comments.Nov 5 2015, 1:41 PM

llvm/lib/Target/X86/X86InstrFMA.td
37	Why are you changing the hasSideEffects semantic here? I am not saying it is false, it just seems out of scope of what is necessary for this patch. What am I missing?
57	Although that is not incorrect, why do you get rid of that {? You did the same later in that patch. Again, this is not wrong, but this kind of clean-ups adds noise to the review. I.e., if you want to do such clean-ups, which I am not sure I agree, that should be a separate patch.
147–148	Ditto.
151	Ditto.

Please accept my apologies for adding noise to the code-review process.
Ok, no more clean-ups at the late code-review phases in the future.

I could cancel the clean-up changes that I did, but I am not sure that will help the review process now, i.e. when I have already uploaded them.
If I should remove the clean-ups, no problem, I'll do that. Please let me know.

Regarding the '{' and '}' used for 'let' statements and removed from the Nov05 patch...
Quentin, from your comment I did not understand whether you disagree with the removal of the braces.
Please let me know if the braces should be restored, and I'll update the patch.

Thank you,
Slava

llvm/lib/Target/X86/X86InstrFMA.td
37	If you compare the patches from Nov05 and Oct05, then I just moved the setting of 'hasSideEffects = 0' from line 64 (Oct05) to here. That made the fma3p_forms multiclass more compact. It did not change semantics compared to the Oct05 patch. Sorry if it added noise to the code-review process.
57	I am sorry for adding noise to the code-review process by doing clean ups in patches fixing already uploaded patches, i.e. like I did here. So, I shouldn't have removed 'IsMVariantCommutable' and 'IsRvariantCommutable'. After I already removed those parameters, and set isCommutable = 1 at the line 18, I had to choose between: a) to update the line 57 (Oct05 rev): "// Constraints = "$src1 = $dst" --> "// Constraints = "$src1 = $dst, isCommutable = 1" or b) to just remove the line 57 and eliminate the need in updating it in future when/if the 'let' statement above is changed again. Both variants required minor changes and I chose (b). Do you recommend restoring the '{' and '} // ....' used for 'let' statements?

Hi Slava,

Thanks for the clarifications.
And don’t worry about the “noise” part, I was just mentioning it for future reference :).

from you comment I did not understand if you disagree with removal of the braces.

Heh, that’s the thing, I had mixed feelings about that, but now I have made up my mind and I am fine with the patch with the additional clean-ups you did.

Thanks for your patience and all your work on that!

Do you have commit access now or do you want me to commit on your behalf?

Cheers,
-Quentin

llvm/lib/Target/X86/X86InstrFMA.td
37	Oh, I was comparing from the initial source to the current update, i.e., unless I am mistaken there is a semantic change: defm r213 : fma3p_rm<opc213, !strconcat(OpcodeStr, "213", PackTy), MemFrag128, MemFrag256, OpTy128, OpTy256, Op>; That being said, I believe the change is correct and I am fine with it being embedded in this patch.
57	That’s fine, we can remove those.

Closed by commit rL252335: Improved the operands commute transformation for X86-FMA3 instructions. (authored by akaylor). · Explain WhyNov 6 2015, 11:49 AM

This revision was automatically updated to reflect the committed changes.

v_klochkov mentioned this in D14550: X86-FMA3: Implemented commute transformations for FMA*_Int instructions.Nov 10 2015, 12:43 PM

Revision Contents

Path

Size

llvm/

lib/

Target/

X86/

X86InstrFMA.td

91 lines

X86InstrInfo.h

40 lines

X86InstrInfo.cpp

352 lines

test/

CodeGen/

X86/

fma-commute-x86.ll

312 lines

fma_patterns.ll

24 lines

Diff 36032

llvm/lib/Target/X86/X86InstrFMA.td

Show All 28 Lines	def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
[(set VR128:$dst, (OpVT128 (Op VR128:$src2,		[(set VR128:$dst, (OpVT128 (Op VR128:$src2,
VR128:$src1, VR128:$src3)))]>;		VR128:$src1, VR128:$src3)))]>;

let mayLoad = 1, isCommutable = IsMVariantCommutable in		let mayLoad = 1, isCommutable = IsMVariantCommutable in
def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst),		def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, f128mem:$src3),		(ins VR128:$src1, VR128:$src2, f128mem:$src3),
!strconcat(OpcodeStr,		!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),		"\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),
[(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1,		[(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1,
		qcolombetUnsubmitted Not Done Reply Inline Actions Why are you changing the hasSideEffects semantic here? I am not saying it is false, it just seems out of scope of what is necessary for this patch. What I am missing? qcolombet: Why are you changing the hasSideEffects semantic here? I am not saying it is false, it just…
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions If compare the patches from Nov05 and Oct05, then I just moved the setting of 'hasSideEffects = 0' from line 64 (Oct05) to here. That made the fma3p_forms multiclass more compact. It did not change semantics comparing to Oct05-patch. Sorry if it added noise to the code-review process. v_klochkov: If compare the patches from Nov05 and Oct05, then I just moved the setting of 'hasSideEffects =…
		qcolombetUnsubmitted Not Done Reply Inline Actions Oh, I was comparing from the initial source to the current update, i.e., unless I am mistaken there is a semantic change: defm r213 : fma3p_rm<opc213, !strconcat(OpcodeStr, "213", PackTy), MemFrag128, MemFrag256, OpTy128, OpTy256, Op>; That being said, I believe the change is correct and I am fine with it being embedded in this patch. qcolombet: Oh, I was comparing from the initial source to the current update, i.e., unless I am mistaken…
(MemFrag128 addr:$src3))))]>;		(MemFrag128 addr:$src3))))]>;

let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in		let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in
def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst),		def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),		(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr,		!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),		"\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),
[(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1,		[(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1,
VR256:$src3)))]>, VEX_L;		VR256:$src3)))]>, VEX_L;

let mayLoad = 1, isCommutable = IsMVariantCommutable in		let mayLoad = 1, isCommutable = IsMVariantCommutable in
def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst),		def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, f256mem:$src3),		(ins VR256:$src1, VR256:$src2, f256mem:$src3),
!strconcat(OpcodeStr,		!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),		"\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),
[(set VR256:$dst,		[(set VR256:$dst,
(OpVT256 (Op VR256:$src2, VR256:$src1,		(OpVT256 (Op VR256:$src2, VR256:$src1,
(MemFrag256 addr:$src3))))]>, VEX_L;		(MemFrag256 addr:$src3))))]>, VEX_L;
}		}
} // Constraints = "$src1 = $dst"		} // Constraints = "$src1 = $dst"
qcolombetUnsubmitted Not Done Reply Inline Actions Although that is not incorrect, why do you get rid of that {? You did the same later in that patch. Again, this is not wrong, but this kind of clean-ups adds noise to the review. I.e., if you want to do such clean-ups, which I am not sure I agree, that should be a separate patch. qcolombet: Although that is not incorrect, why do you get rid of that {? You did the same later in that…
v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions I am sorry for adding noise to the code-review process by doing clean ups in patches fixing already uploaded patches, i.e. like I did here. So, I shouldn't have removed 'IsMVariantCommutable' and 'IsRvariantCommutable'. After I already removed those parameters, and set isCommutable = 1 at the line 18, I had to choose between: a) to update the line 57 (Oct05 rev): "// Constraints = "$src1 = $dst" --> "// Constraints = "$src1 = $dst, isCommutable = 1" or b) to just remove the line 57 and eliminate the need in updating it in future when/if the 'let' statement above is changed again. Both variants required minor changes and I chose (b). Do you recommend restoring the '{' and '} // ....' used for 'let' statements? v_klochkov: I am sorry for adding noise to the code-review process by doing clean ups in patches fixing…
qcolombetUnsubmitted Not Done Reply Inline Actions That’s fine, we can remove those. qcolombet: That’s fine, we can remove those.

multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,		multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpcodeStr, string PackTy,		string OpcodeStr, string PackTy,
PatFrag MemFrag128, PatFrag MemFrag256,		PatFrag MemFrag128, PatFrag MemFrag256,
SDNode Op, ValueType OpTy128, ValueType OpTy256> {		SDNode Op, ValueType OpTy128, ValueType OpTy256> {
// For 213, both the register and memory variant are commutable.		let hasSideEffects = 0 in {
// Indeed, the commutable operands are 1 and 2 and both live in registers		// For 213, both the register and memory variants are commutable.
// for both variants.		// For the register form the commutable operands are 1, 2 and 3.
		// For the memory variant the folded operand must be in 3. Thus,
		// in that case, only the operands 1 and 2 can be swapped.
		// Commuting some of operands may require the opcode change:
		// operands 1 and 2 (memory & register forms): 213 --> 213(no changes);
		// operands 1 and 3 (register forms only): 213 --> 231;
		// operands 2 and 3 (register forms only): 213 --> 132.
defm r213 : fma3p_rm<opc213,		defm r213 : fma3p_rm<opc213,
!strconcat(OpcodeStr, "213", PackTy),		!strconcat(OpcodeStr, "213", PackTy),
MemFrag128, MemFrag256, OpTy128, OpTy256,		MemFrag128, MemFrag256, OpTy128, OpTy256,
/* IsRVariantCommutable */ 1,		/* IsRVariantCommutable */ 1,
/* IsMVariantCommutable */ 1,		/* IsMVariantCommutable */ 1,
Op>;		Op>;
let hasSideEffects = 0 in {		// For 132, both the register and memory variants are commutable.
		// For the register form the commutable operands are 1, 2 and 3.
		// For the memory variant the folded operand must be in 3. Thus,
		// in that case, only the operands 1 and 2 can be swapped.
		// Commuting some of operands may require the opcode change:
		// operands 1 and 2 (memory & register forms): 132 --> 231;
		// operands 1 and 3 (register forms only): 132 --> 132(no changes);
		// operands 2 and 3 (register forms only): 132 --> 213.
defm r132 : fma3p_rm<opc132,		defm r132 : fma3p_rm<opc132,
!strconcat(OpcodeStr, "132", PackTy),		!strconcat(OpcodeStr, "132", PackTy),
MemFrag128, MemFrag256, OpTy128, OpTy256>;		MemFrag128, MemFrag256, OpTy128, OpTy256,
// For 231, only the register variant is commutable.		/* IsRVariantCommutable */ 1,
		/* IsMVariantCommutable */ 1>;
		// For 231, both the register and memory variants are commutable.
		// For the register form the commutable operands are 1, 2 and 3.
// For the memory variant the folded operand must be in 3. Thus,		// For the memory variant the folded operand must be in 3. Thus,
// in that case, it cannot be swapped with 2.		// in that case, only the operands 1 and 2 can be swapped.
		// Commuting some of operands may require the opcode change:
		// operands 1 and 2 (memory & register forms): 231 --> 132;
		// operands 1 and 3 (register forms only): 231 --> 213;
		// operands 2 and 3 (register forms only): 231 --> 231(no changes).
defm r231 : fma3p_rm<opc231,		defm r231 : fma3p_rm<opc231,
!strconcat(OpcodeStr, "231", PackTy),		!strconcat(OpcodeStr, "231", PackTy),
MemFrag128, MemFrag256, OpTy128, OpTy256,		MemFrag128, MemFrag256, OpTy128, OpTy256,
/* IsRVariantCommutable */ 1,		/* IsRVariantCommutable */ 1,
/* IsMVariantCommutable */ 0>;		/* IsMVariantCommutable */ 1>;
} // hasSideEffects = 0		} // hasSideEffects = 0
}		}

// Fused Multiply-Add		// Fused Multiply-Add
let ExeDomain = SSEPackedSingle in {		let ExeDomain = SSEPackedSingle in {
defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", loadv4f32,		defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", loadv4f32,
loadv8f32, X86Fmadd, v4f32, v8f32>;		loadv8f32, X86Fmadd, v4f32, v8f32>;
defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", loadv4f32,		defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", loadv4f32,
▲ Show 20 Lines • Show All 47 Lines • ▼ Show 20 Lines	def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
[(set RC:$dst,		[(set RC:$dst,
(OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;		(OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;

let mayLoad = 1, isCommutable = IsMVariantCommutable in		let mayLoad = 1, isCommutable = IsMVariantCommutable in
def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),		def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, RC:$src2, x86memop:$src3),		(ins RC:$src1, RC:$src2, x86memop:$src3),
!strconcat(OpcodeStr,		!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),		"\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),
[(set RC:$dst,		[(set RC:$dst,
(OpVT (OpNode RC:$src2, RC:$src1,		(OpVT (OpNode RC:$src2, RC:$src1,
qcolombetUnsubmitted Not Done Reply Inline Actions Ditto. qcolombet: Ditto.
(mem_frag addr:$src3))))]>;		(mem_frag addr:$src3))))]>;
}		}
} // Constraints = "$src1 = $dst"		} // Constraints = "$src1 = $dst"
qcolombetUnsubmitted Not Done Reply Inline Actions Ditto. qcolombet: Ditto.

multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,		multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpStr, string PackTy, string PT2, Intrinsic Int,		string OpStr, string PackTy, string PT2, Intrinsic Int,
SDNode OpNode, RegisterClass RC, ValueType OpVT,		SDNode OpNode, RegisterClass RC, ValueType OpVT,
X86MemOperand x86memop, Operand memop, PatFrag mem_frag,		X86MemOperand x86memop, Operand memop, PatFrag mem_frag,
ComplexPattern mem_cpat> {		ComplexPattern mem_cpat> {
let hasSideEffects = 0 in {		let hasSideEffects = 0 in {
		// For 132, both the register and memory variants are commutable.
		// For the register form the commutable operands are 1, 2 and 3.
		// For the memory variant the folded operand must be in 3. Thus,
		// in that case, only the operands 1 and 2 can be swapped.
		// Commuting some of operands may require the opcode change:
		// operands 1 and 2 (memory & register forms): 132 --> 231;
		// operands 1 and 3 (register forms only): 132 --> 132(no changes);
		// operands 2 and 3 (register forms only): 132 --> 213.
		// Commuting the operand 1 with some other operand changes the upper bits
		// of the result FMA instruction. Thus, it requires a proof of the fact that
		// only the lowest element of the result is used.
defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),		defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
x86memop, RC, OpVT, mem_frag>;		x86memop, RC, OpVT, mem_frag,
// See the other defm of r231 for the explanation regarding the		/* IsRVariantCommutable */ 1,
// commutable flags.		/* IsMVariantCommutable */ 1>;
		// For 231, both the register and memory variants are commutable.
		// For the register form the commutable operands are 1, 2 and 3.
		// For the memory variant the folded operand must be in 3. Thus,
		// in that case, only the operands 1 and 2 can be swapped.
		// Commuting some of the operands may require an opcode change:
		// operands 1 and 2 (memory & register forms): 231 --> 132;
		// operands 1 and 3 (register forms only): 231 --> 213;
		// operands 2 and 3 (register forms only): 231 --> 231(no changes).
		// Commuting the operand 1 with some other operand changes the upper bits
		// of the result FMA instruction. Thus, it requires a proof of the fact that
		// only the lowest element of the result is used.
defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),		defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
x86memop, RC, OpVT, mem_frag,		x86memop, RC, OpVT, mem_frag,
/* IsRVariantCommutable */ 1,		/* IsRVariantCommutable */ 1,
/* IsMVariantCommutable */ 0>;		/* IsMVariantCommutable */ 1>;
}

// See the other defm of r213 for the explanation regarding the		// For 213, both the register and memory variants are commutable.
// commutable flags.		// For the register form the commutable operands are 1, 2 and 3.
		// For the memory variant the folded operand must be in 3. Thus,
		// in that case, only the operands 1 and 2 can be swapped.
		// Commuting some of the operands may require an opcode change:
		// operands 1 and 2 (memory & register forms): 213 --> 213(no changes);
		// operands 1 and 3 (register forms only): 213 --> 231;
		// operands 2 and 3 (register forms only): 213 --> 132.
		// Commuting the operand 1 with some other operand changes the upper bits
		// of the result FMA instruction. Thus, it requires a proof of the fact that
		// only the lowest element of the result is used.
defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),		defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
x86memop, RC, OpVT, mem_frag,		x86memop, RC, OpVT, mem_frag,
/* IsRVariantCommutable */ 1,		/* IsRVariantCommutable */ 1,
/* IsMVariantCommutable */ 1,		/* IsMVariantCommutable */ 1,
OpNode>;		OpNode>;
}		}
		}

multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,		multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpStr, Intrinsic IntF32, Intrinsic IntF64,		string OpStr, Intrinsic IntF32, Intrinsic IntF64,
SDNode OpNode> {		SDNode OpNode> {
defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", IntF32, OpNode,		defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", IntF32, OpNode,
FR32, f32, f32mem, ssmem, loadf32, sse_load_f32>;		FR32, f32, f32mem, ssmem, loadf32, sse_load_f32>;
defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "PD", IntF64, OpNode,		defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "PD", IntF64, OpNode,
FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W;		FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W;
▲ Show 20 Lines • Show All 211 Lines • Show Last 20 Lines

llvm/lib/Target/X86/X86InstrInfo.h

Show First 20 Lines • Show All 258 Lines • ▼ Show 20 Lines	public:
/// For example, calling this method this way:		/// For example, calling this method this way:
/// unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;		/// unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
/// findCommutedOpIndices(MI, Op1, Op2);		/// findCommutedOpIndices(MI, Op1, Op2);
/// can be interpreted as a query asking to find an operand that would be		/// can be interpreted as a query asking to find an operand that would be
/// commutable with the operand#1.		/// commutable with the operand#1.
bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,		bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const override;		unsigned &SrcOpIdx2) const override;

		/// Returns true if the routine could find two commutable operands
		/// in the given FMA instruction. Otherwise, returns false.
		///
		/// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments.
		qcolombetUnsubmitted Not Done Reply Inline Actions \p SrcOpIdx1 and \p SrcOpIdx2 qcolombet: \p SrcOpIdx1 and \p SrcOpIdx2
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions Fixed. I did not know about \p. Thank you for letting me know. v_klochkov: Fixed. I did not know about \p. Thank you for letting me know.
		/// The output indices of the commuted operands are returned in these
		/// arguments. Also, the input values of these arguments may be preset either
		/// to indices of operands that must be commuted or be equal to a special
		/// value 'CommuteAnyOperandIndex' which means that the corresponding
		/// operand index is not set and this method is free to pick any of
		/// available commutable operands.
		///
		/// For example, calling this method this way:
		/// unsigned Idx1 = 1, Idx2 = CommuteAnyOperandIndex;
		/// findFMA3CommutedOpIndices(MI, Idx1, Idx2);
		/// can be interpreted as a query asking if the operand #1 can be swapped
		/// with any other available operand (e.g. operand #2, operand #3, etc.).
		///
		/// The returned FMA opcode may differ from the opcode in the given MI.
		/// For example, commuting the operands #1 and #3 in the following FMA
		/// FMA213 #1, #2, #3
		/// results in an instruction with an adjusted opcode:
		/// FMA231 #3, #2, #1
		bool findFMA3CommutedOpIndices(MachineInstr *MI,
		unsigned &SrcOpIdx1,
		unsigned &SrcOpIdx2) const;

		/// Returns an adjusted FMA opcode that must be used in FMA instruction that
		/// performs the same computations as the given MI but which has the operands
		/// SrcOpIdx1 and SrcOpIdx2 commuted.
		/// It may return 0 if it is unsafe to commute the operands.
		///
		/// The returned FMA opcode may differ from the opcode in the given MI.
		/// For example, commuting the operands #1 and #3 in the following FMA
		/// FMA213 #1, #2, #3
		/// results in an instruction with an adjusted opcode:
		/// FMA231 #3, #2, #1
		unsigned getFMA3OpcodeToCommuteOperands(MachineInstr *MI,
		unsigned SrcOpIdx1,
		unsigned SrcOpIdx2) const;

// Branch analysis.		// Branch analysis.
bool isUnpredicatedTerminator(const MachineInstr* MI) const override;		bool isUnpredicatedTerminator(const MachineInstr* MI) const override;
bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,		bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,		MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,		SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const override;		bool AllowModify) const override;

bool getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,		bool getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
▲ Show 20 Lines • Show All 254 Lines • Show Last 20 Lines

llvm/lib/Target/X86/X86InstrInfo.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 2,911 Lines • ▼ Show 20 Lines	if (LV) { // Update live variables
if (Dest.isDead())		if (Dest.isDead())
LV->replaceKillInstruction(Dest.getReg(), MI, NewMI);		LV->replaceKillInstruction(Dest.getReg(), MI, NewMI);
}		}

MFI->insert(MBBI, NewMI); // Insert the new inst		MFI->insert(MBBI, NewMI); // Insert the new inst
return NewMI;		return NewMI;
}		}

		/// Returns true if the given instruction opcode is FMA3.
		/// Otherwise, returns false.
		static bool isFMA3(unsigned Opcode) {
		switch (Opcode) {
		case X86::VFMADDSDr132r: case X86::VFMADDSDr132m:
		case X86::VFMADDSSr132r: case X86::VFMADDSSr132m:
		case X86::VFMSUBSDr132r: case X86::VFMSUBSDr132m:
		case X86::VFMSUBSSr132r: case X86::VFMSUBSSr132m:
		case X86::VFNMADDSDr132r: case X86::VFNMADDSDr132m:
		case X86::VFNMADDSSr132r: case X86::VFNMADDSSr132m:
		case X86::VFNMSUBSDr132r: case X86::VFNMSUBSDr132m:
		case X86::VFNMSUBSSr132r: case X86::VFNMSUBSSr132m:

		case X86::VFMADDSDr213r: case X86::VFMADDSDr213m:
		case X86::VFMADDSSr213r: case X86::VFMADDSSr213m:
		case X86::VFMSUBSDr213r: case X86::VFMSUBSDr213m:
		case X86::VFMSUBSSr213r: case X86::VFMSUBSSr213m:
		case X86::VFNMADDSDr213r: case X86::VFNMADDSDr213m:
		case X86::VFNMADDSSr213r: case X86::VFNMADDSSr213m:
		case X86::VFNMSUBSDr213r: case X86::VFNMSUBSDr213m:
		case X86::VFNMSUBSSr213r: case X86::VFNMSUBSSr213m:

		case X86::VFMADDSDr231r: case X86::VFMADDSDr231m:
		case X86::VFMADDSSr231r: case X86::VFMADDSSr231m:
		case X86::VFMSUBSDr231r: case X86::VFMSUBSDr231m:
		case X86::VFMSUBSSr231r: case X86::VFMSUBSSr231m:
		case X86::VFNMADDSDr231r: case X86::VFNMADDSDr231m:
		case X86::VFNMADDSSr231r: case X86::VFNMADDSSr231m:
		case X86::VFNMSUBSDr231r: case X86::VFNMSUBSDr231m:
		case X86::VFNMSUBSSr231r: case X86::VFNMSUBSSr231m:

		case X86::VFMADDSUBPDr132r: case X86::VFMADDSUBPDr132m:
		case X86::VFMADDSUBPSr132r: case X86::VFMADDSUBPSr132m:
		case X86::VFMSUBADDPDr132r: case X86::VFMSUBADDPDr132m:
		case X86::VFMSUBADDPSr132r: case X86::VFMSUBADDPSr132m:
		case X86::VFMADDSUBPDr132rY: case X86::VFMADDSUBPDr132mY:
		case X86::VFMADDSUBPSr132rY: case X86::VFMADDSUBPSr132mY:
		case X86::VFMSUBADDPDr132rY: case X86::VFMSUBADDPDr132mY:
		case X86::VFMSUBADDPSr132rY: case X86::VFMSUBADDPSr132mY:

		case X86::VFMADDPDr132r: case X86::VFMADDPDr132m:
		case X86::VFMADDPSr132r: case X86::VFMADDPSr132m:
		case X86::VFMSUBPDr132r: case X86::VFMSUBPDr132m:
		case X86::VFMSUBPSr132r: case X86::VFMSUBPSr132m:
		case X86::VFNMADDPDr132r: case X86::VFNMADDPDr132m:
		case X86::VFNMADDPSr132r: case X86::VFNMADDPSr132m:
		case X86::VFNMSUBPDr132r: case X86::VFNMSUBPDr132m:
		case X86::VFNMSUBPSr132r: case X86::VFNMSUBPSr132m:
		case X86::VFMADDPDr132rY: case X86::VFMADDPDr132mY:
		case X86::VFMADDPSr132rY: case X86::VFMADDPSr132mY:
		case X86::VFMSUBPDr132rY: case X86::VFMSUBPDr132mY:
		case X86::VFMSUBPSr132rY: case X86::VFMSUBPSr132mY:
		case X86::VFNMADDPDr132rY: case X86::VFNMADDPDr132mY:
		case X86::VFNMADDPSr132rY: case X86::VFNMADDPSr132mY:
		case X86::VFNMSUBPDr132rY: case X86::VFNMSUBPDr132mY:
		case X86::VFNMSUBPSr132rY: case X86::VFNMSUBPSr132mY:

		case X86::VFMADDSUBPDr213r: case X86::VFMADDSUBPDr213m:
		case X86::VFMADDSUBPSr213r: case X86::VFMADDSUBPSr213m:
		case X86::VFMSUBADDPDr213r: case X86::VFMSUBADDPDr213m:
		case X86::VFMSUBADDPSr213r: case X86::VFMSUBADDPSr213m:
		case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPDr213mY:
		case X86::VFMADDSUBPSr213rY: case X86::VFMADDSUBPSr213mY:
		case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPDr213mY:
		case X86::VFMSUBADDPSr213rY: case X86::VFMSUBADDPSr213mY:

		case X86::VFMADDPDr213r: case X86::VFMADDPDr213m:
		case X86::VFMADDPSr213r: case X86::VFMADDPSr213m:
		case X86::VFMSUBPDr213r: case X86::VFMSUBPDr213m:
		case X86::VFMSUBPSr213r: case X86::VFMSUBPSr213m:
		case X86::VFNMADDPDr213r: case X86::VFNMADDPDr213m:
		case X86::VFNMADDPSr213r: case X86::VFNMADDPSr213m:
		case X86::VFNMSUBPDr213r: case X86::VFNMSUBPDr213m:
		case X86::VFNMSUBPSr213r: case X86::VFNMSUBPSr213m:
		case X86::VFMADDPDr213rY: case X86::VFMADDPDr213mY:
		case X86::VFMADDPSr213rY: case X86::VFMADDPSr213mY:
		case X86::VFMSUBPDr213rY: case X86::VFMSUBPDr213mY:
		case X86::VFMSUBPSr213rY: case X86::VFMSUBPSr213mY:
		case X86::VFNMADDPDr213rY: case X86::VFNMADDPDr213mY:
		case X86::VFNMADDPSr213rY: case X86::VFNMADDPSr213mY:
		case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPDr213mY:
		case X86::VFNMSUBPSr213rY: case X86::VFNMSUBPSr213mY:

		case X86::VFMADDSUBPDr231r: case X86::VFMADDSUBPDr231m:
		case X86::VFMADDSUBPSr231r: case X86::VFMADDSUBPSr231m:
		case X86::VFMSUBADDPDr231r: case X86::VFMSUBADDPDr231m:
		case X86::VFMSUBADDPSr231r: case X86::VFMSUBADDPSr231m:
		case X86::VFMADDSUBPDr231rY: case X86::VFMADDSUBPDr231mY:
		case X86::VFMADDSUBPSr231rY: case X86::VFMADDSUBPSr231mY:
		case X86::VFMSUBADDPDr231rY: case X86::VFMSUBADDPDr231mY:
		case X86::VFMSUBADDPSr231rY: case X86::VFMSUBADDPSr231mY:

		case X86::VFMADDPDr231r: case X86::VFMADDPDr231m:
		case X86::VFMADDPSr231r: case X86::VFMADDPSr231m:
		case X86::VFMSUBPDr231r: case X86::VFMSUBPDr231m:
		case X86::VFMSUBPSr231r: case X86::VFMSUBPSr231m:
		case X86::VFNMADDPDr231r: case X86::VFNMADDPDr231m:
		case X86::VFNMADDPSr231r: case X86::VFNMADDPSr231m:
		case X86::VFNMSUBPDr231r: case X86::VFNMSUBPDr231m:
		case X86::VFNMSUBPSr231r: case X86::VFNMSUBPSr231m:
		case X86::VFMADDPDr231rY: case X86::VFMADDPDr231mY:
		case X86::VFMADDPSr231rY: case X86::VFMADDPSr231mY:
		case X86::VFMSUBPDr231rY: case X86::VFMSUBPDr231mY:
		case X86::VFMSUBPSr231rY: case X86::VFMSUBPSr231mY:
		case X86::VFNMADDPDr231rY: case X86::VFNMADDPDr231mY:
		case X86::VFNMADDPSr231rY: case X86::VFNMADDPSr231mY:
		case X86::VFNMSUBPDr231rY: case X86::VFNMSUBPDr231mY:
		case X86::VFNMSUBPSr231rY: case X86::VFNMSUBPSr231mY:
		return true;
		default:
		break;
		qcolombetUnsubmitted Not Done Reply Inline Actions return false; qcolombet: return false;
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions Fixed. v_klochkov: Fixed.
		}
		return false;
		qcolombetUnsubmitted Not Done Reply Inline Actions llvm_unreachable("Opcode not handled by the switch") qcolombet: llvm_unreachable("Opcode not handled by the switch")
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions Fixed. v_klochkov: Fixed.
		}

MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI,		MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI,
bool NewMI,		bool NewMI,
unsigned OpIdx1,		unsigned OpIdx1,
unsigned OpIdx2) const {		unsigned OpIdx2) const {
switch (MI->getOpcode()) {		switch (MI->getOpcode()) {
case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)		case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)		case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)		case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
▲ Show 20 Lines • Show All 194 Lines • ▼ Show 20 Lines	if (NewMI) {
MachineFunction &MF = *MI->getParent()->getParent();		MachineFunction &MF = *MI->getParent()->getParent();
MI = MF.CloneMachineInstr(MI);		MI = MF.CloneMachineInstr(MI);
NewMI = false;		NewMI = false;
}		}
MI->setDesc(get(Opc));		MI->setDesc(get(Opc));
// Fallthrough intended.		// Fallthrough intended.
}		}
default:		default:
		if (isFMA3(MI->getOpcode())) {
		unsigned Opc = getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2);
		if (Opc == 0)
		return nullptr;
		if (NewMI) {
		MachineFunction &MF = *MI->getParent()->getParent();
		MI = MF.CloneMachineInstr(MI);
		NewMI = false;
		}
		MI->setDesc(get(Opc));
		}
return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);		return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
}		}
}		}

		bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr *MI,
		unsigned &SrcOpIdx1,
		unsigned &SrcOpIdx2) const {

		unsigned RegOpsNum = isMem(MI, 3) ? 2 : 3;

		// Only the first RegOpsNum operands are commutable.
		// Also, the value 'CommuteAnyOperandIndex' is valid here as it means
		// that the operand is not specified/fixed.
		if (SrcOpIdx1 < 1 \|\|
		(SrcOpIdx1 > RegOpsNum && SrcOpIdx1 != CommuteAnyOperandIndex) \|\|
		SrcOpIdx2 < 1 \|\|
		(SrcOpIdx2 > RegOpsNum && SrcOpIdx2 != CommuteAnyOperandIndex))
		qcolombetUnsubmitted Not Done Reply Inline Actions I would rewrite the check to make the CommuteAnyOperandIndex the first discrimination: if (SrcOpIdx1 != CommuteAnyOperandIndex && (SrcOpIdx1 < 1 \|\| SrcOpIdx1 > RegOpsNum)) return false; Same for SrcOpIdx2. qcolombet: I would rewrite the check to make the CommuteAnyOperandIndex the first discrimination: if…
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions I agree, your version looks a bit more clear. Fixed. v_klochkov: I agree, your version looks a bit more clear. Fixed.
		return false;

		if (SrcOpIdx1 == CommuteAnyOperandIndex \|\|
		SrcOpIdx2 == CommuteAnyOperandIndex) {
		qcolombetUnsubmitted Not Done Reply Inline Actions Add a comment saying that we look for two register operands, those are the ones that can be commuted regardless of the FMA opcode. We will adjust the opcode later. qcolombet: Add a comment saying that we look for two register operands, those are the ones that can be…
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions Ok, I added a comment. v_klochkov: Ok, I added a comment.
		unsigned CommutableOpIdx1 = SrcOpIdx1;
		unsigned CommutableOpIdx2 = SrcOpIdx2;

		// At least one of the operands to be commuted is not specified, so
		// this method is free to choose appropriate commutable operands.
		if (SrcOpIdx1 == SrcOpIdx2)
		// Both of operands are not fixed. By default set one of commutable
		// operands to the last operand of the instruction.
		qcolombetUnsubmitted Not Done Reply Inline Actions the last register operand of the instruction. qcolombet: the last register operand of the instruction.
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions It is interesting that I added the word "register" here when I made changes for your previous comment and only then noticed this comment asking me to make exactly the same change. Fixed. v_klochkov: It is interesting that I added the word "register" here when I made changes for your previous…
		CommutableOpIdx2 = RegOpsNum;
		else if (SrcOpIdx2 == CommuteAnyOperandIndex)
		// Only one of operands is not fixed.
		CommutableOpIdx2 = SrcOpIdx1;

		// CommutableOpIdx2 is well defined now. Let's choose another commutable
		// operand and assign its index to CommutableOpIdx1.
		unsigned Op2Reg = MI->getOperand(CommutableOpIdx2).getReg();
		for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) {
		// The commuted operands must have different registers.
		// Otherwise, the commute transformation would not change anything
		// and would be useless.
		if (Op2Reg != MI->getOperand(CommutableOpIdx1).getReg())
		break;
		}

		// No appropriate commutable operands were found.
		if (CommutableOpIdx1 == 0)
		return false;

		// Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpIdx2
		// to return those values.
		if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
		CommutableOpIdx1, CommutableOpIdx2))
		return false;
		}
		return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2) != 0;
		qcolombetUnsubmitted Not Done Reply Inline Actions Add a comment along the line: // Check if we can adjust the opcode so that the registers we change preserve the semantic. qcolombet: Add a comment along the line: // Check if we can adjust the opcode so that the registers we…
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions Ok, added a comment. v_klochkov: Ok, added a comment.
		}

		unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI,
		unsigned SrcOpIdx1,
		unsigned SrcOpIdx2) const {
		int RetOpc = 0;
		int Opc = MI->getOpcode();

		// Struct describing FMA opcodes and dependencies between them.
		static const struct {
		qcolombetUnsubmitted Not Done Reply Inline Actions Please explain the structure you are using here. In particular, what are those dependencies and how do you represent them. qcolombet: Please explain the structure you are using here. In particular, what are those dependencies and…
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions This is just an array after I removed IsScalar property. I changed the comment to make it more clear. v_klochkov: This is just an array after I removed IsScalar property. I changed the comment to make it more…
		int Opc1;
		int Opc2;
		int Opc3;
		bool IsScalar;
		} OpcodeAlts[] = {
		{ X86::VFMADDSSr132r, X86::VFMADDSSr213r, X86::VFMADDSSr231r, true },
		qcolombetUnsubmitted Done Reply Inline Actions Excellent! qcolombet: Excellent!
		{ X86::VFMADDSDr132r, X86::VFMADDSDr213r, X86::VFMADDSDr231r, true },
		{ X86::VFMADDPSr132r, X86::VFMADDPSr213r, X86::VFMADDPSr231r, false },
		{ X86::VFMADDPDr132r, X86::VFMADDPDr213r, X86::VFMADDPDr231r, false },
		{ X86::VFMADDPSr132rY, X86::VFMADDPSr213rY, X86::VFMADDPSr231rY,false },
		{ X86::VFMADDPDr132rY, X86::VFMADDPDr213rY, X86::VFMADDPDr231rY,false },
		{ X86::VFMADDSSr132m, X86::VFMADDSSr213m, X86::VFMADDSSr231m, true },
		{ X86::VFMADDSDr132m, X86::VFMADDSDr213m, X86::VFMADDSDr231m, true },
		{ X86::VFMADDPSr132m, X86::VFMADDPSr213m, X86::VFMADDPSr231m, false },
		{ X86::VFMADDPDr132m, X86::VFMADDPDr213m, X86::VFMADDPDr231m, false },
		{ X86::VFMADDPSr132mY, X86::VFMADDPSr213mY, X86::VFMADDPSr231mY,false },
		{ X86::VFMADDPDr132mY, X86::VFMADDPDr213mY, X86::VFMADDPDr231mY,false },

		{ X86::VFMSUBSSr132r, X86::VFMSUBSSr213r, X86::VFMSUBSSr231r, true },
		{ X86::VFMSUBSDr132r, X86::VFMSUBSDr213r, X86::VFMSUBSDr231r, true },
		{ X86::VFMSUBPSr132r, X86::VFMSUBPSr213r, X86::VFMSUBPSr231r, false },
		{ X86::VFMSUBPDr132r, X86::VFMSUBPDr213r, X86::VFMSUBPDr231r, false },
		{ X86::VFMSUBPSr132rY, X86::VFMSUBPSr213rY, X86::VFMSUBPSr231rY,false },
		{ X86::VFMSUBPDr132rY, X86::VFMSUBPDr213rY, X86::VFMSUBPDr231rY,false },
		{ X86::VFMSUBSSr132m, X86::VFMSUBSSr213m, X86::VFMSUBSSr231m, true },
		{ X86::VFMSUBSDr132m, X86::VFMSUBSDr213m, X86::VFMSUBSDr231m, true },
		{ X86::VFMSUBPSr132m, X86::VFMSUBPSr213m, X86::VFMSUBPSr231m, false },
		{ X86::VFMSUBPDr132m, X86::VFMSUBPDr213m, X86::VFMSUBPDr231m, false },
		{ X86::VFMSUBPSr132mY, X86::VFMSUBPSr213mY, X86::VFMSUBPSr231mY,false },
		{ X86::VFMSUBPDr132mY, X86::VFMSUBPDr213mY, X86::VFMSUBPDr231mY,false },

		{ X86::VFNMADDSSr132r, X86::VFNMADDSSr213r, X86::VFNMADDSSr231r, true },
		{ X86::VFNMADDSDr132r, X86::VFNMADDSDr213r, X86::VFNMADDSDr231r, true },
		{ X86::VFNMADDPSr132r, X86::VFNMADDPSr213r, X86::VFNMADDPSr231r, false },
		{ X86::VFNMADDPDr132r, X86::VFNMADDPDr213r, X86::VFNMADDPDr231r, false },
		{ X86::VFNMADDPSr132rY, X86::VFNMADDPSr213rY, X86::VFNMADDPSr231rY,false },
		{ X86::VFNMADDPDr132rY, X86::VFNMADDPDr213rY, X86::VFNMADDPDr231rY,false },
		{ X86::VFNMADDSSr132m, X86::VFNMADDSSr213m, X86::VFNMADDSSr231m, true },
		{ X86::VFNMADDSDr132m, X86::VFNMADDSDr213m, X86::VFNMADDSDr231m, true },
		{ X86::VFNMADDPSr132m, X86::VFNMADDPSr213m, X86::VFNMADDPSr231m, false },
		{ X86::VFNMADDPDr132m, X86::VFNMADDPDr213m, X86::VFNMADDPDr231m, false },
		{ X86::VFNMADDPSr132mY, X86::VFNMADDPSr213mY, X86::VFNMADDPSr231mY,false },
		{ X86::VFNMADDPDr132mY, X86::VFNMADDPDr213mY, X86::VFNMADDPDr231mY,false },

		{ X86::VFNMSUBSSr132r, X86::VFNMSUBSSr213r, X86::VFNMSUBSSr231r, true },
		{ X86::VFNMSUBSDr132r, X86::VFNMSUBSDr213r, X86::VFNMSUBSDr231r, true },
		{ X86::VFNMSUBPSr132r, X86::VFNMSUBPSr213r, X86::VFNMSUBPSr231r, false },
		{ X86::VFNMSUBPDr132r, X86::VFNMSUBPDr213r, X86::VFNMSUBPDr231r, false },
		{ X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr231rY,false },
		{ X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr231rY,false },
		{ X86::VFNMSUBSSr132m, X86::VFNMSUBSSr213m, X86::VFNMSUBSSr231m, true },
		{ X86::VFNMSUBSDr132m, X86::VFNMSUBSDr213m, X86::VFNMSUBSDr231m, true },
		{ X86::VFNMSUBPSr132m, X86::VFNMSUBPSr213m, X86::VFNMSUBPSr231m, false },
		{ X86::VFNMSUBPDr132m, X86::VFNMSUBPDr213m, X86::VFNMSUBPDr231m, false },
		{ X86::VFNMSUBPSr132mY, X86::VFNMSUBPSr213mY, X86::VFNMSUBPSr231mY,false },
		{ X86::VFNMSUBPDr132mY, X86::VFNMSUBPDr213mY, X86::VFNMSUBPDr231mY,false },

		{ X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr231r, false },
		{ X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr231r, false },
		{ X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr231rY,false },
		{ X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr231rY,false },
		{ X86::VFMADDSUBPSr132m, X86::VFMADDSUBPSr213m, X86::VFMADDSUBPSr231m, false },
		{ X86::VFMADDSUBPDr132m, X86::VFMADDSUBPDr213m, X86::VFMADDSUBPDr231m, false },
		{ X86::VFMADDSUBPSr132mY, X86::VFMADDSUBPSr213mY, X86::VFMADDSUBPSr231mY,false },
		{ X86::VFMADDSUBPDr132mY, X86::VFMADDSUBPDr213mY, X86::VFMADDSUBPDr231mY,false },

		{ X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr231r, false },
		{ X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr231r, false },
		{ X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr231rY,false },
		{ X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr231rY,false },
		{ X86::VFMSUBADDPSr132m, X86::VFMSUBADDPSr213m, X86::VFMSUBADDPSr231m, false },
		{ X86::VFMSUBADDPDr132m, X86::VFMSUBADDPDr213m, X86::VFMSUBADDPDr231m, false },
		{ X86::VFMSUBADDPSr132mY, X86::VFMSUBADDPSr213mY, X86::VFMSUBADDPSr231mY,false },
		{ X86::VFMSUBADDPDr132mY, X86::VFMSUBADDPDr213mY, X86::VFMSUBADDPDr231mY,false }
		};

		unsigned char OpcodeAltsNum = sizeof(OpcodeAlts) / sizeof(OpcodeAlts[0]);
		int i, pos = 0;
		for (i = 0; i < OpcodeAltsNum; i++) {
		if (OpcodeAlts[i].Opc2 == Opc) {
		pos = 2;
		qcolombetUnsubmitted Done Reply Inline Actions Maybe put three and adapt the comparison. qcolombet: Maybe put three and adapt the comparison.
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions Ok, but using 3 instead of 2 required renaming LastFormIndex to FormsNum. v_klochkov: Ok, but using 3 instead of 2 required renaming LastFormIndex to FormsNum.
		break;
		}
		if (OpcodeAlts[i].Opc1 == Opc) {
		qcolombetUnsubmitted Done Reply Inline Actions Initialize FormIndex to LastFormIndex. qcolombet: Initialize FormIndex to LastFormIndex.
		pos = 1;
		qcolombetUnsubmitted Done Reply Inline Actions Use for (GroupIndex = 0; GroupIndex < OpcodeGroupsNum && LastFormIndex != LastFormIndex; GroupIndex++) qcolombet: Use for (GroupIndex = 0; GroupIndex < OpcodeGroupsNum && LastFormIndex != LastFormIndex…
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions Ok. BTW, this change required a fix for GroupIndex after the loop (do GroupIndex-- once). v_klochkov: Ok. BTW, this change required a fix for GroupIndex after the loop (do GroupIndex-- once).
		break;
		qcolombetUnsubmitted Done Reply Inline Actions Use < LastFormIndex qcolombet: Use < LastFormIndex
		}
		if (OpcodeAlts[i].Opc3 == Opc) {
		pos = 3;
		break;
		}
		}
		qcolombetUnsubmitted Done Reply Inline Actions Remove this block. qcolombet: Remove this block.

		// Input opcode does not match with any opcode from the table.
		if (pos == 0)
		return 0;
		qcolombetUnsubmitted Done Reply Inline Actions Use == qcolombet: Use ==

		// FIXME: Commuting the 1st operand of scalar FMA requires some additional
		// analysis such as getting proof of the fact that all uses of the
		// given FMA instruction use only the lowest element. Without that
		// proof, commuting the 1st operand of a scalar FMA is unsafe because
		// it changes the upper bits of the result.
		qcolombetUnsubmitted Done Reply Inline Actions No need for {} qcolombet: No need for {}
		if (OpcodeAlts[i].IsScalar && (SrcOpIdx1 == 1 \|\| SrcOpIdx2 == 1))
		return 0;
		qcolombetUnsubmitted Not Done Reply Inline Actions Get rid of this check. This is a hack to workaround a bug. The bug should be fixed independently of the improvement for the lowering for commutable operands. qcolombet: Get rid of this check. This is a hack to workaround a bug. The bug should be fixed…
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions I removed this check from this change-set and updated the OpcodeAlts struct. BTW, I would not call this check a hack. It was rather a pessimistic correctness check. v_klochkov: I removed this check from this change-set and updated the OpcodeAlts struct. BTW, I would not…

		// Find reversed FMA opcode.
		if ((SrcOpIdx1 == 1 && SrcOpIdx2 == 2) \|\|
		qcolombetUnsubmitted Not Done Reply Inline Actions Canonicalize SrcOpIdx1 and SrcOpIdx2 to avoid these duplicated checks. qcolombet: Canonicalize SrcOpIdx1 and SrcOpIdx2 to avoid these duplicated checks.
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions Ok, Fixed. SrcOpIdx1 has the lowest index now to simplify the next checks. v_klochkov: Ok, Fixed. SrcOpIdx1 has the lowest index now to simplify the next checks.
		(SrcOpIdx1 == 2 && SrcOpIdx2 == 1)) {
		if (pos == 1)
		qcolombetUnsubmitted Done Reply Inline Actions I like the comments! Keep them, but I believe we can refactor the code a bit. What you basically have is given the indices, you map each FormIndex to a new FormIndex and you have three choices each time. It looks to me like we could save this table statically like static const unsigned Mapping[][3] = { // SrcOpIdx1 == 1 && SrcOpIdx2 == 2 // FMA132 A, C, b; ==> FMA231 C, A, b; // FMA213 B, A, c; ==> FMA213 A, B, c; // FMA231 C, A, b; ==> FMA132 A, C, b; { Form231Index, Form213Index, Form132Index }, // (SrcOpIdx1 == 1 && SrcOpIdx2 == 3) // etc. // (SrcOpIdx1 == 2 && SrcOpIdx2 == 3) }; unsigned Case; if (SrcOpIdx1 == 1 && SrcOpIdx2 == 2) Case = 0; else if (SrcOpIdx1 == 1 && SrcOpIdx2 == 3) Case = 1; else if (SrcOpIdx1 == 2 && SrcOpIdx2 == 3) Case = 2; else assert("Invalid commutable indices for FMA"); // or return 0; RetOpc = Mapping[Case][FormIndex]; qcolombet: I like the comments! Keep them, but I believe we can refactor the code a bit. What you…
		v_klochkovAuthorUnsubmitted Not Done Reply Inline Actions This is a great idea! I did the corresponding changes. v_klochkov: This is a great idea! I did the corresponding changes.
		RetOpc = OpcodeAlts[i].Opc3;
		else if (pos == 2)
		RetOpc = Opc;
		else
		RetOpc = OpcodeAlts[i].Opc1;
		} else if ((SrcOpIdx1 == 1 && SrcOpIdx2 == 3) \|\|
		(SrcOpIdx1 == 3 && SrcOpIdx2 == 1)) {
		if (pos == 1)
		RetOpc = Opc;
		else if (pos == 2)
		RetOpc = OpcodeAlts[i].Opc3;
		else
		RetOpc = OpcodeAlts[i].Opc2;
		} else if ((SrcOpIdx1 == 2 && SrcOpIdx2 == 3) \|\|
		(SrcOpIdx1 == 3 && SrcOpIdx2 == 2)) {
		if (pos == 1)
		RetOpc = OpcodeAlts[i].Opc2;
		else if (pos == 2)
		RetOpc = OpcodeAlts[i].Opc1;
		else
		RetOpc = Opc;
		}

		return RetOpc;
		}

bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI,		bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI,
unsigned &SrcOpIdx1,		unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const {		unsigned &SrcOpIdx2) const {
switch (MI->getOpcode()) {		switch (MI->getOpcode()) {
case X86::CMPPDrri:		case X86::CMPPDrri:
case X86::CMPPSrri:		case X86::CMPPSrri:
case X86::VCMPPDrri:		case X86::VCMPPDrri:
case X86::VCMPPSrri:		case X86::VCMPPSrri:
case X86::VCMPPDYrri:		case X86::VCMPPDYrri:
case X86::VCMPPSYrri: {		case X86::VCMPPSYrri: {
// Float comparison can be safely commuted for		// Float comparison can be safely commuted for
// Ordered/Unordered/Equal/NotEqual tests		// Ordered/Unordered/Equal/NotEqual tests
unsigned Imm = MI->getOperand(3).getImm() & 0x7;		unsigned Imm = MI->getOperand(3).getImm() & 0x7;
switch (Imm) {		switch (Imm) {
case 0x00: // EQUAL		case 0x00: // EQUAL
case 0x03: // UNORDERED		case 0x03: // UNORDERED
case 0x04: // NOT EQUAL		case 0x04: // NOT EQUAL
case 0x07: // ORDERED		case 0x07: // ORDERED
// The indices of the commutable operands are 1 and 2.		// The indices of the commutable operands are 1 and 2.
// Assign them to the returned operand indices here.		// Assign them to the returned operand indices here.
return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2);		return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2);
}		}
return false;		return false;
}		}
case X86::VFMADDPDr231r:
case X86::VFMADDPSr231r:
case X86::VFMADDSDr231r:
case X86::VFMADDSSr231r:
case X86::VFMSUBPDr231r:
case X86::VFMSUBPSr231r:
case X86::VFMSUBSDr231r:
case X86::VFMSUBSSr231r:
case X86::VFNMADDPDr231r:
case X86::VFNMADDPSr231r:
case X86::VFNMADDSDr231r:
case X86::VFNMADDSSr231r:
case X86::VFNMSUBPDr231r:
case X86::VFNMSUBPSr231r:
case X86::VFNMSUBSDr231r:
case X86::VFNMSUBSSr231r:
case X86::VFMADDPDr231rY:
case X86::VFMADDPSr231rY:
case X86::VFMSUBPDr231rY:
case X86::VFMSUBPSr231rY:
case X86::VFNMADDPDr231rY:
case X86::VFNMADDPSr231rY:
case X86::VFNMSUBPDr231rY:
case X86::VFNMSUBPSr231rY:
// The indices of the commutable operands are 2 and 3.
// Assign them to the returned operand indices here.
return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3);
default:		default:
		if (isFMA3(MI->getOpcode()))
		return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);		return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
}		}
return false;		return false;
}		}

static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {		static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
switch (BrOpc) {		switch (BrOpc) {
default: return X86::COND_INVALID;		default: return X86::COND_INVALID;
▲ Show 20 Lines • Show All 3,492 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/fma-commute-x86.ll

				; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s
				; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma,+fma4 | FileCheck %s
				; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s

				declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
				[Inline comment — RKSimon, not done] Please can you regenerate with update_llc_test_checks.py to ensure that no extra mov or other instructions are being included.
				define <4 x float> @test_x86_fmadd_baa_ps(<4 x float> %a, <4 x float> %b) {
				; CHECK: fmadd132ps {{.%r.}}, %xmm0, %xmm0
				%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
				ret <4 x float> %res
				}

				define <4 x float> @test_x86_fmadd_aba_ps(<4 x float> %a, <4 x float> %b) {
				; CHECK: fmadd231ps {{.%r.}}, %xmm0, %xmm0
				%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
				ret <4 x float> %res
				}

				define <4 x float> @test_x86_fmadd_bba_ps(<4 x float> %a, <4 x float> %b) {
				; CHECK: fmadd213ps {{.%r.}}, %xmm0, %xmm0
				%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
				ret <4 x float> %res
				}

				declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
				define <8 x float> @test_x86_fmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) {
				; CHECK: fmadd132ps {{.%r.}}, %ymm0, %ymm0
				%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
				ret <8 x float> %res
				}

				define <8 x float> @test_x86_fmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) {
				; CHECK: fmadd231ps {{.%r.}}, %ymm0, %ymm0
				%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
				ret <8 x float> %res
				}

				define <8 x float> @test_x86_fmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) {
				; CHECK: fmadd213ps {{.%r.}}, %ymm0, %ymm0
				%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
				ret <8 x float> %res
				}

				declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
				define <2 x double> @test_x86_fmadd_baa_pd(<2 x double> %a, <2 x double> %b) {
				; CHECK: fmadd132pd {{.%r.}}, %xmm0, %xmm0
				%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
				ret <2 x double> %res
				}

				define <2 x double> @test_x86_fmadd_aba_pd(<2 x double> %a, <2 x double> %b) {
				; CHECK: fmadd231pd {{.%r.}}, %xmm0, %xmm0
				%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
				ret <2 x double> %res
				}

				define <2 x double> @test_x86_fmadd_bba_pd(<2 x double> %a, <2 x double> %b) {
				; CHECK: fmadd213pd {{.%r.}}, %xmm0, %xmm0
				%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
				ret <2 x double> %res
				}

				declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
				define <4 x double> @test_x86_fmadd_baa_pd_y(<4 x double> %a, <4 x double> %b) {
				; CHECK: fmadd132pd {{.%r.}}, %ymm0, %ymm0
				%res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
				ret <4 x double> %res
				}

				define <4 x double> @test_x86_fmadd_aba_pd_y(<4 x double> %a, <4 x double> %b) {
				; CHECK: fmadd231pd {{.%r.}}, %ymm0, %ymm0
				%res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
				ret <4 x double> %res
				}

				define <4 x double> @test_x86_fmadd_bba_pd_y(<4 x double> %a, <4 x double> %b) {
				; CHECK: fmadd213pd {{.%r.}}, %ymm0, %ymm0
				%res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
				ret <4 x double> %res
				}



				declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
				define <4 x float> @test_x86_fnmadd_baa_ps(<4 x float> %a, <4 x float> %b) {
				; CHECK: fnmadd132ps {{.%r.}}, %xmm0, %xmm0
				%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
				ret <4 x float> %res
				}

				define <4 x float> @test_x86_fnmadd_aba_ps(<4 x float> %a, <4 x float> %b) {
				; CHECK: fnmadd231ps {{.%r.}}, %xmm0, %xmm0
				%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
				ret <4 x float> %res
				}

				define <4 x float> @test_x86_fnmadd_bba_ps(<4 x float> %a, <4 x float> %b) {
				; CHECK: fnmadd213ps {{.%r.}}, %xmm0, %xmm0
				%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
				ret <4 x float> %res
				}

				declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
				define <8 x float> @test_x86_fnmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) {
				; CHECK: fnmadd132ps {{.%r.}}, %ymm0, %ymm0
				%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
				ret <8 x float> %res
				}

				define <8 x float> @test_x86_fnmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) {
				; CHECK: fnmadd231ps {{.%r.}}, %ymm0, %ymm0
				%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
				ret <8 x float> %res
				}

				define <8 x float> @test_x86_fnmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) {
				; CHECK: fnmadd213ps {{.%r.}}, %ymm0, %ymm0
				%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
				ret <8 x float> %res
				}

				declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
				define <2 x double> @test_x86_fnmadd_baa_pd(<2 x double> %a, <2 x double> %b) {
				; CHECK: fnmadd132pd {{.%r.}}, %xmm0, %xmm0
				%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
				ret <2 x double> %res
				}

				define <2 x double> @test_x86_fnmadd_aba_pd(<2 x double> %a, <2 x double> %b) {
				; CHECK: fnmadd231pd {{.%r.}}, %xmm0, %xmm0
				%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
				ret <2 x double> %res
				}

				define <2 x double> @test_x86_fnmadd_bba_pd(<2 x double> %a, <2 x double> %b) {
				; CHECK: fnmadd213pd {{.%r.}}, %xmm0, %xmm0
				%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
				ret <2 x double> %res
				}

				declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
				define <4 x double> @test_x86_fnmadd_baa_pd_y(<4 x double> %a, <4 x double> %b) {
				; CHECK: fnmadd132pd {{.%r.}}, %ymm0, %ymm0
				%res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
				ret <4 x double> %res
				}

				define <4 x double> @test_x86_fnmadd_aba_pd_y(<4 x double> %a, <4 x double> %b) {
				; CHECK: fnmadd231pd {{.%r.}}, %ymm0, %ymm0
				%res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
				ret <4 x double> %res
				}

				define <4 x double> @test_x86_fnmadd_bba_pd_y(<4 x double> %a, <4 x double> %b) {
				; CHECK: fnmadd213pd {{.%r.}}, %ymm0, %ymm0
				%res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
				ret <4 x double> %res
				}


				declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
				define <4 x float> @test_x86_fmsub_baa_ps(<4 x float> %a, <4 x float> %b) {
				; CHECK: fmsub132ps {{.%r.}}, %xmm0, %xmm0
				%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
				ret <4 x float> %res
				}

				define <4 x float> @test_x86_fmsub_aba_ps(<4 x float> %a, <4 x float> %b) {
				; CHECK: fmsub231ps {{.%r.}}, %xmm0, %xmm0
				%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
				ret <4 x float> %res
				}

				define <4 x float> @test_x86_fmsub_bba_ps(<4 x float> %a, <4 x float> %b) {
				; CHECK: fmsub213ps {{.%r.}}, %xmm0, %xmm0
				%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
				ret <4 x float> %res
				}

				declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
				define <8 x float> @test_x86_fmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) {
				; CHECK: fmsub132ps {{.%r.}}, %ymm0, %ymm0
				%res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
				ret <8 x float> %res
				}

				define <8 x float> @test_x86_fmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) {
				; CHECK: fmsub231ps {{.%r.}}, %ymm0, %ymm0
				%res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
				ret <8 x float> %res
				}

				define <8 x float> @test_x86_fmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) {
				; CHECK: fmsub213ps {{.%r.}}, %ymm0, %ymm0
				%res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
				ret <8 x float> %res
				}

				declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
				define <2 x double> @test_x86_fmsub_baa_pd(<2 x double> %a, <2 x double> %b) {
				; CHECK: fmsub132pd {{.%r.}}, %xmm0, %xmm0
				%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
				ret <2 x double> %res
				}

				define <2 x double> @test_x86_fmsub_aba_pd(<2 x double> %a, <2 x double> %b) {
				; CHECK: fmsub231pd {{.%r.}}, %xmm0, %xmm0
				%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
				ret <2 x double> %res
				}

				define <2 x double> @test_x86_fmsub_bba_pd(<2 x double> %a, <2 x double> %b) {
				; CHECK: fmsub213pd {{.%r.}}, %xmm0, %xmm0
				%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
				ret <2 x double> %res
				}

				declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
				define <4 x double> @test_x86_fmsub_baa_pd_y(<4 x double> %a, <4 x double> %b) {
				; CHECK: fmsub132pd {{.%r.}}, %ymm0, %ymm0
				%res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
				ret <4 x double> %res
				}

				define <4 x double> @test_x86_fmsub_aba_pd_y(<4 x double> %a, <4 x double> %b) {
				; CHECK: fmsub231pd {{.%r.}}, %ymm0, %ymm0
				%res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
				ret <4 x double> %res
				}

				define <4 x double> @test_x86_fmsub_bba_pd_y(<4 x double> %a, <4 x double> %b) {
				; CHECK: fmsub213pd {{.%r.}}, %ymm0, %ymm0
				%res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
				ret <4 x double> %res
				}


				declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
				define <4 x float> @test_x86_fnmsub_baa_ps(<4 x float> %a, <4 x float> %b) {
				; CHECK: fnmsub132ps {{.%r.}}, %xmm0, %xmm0
				%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
				ret <4 x float> %res
				}

				define <4 x float> @test_x86_fnmsub_aba_ps(<4 x float> %a, <4 x float> %b) {
				; CHECK: fnmsub231ps {{.%r.}}, %xmm0, %xmm0
				%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
				ret <4 x float> %res
				}

				define <4 x float> @test_x86_fnmsub_bba_ps(<4 x float> %a, <4 x float> %b) {
				; CHECK: fnmsub213ps {{.%r.}}, %xmm0, %xmm0
				%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
				ret <4 x float> %res
				}

				declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
				define <8 x float> @test_x86_fnmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) {
				; CHECK: fnmsub132ps {{.%r.}}, %ymm0, %ymm0
				%res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
				ret <8 x float> %res
				}

				define <8 x float> @test_x86_fnmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) {
				; CHECK: fnmsub231ps {{.%r.}}, %ymm0, %ymm0
				%res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
				ret <8 x float> %res
				}

				define <8 x float> @test_x86_fnmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) {
				; CHECK: fnmsub213ps {{.%r.}}, %ymm0, %ymm0
				%res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
				ret <8 x float> %res
				}

				declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
				define <2 x double> @test_x86_fnmsub_baa_pd(<2 x double> %a, <2 x double> %b) {
				; CHECK: fnmsub132pd {{.%r.}}, %xmm0, %xmm0
				%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
				ret <2 x double> %res
				}

				define <2 x double> @test_x86_fnmsub_aba_pd(<2 x double> %a, <2 x double> %b) {
				; CHECK: fnmsub231pd {{.%r.}}, %xmm0, %xmm0
				%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
				ret <2 x double> %res
				}

				define <2 x double> @test_x86_fnmsub_bba_pd(<2 x double> %a, <2 x double> %b) {
				; CHECK: fnmsub213pd {{.%r.}}, %xmm0, %xmm0
				%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
				ret <2 x double> %res
				}

				declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
				define <4 x double> @test_x86_fnmsub_baa_pd_y(<4 x double> %a, <4 x double> %b) {
				; CHECK: fnmsub132pd {{.%r.}}, %ymm0, %ymm0
				%res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
				ret <4 x double> %res
				}

				define <4 x double> @test_x86_fnmsub_aba_pd_y(<4 x double> %a, <4 x double> %b) {
				; CHECK: fnmsub231pd {{.%r.}}, %ymm0, %ymm0
				%res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
				ret <4 x double> %res
				}

				define <4 x double> @test_x86_fnmsub_bba_pd_y(<4 x double> %a, <4 x double> %b) {
				; CHECK: fnmsub213pd {{.%r.}}, %ymm0, %ymm0
				%res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
				ret <4 x double> %res
				}

llvm/test/CodeGen/X86/fma_patterns.ll

Show First 20 Lines • Show All 170 Lines • ▼ Show 20 Lines	; CHECK_FMA4-NEXT: retq
%x = fmul <2 x double> %a0, %a1		%x = fmul <2 x double> %a0, %a1
%res = fsub <2 x double> %x, %a2		%res = fsub <2 x double> %x, %a2
ret <2 x double> %res		ret <2 x double> %res
}		}

define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) {		define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) {
; CHECK-LABEL: test_x86_fnmadd_ss:		; CHECK-LABEL: test_x86_fnmadd_ss:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0		; CHECK-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1
		; CHECK-NEXT: vmovaps %xmm1, %xmm0
		[Inline comment — qcolombet, not done] We shouldn’t regress those.
		[Inline comment — v_klochkov (author), not done] Ok, Fixed.
; CHECK-NEXT: retq		; CHECK-NEXT: retq
;		;
; CHECK_FMA4-LABEL: test_x86_fnmadd_ss:		; CHECK_FMA4-LABEL: test_x86_fnmadd_ss:
; CHECK_FMA4: # BB#0:		; CHECK_FMA4: # BB#0:
; CHECK_FMA4-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0		; CHECK_FMA4-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
; CHECK_FMA4-NEXT: retq		; CHECK_FMA4-NEXT: retq
%x = fmul float %a0, %a1		%x = fmul float %a0, %a1
%res = fsub float %a2, %x		%res = fsub float %a2, %x
ret float %res		ret float %res
}		}

define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) {		define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) {
; CHECK-LABEL: test_x86_fnmadd_sd:		; CHECK-LABEL: test_x86_fnmadd_sd:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0		; CHECK-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1
		; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq		; CHECK-NEXT: retq
;		;
; CHECK_FMA4-LABEL: test_x86_fnmadd_sd:		; CHECK_FMA4-LABEL: test_x86_fnmadd_sd:
; CHECK_FMA4: # BB#0:		; CHECK_FMA4: # BB#0:
; CHECK_FMA4-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0		; CHECK_FMA4-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
; CHECK_FMA4-NEXT: retq		; CHECK_FMA4-NEXT: retq
%x = fmul double %a0, %a1		%x = fmul double %a0, %a1
%res = fsub double %a2, %x		%res = fsub double %a2, %x
ret double %res		ret double %res
}		}

define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) {		define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) {
; CHECK-LABEL: test_x86_fmsub_sd:		; CHECK-LABEL: test_x86_fmsub_sd:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0		; CHECK-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1
		; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq		; CHECK-NEXT: retq
;		;
; CHECK_FMA4-LABEL: test_x86_fmsub_sd:		; CHECK_FMA4-LABEL: test_x86_fmsub_sd:
; CHECK_FMA4: # BB#0:		; CHECK_FMA4: # BB#0:
; CHECK_FMA4-NEXT: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0		; CHECK_FMA4-NEXT: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
; CHECK_FMA4-NEXT: retq		; CHECK_FMA4-NEXT: retq
%x = fmul double %a0, %a1		%x = fmul double %a0, %a1
%res = fsub double %x, %a2		%res = fsub double %x, %a2
ret double %res		ret double %res
}		}

define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) {		define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) {
; CHECK-LABEL: test_x86_fnmsub_ss:		; CHECK-LABEL: test_x86_fnmsub_ss:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0		; CHECK-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1
		; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq		; CHECK-NEXT: retq
;		;
; CHECK_FMA4-LABEL: test_x86_fnmsub_ss:		; CHECK_FMA4-LABEL: test_x86_fnmsub_ss:
; CHECK_FMA4: # BB#0:		; CHECK_FMA4: # BB#0:
; CHECK_FMA4-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0		; CHECK_FMA4-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
; CHECK_FMA4-NEXT: retq		; CHECK_FMA4-NEXT: retq
%x = fsub float -0.000000e+00, %a0		%x = fsub float -0.000000e+00, %a0
%y = fmul float %x, %a1		%y = fmul float %x, %a1
%res = fsub float %y, %a2		%res = fsub float %y, %a2
ret float %res		ret float %res
}		}

define <4 x float> @test_x86_fmadd_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {		define <4 x float> @test_x86_fmadd_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
; CHECK-LABEL: test_x86_fmadd_ps_load:		; CHECK-LABEL: test_x86_fmadd_ps_load:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: vmovaps (%rdi), %xmm2		; CHECK-NEXT: vfmadd132ps (%rdi), %xmm1, %xmm0
; CHECK-NEXT: vfmadd213ps %xmm1, %xmm2, %xmm0
; CHECK-NEXT: retq		; CHECK-NEXT: retq
;		;
; CHECK_FMA4-LABEL: test_x86_fmadd_ps_load:		; CHECK_FMA4-LABEL: test_x86_fmadd_ps_load:
; CHECK_FMA4: # BB#0:		; CHECK_FMA4: # BB#0:
; CHECK_FMA4-NEXT: vfmaddps %xmm1, (%rdi), %xmm0, %xmm0		; CHECK_FMA4-NEXT: vfmaddps %xmm1, (%rdi), %xmm0, %xmm0
; CHECK_FMA4-NEXT: retq		; CHECK_FMA4-NEXT: retq
%x = load <4 x float>, <4 x float>* %a0		%x = load <4 x float>, <4 x float>* %a0
%y = fmul <4 x float> %x, %a1		%y = fmul <4 x float> %x, %a1
%res = fadd <4 x float> %y, %a2		%res = fadd <4 x float> %y, %a2
ret <4 x float> %res		ret <4 x float> %res
}		}

define <4 x float> @test_x86_fmsub_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {		define <4 x float> @test_x86_fmsub_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
; CHECK-LABEL: test_x86_fmsub_ps_load:		; CHECK-LABEL: test_x86_fmsub_ps_load:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: vmovaps (%rdi), %xmm2		; CHECK-NEXT: vfmsub132ps (%rdi), %xmm1, %xmm0
; CHECK-NEXT: vfmsub213ps %xmm1, %xmm2, %xmm0
; CHECK-NEXT: retq		; CHECK-NEXT: retq
;		;
; CHECK_FMA4-LABEL: test_x86_fmsub_ps_load:		; CHECK_FMA4-LABEL: test_x86_fmsub_ps_load:
; CHECK_FMA4: # BB#0:		; CHECK_FMA4: # BB#0:
; CHECK_FMA4-NEXT: vfmsubps %xmm1, (%rdi), %xmm0, %xmm0		; CHECK_FMA4-NEXT: vfmsubps %xmm1, (%rdi), %xmm0, %xmm0
; CHECK_FMA4-NEXT: retq		; CHECK_FMA4-NEXT: retq
%x = load <4 x float>, <4 x float>* %a0		%x = load <4 x float>, <4 x float>* %a0
%y = fmul <4 x float> %x, %a1		%y = fmul <4 x float> %x, %a1
▲ Show 20 Lines • Show All 188 Lines • ▼ Show 20 Lines
;		;
; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))		; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
;		;

define float @test_f32_interp(float %x, float %y, float %t) {		define float @test_f32_interp(float %x, float %y, float %t) {
; CHECK-LABEL: test_f32_interp:		; CHECK-LABEL: test_f32_interp:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: vfnmadd213ss %xmm1, %xmm2, %xmm1		; CHECK-NEXT: vfnmadd213ss %xmm1, %xmm2, %xmm1
; CHECK-NEXT: vfmadd213ss %xmm1, %xmm2, %xmm0		; CHECK-NEXT: vfmadd213ss %xmm1, %xmm0, %xmm2
		; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq		; CHECK-NEXT: retq
;		;
; CHECK_FMA4-LABEL: test_f32_interp:		; CHECK_FMA4-LABEL: test_f32_interp:
; CHECK_FMA4: # BB#0:		; CHECK_FMA4: # BB#0:
; CHECK_FMA4-NEXT: vfnmaddss %xmm1, %xmm1, %xmm2, %xmm1		; CHECK_FMA4-NEXT: vfnmaddss %xmm1, %xmm1, %xmm2, %xmm1
; CHECK_FMA4-NEXT: vfmaddss %xmm1, %xmm2, %xmm0, %xmm0		; CHECK_FMA4-NEXT: vfmaddss %xmm1, %xmm2, %xmm0, %xmm0
; CHECK_FMA4-NEXT: retq		; CHECK_FMA4-NEXT: retq
%t1 = fsub float 1.0, %t		%t1 = fsub float 1.0, %t
Show All 40 Lines	; CHECK_FMA4-NEXT: retq
%r = fadd <8 x float> %tx, %ty		%r = fadd <8 x float> %tx, %ty
ret <8 x float> %r		ret <8 x float> %r
}		}

define double @test_f64_interp(double %x, double %y, double %t) {		define double @test_f64_interp(double %x, double %y, double %t) {
; CHECK-LABEL: test_f64_interp:		; CHECK-LABEL: test_f64_interp:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: vfnmadd213sd %xmm1, %xmm2, %xmm1		; CHECK-NEXT: vfnmadd213sd %xmm1, %xmm2, %xmm1
; CHECK-NEXT: vfmadd213sd %xmm1, %xmm2, %xmm0		; CHECK-NEXT: vfmadd213sd %xmm1, %xmm0, %xmm2
		; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq		; CHECK-NEXT: retq
;		;
; CHECK_FMA4-LABEL: test_f64_interp:		; CHECK_FMA4-LABEL: test_f64_interp:
; CHECK_FMA4: # BB#0:		; CHECK_FMA4: # BB#0:
; CHECK_FMA4-NEXT: vfnmaddsd %xmm1, %xmm1, %xmm2, %xmm1		; CHECK_FMA4-NEXT: vfnmaddsd %xmm1, %xmm1, %xmm2, %xmm1
; CHECK_FMA4-NEXT: vfmaddsd %xmm1, %xmm2, %xmm0, %xmm0		; CHECK_FMA4-NEXT: vfmaddsd %xmm1, %xmm2, %xmm0, %xmm0
; CHECK_FMA4-NEXT: retq		; CHECK_FMA4-NEXT: retq
%t1 = fsub double 1.0, %t		%t1 = fsub double 1.0, %t
▲ Show 20 Lines • Show All 43 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

Improved X86-FMA3 mem-folding & coalescing
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 36032

llvm/lib/Target/X86/X86InstrFMA.td

llvm/lib/Target/X86/X86InstrInfo.h

llvm/lib/Target/X86/X86InstrInfo.cpp

llvm/test/CodeGen/X86/fma-commute-x86.ll

llvm/test/CodeGen/X86/fma_patterns.ll

Improved X86-FMA3 mem-folding & coalescing
ClosedPublic