
This change-set was initially part of a bigger change-set, http://reviews.llvm.org/D11370, but the X86 FMA3 specific changes were split out of D11370 to simplify that change-set.

The changes proposed here implement optimal form selection (132/213/231) for X86 FMA3 instructions, and help improve the memory-operand folding and coalescing optimizations performed for X86 FMA instructions. Better memory folding and coalescing reduce register pressure. The improvement can be seen in the following example:

  for (int i = 0; i < N; i += 1) {
    val1 = _mm_and_pd(val1, val5);
    val2 = _mm_and_pd(val2, val6);
    val3 = _mm_and_pd(val3, val7);
    val4 = _mm_and_pd(val4, val8);
    val5 = _mm_xor_pd(val1, val5);
    val6 = _mm_xor_pd(val2, val6);
    val7 = _mm_xor_pd(val3, val7);
    val8 = _mm_xor_pd(val4, val8);
    v_accu1 = _mm_fmadd_pd(v_accu1, x1_arr[i], val1);
    v_accu2 = _mm_fmadd_pd(v_accu2, x2_arr[i], val2);
    v_accu3 = _mm_fmadd_pd(v_accu3, x3_arr[i], val3);
    v_accu4 = _mm_fmadd_pd(v_accu4, x4_arr[i], val4);
    v_accu5 = _mm_fmadd_pd(v_accu5, x5_arr[i], val5);
    v_accu6 = _mm_fmadd_pd(v_accu6, x6_arr[i], val6);
    v_accu7 = _mm_fmadd_pd(v_accu7, x7_arr[i], val7);
    v_accu8 = _mm_fmadd_pd(v_accu8, x8_arr[i], val8);
  }

ASM code BEFORE the changes:

  .LBB1_2:                            # %for.body.6
                                      #   Parent Loop BB1_1 Depth=1
                                      # =>  This Inner Loop Header: Depth=2
    vmovapd %xmm0, -56(%rsp)          # 16-byte Spill
    vandpd  %xmm7, %xmm3, %xmm7
    vandpd  %xmm5, %xmm12, %xmm5
    vandpd  %xmm6, %xmm9, %xmm6
    vmovapd -40(%rsp), %xmm10         # 16-byte Reload
    vandpd  %xmm10, %xmm13, %xmm10
    vmovapd %xmm10, -40(%rsp)         # 16-byte Spill
    vxorpd  %xmm7, %xmm3, %xmm3
    vxorpd  %xmm5, %xmm12, %xmm12
    vxorpd  %xmm6, %xmm9, %xmm9
    vxorpd  %xmm10, %xmm13, %xmm13
    vmovapd %xmm8, %xmm0
    vmovapd x1_arr+8192(%rcx), %xmm8
    vmovapd -24(%rsp), %xmm1          # 16-byte Reload
    vfmadd213pd %xmm7, %xmm8, %xmm1
    vmovapd %xmm1, -24(%rsp)          # 16-byte Spill
    vmovapd %xmm0, %xmm8
    vmovapd x2_arr+8192(%rcx), %xmm1
    vfmadd213pd %xmm5, %xmm1, %xmm4
    vmovapd x3_arr+8192(%rcx), %xmm1
    vfmadd213pd %xmm6, %xmm1, %xmm8
    vmovapd x4_arr+8192(%rcx), %xmm1
    vfmadd213pd %xmm10, %xmm1, %xmm11
    vmovapd -56(%rsp), %xmm0          # 16-byte Reload
    vmovapd x5_arr+8192(%rcx), %xmm1
    vfmadd213pd %xmm3, %xmm1, %xmm15
    vmovapd x6_arr+8192(%rcx), %xmm1
    vfmadd213pd %xmm12, %xmm1, %xmm0
    vmovapd x7_arr+8192(%rcx), %xmm1
    vfmadd213pd %xmm9, %xmm1, %xmm2
    vmovapd x8_arr+8192(%rcx), %xmm1
    vfmadd213pd %xmm13, %xmm1, %xmm14
    addq    $16, %rcx
    jne     .LBB1_2

ASM code WITH the new changes (about 30% faster):

  .LBB1_2:                            # %for.body.6
                                      #   Parent Loop BB1_1 Depth=1
                                      # =>  This Inner Loop Header: Depth=2
    vandpd  %xmm7, %xmm3, %xmm7
    vandpd  %xmm5, %xmm2, %xmm5
    vandpd  %xmm6, %xmm0, %xmm6
    vandpd  %xmm1, %xmm4, %xmm1
    vxorpd  %xmm7, %xmm3, %xmm3
    vxorpd  %xmm5, %xmm2, %xmm2
    vxorpd  %xmm6, %xmm0, %xmm0
    vfmadd132pd x1_arr+8192(%rcx), %xmm7, %xmm15
    vfmadd132pd x2_arr+8192(%rcx), %xmm5, %xmm8
    vfmadd132pd x3_arr+8192(%rcx), %xmm6, %xmm9
    vfmadd132pd x4_arr+8192(%rcx), %xmm1, %xmm10
    vfmadd132pd x5_arr+8192(%rcx), %xmm3, %xmm14
    vfmadd132pd x6_arr+8192(%rcx), %xmm2, %xmm11
    vfmadd132pd x7_arr+8192(%rcx), %xmm0, %xmm12
    vxorpd  %xmm1, %xmm4, %xmm4
    vfmadd132pd x8_arr+8192(%rcx), %xmm4, %xmm13
    addq    $16, %rcx
    jne     .LBB1_2

This change-set also fixes an existing correctness problem caused by commuting the 1st and 2nd operands of scalar FMAs generated for intrinsics.

For the FMA intrinsic call:

  __m128d foo(__m128d a, __m128d b, __m128d c) {
    // must return XMM0 = {b[127:64], a[63:0]*b[63:0]+c[63:0]},
    // but currently the returned value is XMM0 = {a[127:64], a[63:0]*b[63:0]+c[63:0]}
    return _mm_fmadd_sd(b, a, c);
  }

The Coalescer/TwoAddressInstructionPass swapped the 1st and 2nd operands of the SCALAR FMA and invalidated the higher bits of the result returned from foo(). The change-set fixes that by prohibiting the swap of the 1st and 2nd operands of scalar FMAs.

Swapping the 1st and 2nd operands of scalar FMAs may be possible and legal, but only after special analysis of the FMA's users. Such an optimization/analysis can be implemented separately.

Another way would be to separate the FMA opcodes generated for regular FP operations from the FMA opcodes generated for FMA intrinsics, as is done now for ADD operations, e.g. ADDSSrr vs. ADDSSrr_Int; the *_Int opcodes are handled more conservatively.

Being more conservative about commuting the 1st and 2nd operands of scalar FMAs seems the better choice right now, as stability/correctness has higher priority.

With regard to performance, these changes are very good for vector/packed FMAs (all source operands become commutable), and neutral for scalar FMAs:
a) commuting of the 1st and 2nd operands is prohibited/disabled;
b) commuting of the 2nd and 3rd operands is enabled.
