diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -514,6 +514,10 @@
           "fast-vector-shift-masks", "HasFastVectorShiftMasks", "true",
           "Prefer a left/right vector logical shift pair over a shift+and pair">;
 
+def FeatureFastMOVBE
+    : SubtargetFeature<"fast-movbe", "HasFastMOVBE", "true",
+        "Prefer a movbe over a single-use load + bswap / single-use bswap + store">;
+
 def FeatureUseGLMDivSqrtCosts
     : SubtargetFeature<"use-glm-div-sqrt-costs", "UseGLMDivSqrtCosts", "true",
                        "Use Goldmont specific floating point div/sqrt costs">;
@@ -820,6 +824,7 @@
                                 FeatureSlowDivide64,
                                 FeatureSlowPMULLD,
                                 FeatureFast7ByteNOP,
+                                FeatureFastMOVBE,
                                 FeaturePOPCNTFalseDeps,
                                 FeatureInsertVZEROUPPER];
   list<SubtargetFeature> SLMFeatures =
@@ -839,6 +844,7 @@
                                 FeatureSlowTwoMemOps,
                                 FeatureSlowLEA,
                                 FeatureSlowIncDec,
+                                FeatureFastMOVBE,
                                 FeaturePOPCNTFalseDeps,
                                 FeatureInsertVZEROUPPER];
   list<SubtargetFeature> GLMFeatures =
@@ -851,6 +857,7 @@
                                 FeatureSlowTwoMemOps,
                                 FeatureSlowLEA,
                                 FeatureSlowIncDec,
+                                FeatureFastMOVBE,
                                 FeatureInsertVZEROUPPER];
   list<SubtargetFeature> GLPFeatures =
     !listconcat(GLMFeatures, GLPAdditionalFeatures);
@@ -924,6 +931,7 @@
                                 FeatureSlowTwoMemOps,
                                 FeaturePreferMaskRegisters,
                                 FeatureHasFastGather,
+                                FeatureFastMOVBE,
                                 FeatureSlowPMADDWD];
   // TODO Add AVX5124FMAPS/AVX5124VNNIW features
   list<SubtargetFeature> KNMFeatures =
@@ -983,6 +991,7 @@
                                 FeatureFast15ByteNOP,
                                 FeatureFastScalarShiftMasks,
                                 FeatureFastVectorShiftMasks,
+                                FeatureFastMOVBE,
                                 FeatureSlowSHLD];
   list<SubtargetFeature> BtVer2Features =
     !listconcat(BtVer1Features, BtVer2AdditionalFeatures);
@@ -1017,7 +1026,9 @@
                                 FeatureTBM,
                                 FeatureFMA,
                                 FeatureFastBEXTR];
-  list<SubtargetFeature> BdVer2Tuning = BdVer1Tuning;
+  list<SubtargetFeature> BdVer2AdditionalTuning = [FeatureFastMOVBE];
+  list<SubtargetFeature> BdVer2Tuning =
+    !listconcat(BdVer1Tuning, BdVer2AdditionalTuning);
   list<SubtargetFeature> BdVer2Features =
     !listconcat(BdVer1Features, BdVer2AdditionalFeatures);
@@ -1077,6 +1088,7 @@
                                 FeatureFast15ByteNOP,
                                 FeatureBranchFusion,
                                 FeatureFastScalarShiftMasks,
+                                FeatureFastMOVBE,
                                 FeatureSlowSHLD,
                                 FeatureInsertVZEROUPPER];
   list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
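Note: besides being implied by the CPU tuning lists above, the new subtarget feature can be requested explicitly with -mattr=+movbe,+fast-movbe (as the updated RUN lines below do) or, per function, through the "target-features" string attribute. A minimal, hypothetical IR example for illustration only (not part of this patch):

define void @uses_fast_movbe() "target-features"="+movbe,+fast-movbe" {
  ret void
}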
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -433,6 +433,9 @@
   /// Prefer a left/right vector logical shifts pair over a shift+and pair.
   bool HasFastVectorShiftMasks = false;
 
+  /// Prefer a movbe over a single-use load + bswap / single-use bswap + store.
+  bool HasFastMOVBE = false;
+
   /// Use a retpoline thunk rather than indirect calls to block speculative
   /// execution.
   bool UseRetpolineIndirectCalls = false;
@@ -714,6 +717,7 @@
   bool hasFastHorizontalOps() const { return HasFastHorizontalOps; }
   bool hasFastScalarShiftMasks() const { return HasFastScalarShiftMasks; }
   bool hasFastVectorShiftMasks() const { return HasFastVectorShiftMasks; }
+  bool hasFastMOVBE() const { return HasFastMOVBE; }
   bool hasMacroFusion() const { return HasMacroFusion; }
   bool hasBranchFusion() const { return HasBranchFusion; }
   bool hasERMSB() const { return HasERMSB; }
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2695,6 +2695,7 @@
   static const CostTblEntry X64CostTbl[] = { // 64-bit targets
     { ISD::ABS,        MVT::i64,  2 }, // SUB+CMOV
     { ISD::BITREVERSE, MVT::i64, 14 },
+    { ISD::BSWAP,      MVT::i64,  1 },
     { ISD::CTLZ,       MVT::i64,  4 }, // BSR+XOR or BSR+XOR+CMOV
     { ISD::CTTZ,       MVT::i64,  3 }, // TEST+BSF+CMOV/BRANCH
     { ISD::CTPOP,      MVT::i64, 10 },
@@ -2708,6 +2709,8 @@
     { ISD::BITREVERSE, MVT::i32, 14 },
     { ISD::BITREVERSE, MVT::i16, 14 },
     { ISD::BITREVERSE, MVT::i8,  11 },
+    { ISD::BSWAP,      MVT::i32,  1 },
+    { ISD::BSWAP,      MVT::i16,  1 }, // ROL
     { ISD::CTLZ,       MVT::i32,  4 }, // BSR+XOR or BSR+XOR+CMOV
     { ISD::CTLZ,       MVT::i16,  4 }, // BSR+XOR or BSR+XOR+CMOV
     { ISD::CTLZ,       MVT::i8,   4 }, // BSR+XOR or BSR+XOR+CMOV
@@ -2919,6 +2922,17 @@
     return adjustTableCost(*Entry, LT.first, ICA.getFlags());
   }
 
+  if (ST->hasMOVBE() && ST->hasFastMOVBE()) {
+    if (const Instruction *II = ICA.getInst()) {
+      if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
+        return TTI::TCC_Free;
+      if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
+        if (LI->hasOneUse())
+          return TTI::TCC_Free;
+      }
+    }
+  }
+
   // TODO - add BMI (TZCNT) scalar handling
 
   if (ST->is64Bit())
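For reference, the new getIntrinsicInstrCost() clause only fires for the two shapes MOVBE can implement directly: a bswap whose single use is a store, and a bswap fed by a single-use load. A minimal IR sketch of those shapes (illustrative only; it mirrors the tests below, and on a +movbe,+fast-movbe target each pair is expected to lower to one MOVBE):

declare i32 @llvm.bswap.i32(i32)

define i32 @load_then_bswap(i32* %src) {
  %v = load i32, i32* %src, align 1      ; single-use load feeding the bswap
  %b = call i32 @llvm.bswap.i32(i32 %v)  ; costed as TTI::TCC_Free with fast-movbe
  ret i32 %b
}

define void @bswap_then_store(i32 %a, i32* %dst) {
  %b = call i32 @llvm.bswap.i32(i32 %a)  ; only use is the store below
  store i32 %b, i32* %dst, align 1
  ret void
}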
diff --git a/llvm/test/Analysis/CostModel/X86/bswap-store.ll b/llvm/test/Analysis/CostModel/X86/bswap-store.ll
--- a/llvm/test/Analysis/CostModel/X86/bswap-store.ll
+++ b/llvm/test/Analysis/CostModel/X86/bswap-store.ll
@@ -1,8 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze | FileCheck %s --check-prefixes=ALL,NOMOVBE,X64
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+movbe | FileCheck %s --check-prefixes=ALL,MOVBE,X64
-; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze | FileCheck %s --check-prefixes=ALL,NOMOVBE,X86
-; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+movbe | FileCheck %s --check-prefixes=ALL,MOVBE,X86
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze | FileCheck %s --check-prefixes=ALL,NOMOVBE,X64
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+movbe | FileCheck %s --check-prefixes=ALL,X64,SLOWMOVBE
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+movbe,+fast-movbe | FileCheck %s --check-prefixes=ALL,X64,FASTMOVBE
+
+; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze | FileCheck %s --check-prefixes=ALL,NOMOVBE,X86
+; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+movbe | FileCheck %s --check-prefixes=ALL,X86,SLOWMOVBE
+; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+movbe,+fast-movbe | FileCheck %s --check-prefixes=ALL,X86,FASTMOVBE
 
 declare i16 @llvm.bswap.i16(i16)
 declare i32 @llvm.bswap.i32(i32)
@@ -11,14 +14,19 @@
 define void @var_bswap_store_i16(i16 %a, i16* %dst) {
 ; NOMOVBE-LABEL: 'var_bswap_store_i16'
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
+; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
 ; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1
 ; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-; MOVBE-LABEL: 'var_bswap_store_i16'
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SLOWMOVBE-LABEL: 'var_bswap_store_i16'
+; SLOWMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
+; SLOWMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1
+; SLOWMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; FASTMOVBE-LABEL: 'var_bswap_store_i16'
+; FASTMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
+; FASTMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1
+; FASTMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %bswap = call i16 @llvm.bswap.i16(i16 %a)
   store i16 %bswap, i16* %dst, align 1
@@ -26,17 +34,11 @@
   ret void
 }
 
 define void @var_bswap_store_i16_extrause(i16 %a, i16* %dst) {
-; NOMOVBE-LABEL: 'var_bswap_store_i16_extrause'
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap2 = shl i16 %bswap, 2
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; MOVBE-LABEL: 'var_bswap_store_i16_extrause'
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap2 = shl i16 %bswap, 2
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; ALL-LABEL: 'var_bswap_store_i16_extrause'
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap2 = shl i16 %bswap, 2
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %bswap = call i16 @llvm.bswap.i16(i16 %a)
   store i16 %bswap, i16* %dst, align 1
@@ -47,10 +49,20 @@
 }
 
 define void @var_bswap_store_i32(i32 %a, i32* %dst) {
-; ALL-LABEL: 'var_bswap_store_i32'
-; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)
-; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %bswap, i32* %dst, align 1
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; NOMOVBE-LABEL: 'var_bswap_store_i32'
+; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)
+; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %bswap, i32* %dst, align 1
+; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLOWMOVBE-LABEL: 'var_bswap_store_i32'
+; SLOWMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)
+; SLOWMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %bswap, i32* %dst, align 1
+; SLOWMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; FASTMOVBE-LABEL: 'var_bswap_store_i32'
+; FASTMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)
+; FASTMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %bswap, i32* %dst, align 1
+; FASTMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %bswap = call i32 @llvm.bswap.i32(i32 %a)
   store i32 %bswap, i32* %dst, align 1
@@ -73,16 +85,6 @@
 }
 
 define void @var_bswap_store_i64(i64 %a, i64* %dst) {
-; X64-LABEL: 'var_bswap_store_i64'
-; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
-; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 %bswap, i64* %dst, align 1
-; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; X86-LABEL: 'var_bswap_store_i64'
-; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
-; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 %bswap, i64* %dst, align 1
-; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
   %bswap = call i64 @llvm.bswap.i64(i64 %a)
   store i64 %bswap, i64* %dst, align 1
@@ -96,7 +98,7 @@
 ; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; X86-LABEL: 'var_bswap_store_i64_extrause'
-; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
+; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
 ; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 %bswap, i64* %dst, align 1
 ; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap2 = shl i64 %bswap, 2
 ; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -110,16 +112,6 @@
 }
 
 define void @var_bswap_store_i128(i128 %a, i128* %dst) {
-; X64-LABEL: 'var_bswap_store_i128'
-; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
-; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i128 %bswap, i128* %dst, align 1
-; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; X86-LABEL: 'var_bswap_store_i128'
-; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
-; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 %bswap, i128* %dst, align 1
-; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
   %bswap = call i128 @llvm.bswap.i128(i128 %a)
   store i128 %bswap, i128* %dst, align 1
@@ -127,13 +119,13 @@
 }
 
 define void @var_bswap_store_i128_extrause(i128 %a, i128* %dst) {
 ; X64-LABEL: 'var_bswap_store_i128_extrause'
-; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
+; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
 ; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i128 %bswap, i128* %dst, align 1
 ; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap2 = shl i128 %bswap, 2
 ; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; X86-LABEL: 'var_bswap_store_i128_extrause'
-; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
+; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
 ; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 %bswap, i128* %dst, align 1
 ; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap2 = shl i128 %bswap, 2
 ; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
diff --git a/llvm/test/Analysis/CostModel/X86/bswap.ll b/llvm/test/Analysis/CostModel/X86/bswap.ll
--- a/llvm/test/Analysis/CostModel/X86/bswap.ll
+++ b/llvm/test/Analysis/CostModel/X86/bswap.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze | FileCheck %s --check-prefixes=ALL,NOMOVBE,X64
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+movbe | FileCheck %s --check-prefixes=ALL,MOVBE,X64
-; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze | FileCheck %s --check-prefixes=ALL,NOMOVBE,X86
-; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+movbe | FileCheck %s --check-prefixes=ALL,MOVBE,X86
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze | FileCheck %s --check-prefixes=ALL,X64
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+movbe | FileCheck %s --check-prefixes=ALL,X64
+; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze | FileCheck %s --check-prefixes=ALL,X86
+; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+movbe | FileCheck %s --check-prefixes=ALL,X86
 
 declare i16 @llvm.bswap.i16(i16)
 declare i32 @llvm.bswap.i32(i32)
@@ -12,13 +12,9 @@
 ; Verify the cost of scalar bswap instructions.
 
 define i16 @var_bswap_i16(i16 %a) {
-; NOMOVBE-LABEL: 'var_bswap_i16'
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
-;
-; MOVBE-LABEL: 'var_bswap_i16'
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
+; ALL-LABEL: 'var_bswap_i16'
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
 ;
   %bswap = call i16 @llvm.bswap.i16(i16 %a)
   ret i16 %bswap
@@ -39,7 +35,7 @@
 ; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap
 ;
 ; X86-LABEL: 'var_bswap_i64'
-; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
+; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
 ; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap
 ;
   %bswap = call i64 @llvm.bswap.i64(i64 %a)
@@ -48,11 +44,11 @@
 
 define i128 @var_bswap_i128(i128 %a) {
 ; X64-LABEL: 'var_bswap_i128'
-; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
+; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
 ; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap
 ;
 ; X86-LABEL: 'var_bswap_i128'
-; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
+; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
 ; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap
 ;
   %bswap = call i128 @llvm.bswap.i128(i128 %a)
diff --git a/llvm/test/Analysis/CostModel/X86/load-bswap.ll b/llvm/test/Analysis/CostModel/X86/load-bswap.ll
--- a/llvm/test/Analysis/CostModel/X86/load-bswap.ll
+++ b/llvm/test/Analysis/CostModel/X86/load-bswap.ll
@@ -1,8 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze | FileCheck %s --check-prefixes=ALL,NOMOVBE,X64
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+movbe | FileCheck %s --check-prefixes=ALL,MOVBE,X64
-; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze | FileCheck %s --check-prefixes=ALL,NOMOVBE,X86
-; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+movbe | FileCheck %s --check-prefixes=ALL,MOVBE,X86
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze | FileCheck %s --check-prefixes=ALL,NOMOVBE,X64
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+movbe | FileCheck %s --check-prefixes=ALL,X64,SLOWMOVBE
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+movbe,+fast-movbe | FileCheck %s --check-prefixes=ALL,X64,FASTMOVBE
+
+; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze | FileCheck %s --check-prefixes=ALL,NOMOVBE,X86
+; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+movbe | FileCheck %s --check-prefixes=ALL,X86,SLOWMOVBE
+; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+movbe,+fast-movbe | FileCheck %s --check-prefixes=ALL,X86,FASTMOVBE
 
 declare i16 @llvm.bswap.i16(i16)
 declare i32 @llvm.bswap.i32(i32)
@@ -12,13 +15,18 @@
 define i16 @var_load_bswap_i16(i16* %src) {
 ; NOMOVBE-LABEL: 'var_load_bswap_i16'
 ; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
+; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
 ; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
 ;
-; MOVBE-LABEL: 'var_load_bswap_i16'
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
+; SLOWMOVBE-LABEL: 'var_load_bswap_i16'
+; SLOWMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1
+; SLOWMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
+; SLOWMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
+;
+; FASTMOVBE-LABEL: 'var_load_bswap_i16'
+; FASTMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1
+; FASTMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
+; FASTMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
 ;
   %a = load i16, i16* %src, align 1
   %bswap = call i16 @llvm.bswap.i16(i16 %a)
@@ -26,19 +34,12 @@
   ret i16 %bswap
 }
 
 define i16 @var_load_bswap_i16_extrause(i16* %src, i16* %clobberdst) {
-; NOMOVBE-LABEL: 'var_load_bswap_i16_extrause'
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a2 = shl i16 %a, 2
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %a2, i16* %clobberdst, align 1
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
-;
-; MOVBE-LABEL: 'var_load_bswap_i16_extrause'
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a2 = shl i16 %a, 2
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %a2, i16* %clobberdst, align 1
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
+; ALL-LABEL: 'var_load_bswap_i16_extrause'
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a2 = shl i16 %a, 2
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %a2, i16* %clobberdst, align 1
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
 ;
   %a = load i16, i16* %src, align 1
   %bswap = call i16 @llvm.bswap.i16(i16 %a)
@@ -50,10 +51,20 @@
 }
 
 define i32 @var_load_bswap_i32(i32* %src) {
-; ALL-LABEL: 'var_load_bswap_i32'
-; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i32, i32* %src, align 1
-; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bswap
+; NOMOVBE-LABEL: 'var_load_bswap_i32'
+; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i32, i32* %src, align 1
+; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)
+; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bswap
+;
+; SLOWMOVBE-LABEL: 'var_load_bswap_i32'
+; SLOWMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i32, i32* %src, align 1
+; SLOWMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)
+; SLOWMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bswap
+;
+; FASTMOVBE-LABEL: 'var_load_bswap_i32'
+; FASTMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i32, i32* %src, align 1
+; FASTMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)
+; FASTMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bswap
 ;
   %a = load i32, i32* %src, align 1
   %bswap = call i32 @llvm.bswap.i32(i32 %a)
@@ -78,16 +89,6 @@
 }
 
 define i64 @var_load_bswap_i64(i64* %src) {
-; X64-LABEL: 'var_load_bswap_i64'
-; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i64, i64* %src, align 1
-; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
-; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap
-;
-; X86-LABEL: 'var_load_bswap_i64'
-; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a = load i64, i64* %src, align 1
-; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
-; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap
-;
   %a = load i64, i64* %src, align 1
   %bswap = call i64 @llvm.bswap.i64(i64 %a)
@@ -103,7 +104,7 @@
 ;
 ; X86-LABEL: 'var_load_bswap_i64_extrause'
 ; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a = load i64, i64* %src, align 1
-; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
+; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
 ; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a2 = shl i64 %a, 2
 ; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 %a2, i64* %clobberdst, align 1
 ; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap
@@ -118,16 +119,6 @@
 }
 
 define i128 @var_load_bswap_i128(i128* %src) {
-; X64-LABEL: 'var_load_bswap_i128'
-; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a = load i128, i128* %src, align 1
-; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
-; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap
-;
-; X86-LABEL: 'var_load_bswap_i128'
-; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a = load i128, i128* %src, align 1
-; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
-; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap
-;
   %a = load i128, i128* %src, align 1
   %bswap = call i128 @llvm.bswap.i128(i128 %a)
@@ -136,14 +127,14 @@
 
 define i128 @var_load_bswap_i128_extrause(i128* %src, i128* %clobberdst) {
 ; X64-LABEL: 'var_load_bswap_i128_extrause'
 ; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a = load i128, i128* %src, align 1
-; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
+; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
 ; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a2 = shl i128 %a, 2
 ; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i128 %a2, i128* %clobberdst, align 1
 ; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap
 ;
 ; X86-LABEL: 'var_load_bswap_i128_extrause'
 ; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a = load i128, i128* %src, align 1
-; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
+; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
 ; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a2 = shl i128 %a, 2
 ; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 %a2, i128* %clobberdst, align 1
 ; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll
@@ -42,18 +42,30 @@
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @abs_v8i64(
-; SLM-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP1]], i1 false)
-; SLM-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP2]], i1 false)
-; SLM-NEXT:    [[TMP7:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP3]], i1 false)
-; SLM-NEXT:    [[TMP8:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP4]], i1 false)
-; SLM-NEXT:    store <2 x i64> [[TMP5]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; SLM-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x
i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP7]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 +; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 +; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 +; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 +; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 +; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 +; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 +; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 +; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.abs.i64(i64 [[A0]], i1 false) +; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.abs.i64(i64 [[A1]], i1 false) +; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.abs.i64(i64 [[A2]], i1 false) +; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.abs.i64(i64 [[A3]], i1 false) +; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.abs.i64(i64 [[A4]], i1 false) +; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.abs.i64(i64 [[A5]], i1 false) +; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.abs.i64(i64 [[A6]], i1 false) +; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.abs.i64(i64 [[A7]], i1 false) +; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 +; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 +; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 +; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 +; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 +; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 +; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 +; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @abs_v8i64( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll @@ -62,40 +62,31 @@ ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v8i64( -; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; 
SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A0]], i64 [[B0]]) -; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A1]], i64 [[B1]]) -; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A2]], i64 [[B2]]) -; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A3]], i64 [[B3]]) -; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A4]], i64 [[B4]]) -; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A5]], i64 [[B5]]) -; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A6]], i64 [[B6]]) -; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A7]], i64 [[B7]]) -; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x 
i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) +; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) +; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) +; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; +; AVX512-LABEL: @add_v8i64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) +; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8 +; AVX512-NEXT: ret void +; ; AVX1-LABEL: @add_v8i64( ; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 @@ -114,7 +105,6 @@ ; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 ; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; AVX1-NEXT: ret void -; ; AVX2-LABEL: @add_v8i64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 @@ -125,14 +115,6 @@ ; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 ; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX2-NEXT: ret void -; -; AVX512-LABEL: @add_v8i64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 -; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8 -; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> 
[[TMP1]], <8 x i64> [[TMP2]]) -; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8 -; AVX512-NEXT: ret void -; ; AVX256BW-LABEL: @add_v8i64( ; AVX256BW-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX256BW-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 @@ -143,7 +125,6 @@ ; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 ; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256BW-NEXT: ret void -; %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll @@ -26,41 +26,6 @@ declare i8 @llvm.uadd.sat.i8 (i8 , i8 ) define void @add_v8i64() { -; SSE-LABEL: @add_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[R3:%.*]] = call i64 
@llvm.uadd.sat.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A5]], i64 [[B5]]) -; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 -; SSE-NEXT: ret void -; ; AVX-LABEL: @add_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll @@ -62,40 +62,31 @@ ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v8i64( -; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SLM-NEXT: [[B6:%.*]] = load i64, i64* 
getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A0]], i64 [[B0]]) -; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A1]], i64 [[B1]]) -; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A2]], i64 [[B2]]) -; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A3]], i64 [[B3]]) -; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A4]], i64 [[B4]]) -; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A5]], i64 [[B5]]) -; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A6]], i64 [[B6]]) -; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A7]], i64 [[B7]]) -; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) +; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) +; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) +; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: 
store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; +; AVX512-LABEL: @sub_v8i64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) +; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8 +; AVX512-NEXT: ret void +; ; AVX1-LABEL: @sub_v8i64( ; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 @@ -114,7 +105,6 @@ ; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 ; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; AVX1-NEXT: ret void -; ; AVX2-LABEL: @sub_v8i64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 @@ -125,14 +115,6 @@ ; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 ; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX2-NEXT: ret void -; -; AVX512-LABEL: @sub_v8i64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 -; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8 -; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) -; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8 -; AVX512-NEXT: ret void -; ; AVX256BW-LABEL: @sub_v8i64( ; AVX256BW-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX256BW-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 @@ -143,7 +125,6 @@ ; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 ; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256BW-NEXT: ret void -; %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll --- 
a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll @@ -26,41 +26,6 @@ declare i8 @llvm.usub.sat.i8 (i8 , i8 ) define void @sub_v8i64() { -; SSE-LABEL: @sub_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A5]], i64 [[B5]]) -; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 
x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 -; SSE-NEXT: ret void -; ; AVX-LABEL: @sub_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll b/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll @@ -22,11 +22,17 @@ declare i8 @llvm.bitreverse.i8(i8) define void @bitreverse_2i64() #0 { -; CHECK-LABEL: @bitreverse_2i64( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]]) -; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 -; CHECK-NEXT: ret void +; SSE-LABEL: @bitreverse_2i64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]]) +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: ret void +; +; XOP-LABEL: @bitreverse_2i64( +; XOP-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8 +; XOP-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]]) +; XOP-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 +; XOP-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8