diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2695,6 +2695,7 @@ static const CostTblEntry X64CostTbl[] = { // 64-bit targets { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV { ISD::BITREVERSE, MVT::i64, 14 }, + { ISD::BSWAP, MVT::i64, 1 }, { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH { ISD::CTPOP, MVT::i64, 10 }, @@ -2708,6 +2709,8 @@ { ISD::BITREVERSE, MVT::i32, 14 }, { ISD::BITREVERSE, MVT::i16, 14 }, { ISD::BITREVERSE, MVT::i8, 11 }, + { ISD::BSWAP, MVT::i32, 1 }, + { ISD::BSWAP, MVT::i16, 1 }, // MOVZX + ROL by 8 { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV @@ -2919,6 +2922,17 @@ return adjustTableCost(*Entry, LT.first, ICA.getFlags()); } + if (ST->hasMOVBE()) { + if (const Instruction *II = ICA.getInst()) { + if (II->hasOneUse() && isa<StoreInst>(II->user_back())) + return TTI::TCC_Free; + if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) { + if (LI->hasOneUse()) + return TTI::TCC_Free; + } + } + } + // TODO - add BMI (TZCNT) scalar handling if (ST->is64Bit()) diff --git a/llvm/test/Analysis/CostModel/X86/bswap-store.ll b/llvm/test/Analysis/CostModel/X86/bswap-store.ll --- a/llvm/test/Analysis/CostModel/X86/bswap-store.ll +++ b/llvm/test/Analysis/CostModel/X86/bswap-store.ll @@ -11,12 +11,12 @@ define void @var_bswap_store_i16(i16 %a, i16* %dst) { ; NOMOVBE-LABEL: 'var_bswap_store_i16' -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) +; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) ; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1 ; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; MOVBE-LABEL: 'var_bswap_store_i16' -; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) +; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) ; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1 ; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -26,17 +26,11 @@ ret void } define void @var_bswap_store_i16_extrause(i16 %a, i16* %dst) { -; NOMOVBE-LABEL: 'var_bswap_store_i16_extrause' -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1 -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap2 = shl i16 %bswap, 2 -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; MOVBE-LABEL: 'var_bswap_store_i16_extrause' -; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) -; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1 -; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap2 = shl i16 %bswap, 2 -; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; 
ALL-LABEL: 'var_bswap_store_i16_extrause' +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1 +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap2 = shl i16 %bswap, 2 +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %bswap = call i16 @llvm.bswap.i16(i16 %a) store i16 %bswap, i16* %dst, align 1 @@ -47,10 +41,15 @@ } define void @var_bswap_store_i32(i32 %a, i32* %dst) { -; ALL-LABEL: 'var_bswap_store_i32' -; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a) -; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %bswap, i32* %dst, align 1 -; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; NOMOVBE-LABEL: 'var_bswap_store_i32' +; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a) +; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %bswap, i32* %dst, align 1 +; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; MOVBE-LABEL: 'var_bswap_store_i32' +; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a) +; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %bswap, i32* %dst, align 1 +; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %bswap = call i32 @llvm.bswap.i32(i32 %a) store i32 %bswap, i32* %dst, align 1 @@ -73,16 +72,6 @@ } define void @var_bswap_store_i64(i64 %a, i64* %dst) { -; X64-LABEL: 'var_bswap_store_i64' -; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a) -; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 %bswap, i64* %dst, align 1 -; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; X86-LABEL: 'var_bswap_store_i64' -; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a) -; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 %bswap, i64* %dst, align 1 -; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; %bswap = call i64 @llvm.bswap.i64(i64 %a) store i64 %bswap, i64* %dst, align 1 @@ -96,7 +85,7 @@ ; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; X86-LABEL: 'var_bswap_store_i64_extrause' -; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a) +; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a) ; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 %bswap, i64* %dst, align 1 ; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap2 = shl i64 %bswap, 2 ; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -110,16 +99,6 @@ } define void @var_bswap_store_i128(i128 %a, i128* %dst) { -; X64-LABEL: 'var_bswap_store_i128' -; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) -; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i128 %bswap, i128* %dst, align 1 -; 
X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; X86-LABEL: 'var_bswap_store_i128' -; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) -; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 %bswap, i128* %dst, align 1 -; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; %bswap = call i128 @llvm.bswap.i128(i128 %a) store i128 %bswap, i128* %dst, align 1 @@ -127,13 +106,13 @@ } define void @var_bswap_store_i128_extrause(i128 %a, i128* %dst) { ; X64-LABEL: 'var_bswap_store_i128_extrause' -; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) +; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) ; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i128 %bswap, i128* %dst, align 1 ; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap2 = shl i128 %bswap, 2 ; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; X86-LABEL: 'var_bswap_store_i128_extrause' -; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) +; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) ; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 %bswap, i128* %dst, align 1 ; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap2 = shl i128 %bswap, 2 ; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void diff --git a/llvm/test/Analysis/CostModel/X86/bswap.ll b/llvm/test/Analysis/CostModel/X86/bswap.ll --- a/llvm/test/Analysis/CostModel/X86/bswap.ll +++ b/llvm/test/Analysis/CostModel/X86/bswap.ll @@ -12,13 +12,9 @@ ; Verify the cost of scalar bswap instructions. 
define i16 @var_bswap_i16(i16 %a) { -; NOMOVBE-LABEL: 'var_bswap_i16' -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap -; -; MOVBE-LABEL: 'var_bswap_i16' -; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) -; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap +; ALL-LABEL: 'var_bswap_i16' +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap ; %bswap = call i16 @llvm.bswap.i16(i16 %a) ret i16 %bswap @@ -39,7 +35,7 @@ ; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap ; ; X86-LABEL: 'var_bswap_i64' -; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a) +; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a) ; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap ; %bswap = call i64 @llvm.bswap.i64(i64 %a) @@ -48,11 +44,11 @@ define i128 @var_bswap_i128(i128 %a) { ; X64-LABEL: 'var_bswap_i128' -; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) +; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) ; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap ; ; X86-LABEL: 'var_bswap_i128' -; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) +; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) ; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap ; %bswap = call i128 @llvm.bswap.i128(i128 %a) diff --git a/llvm/test/Analysis/CostModel/X86/load-bswap.ll b/llvm/test/Analysis/CostModel/X86/load-bswap.ll --- a/llvm/test/Analysis/CostModel/X86/load-bswap.ll +++ b/llvm/test/Analysis/CostModel/X86/load-bswap.ll @@ -12,12 +12,12 @@ define i16 @var_load_bswap_i16(i16* %src) { ; NOMOVBE-LABEL: 'var_load_bswap_i16' ; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1 -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) +; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) ; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap ; ; MOVBE-LABEL: 'var_load_bswap_i16' ; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1 -; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) +; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) ; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap ; %a = load i16, i16* %src, align 1 @@ -26,19 +26,12 @@ ret i16 %bswap } define i16 @var_load_bswap_i16_extrause(i16* %src, i16* %clobberdst) { -; NOMOVBE-LABEL: 'var_load_bswap_i16_extrause' -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 
1 for instruction: %a = load i16, i16* %src, align 1 -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a2 = shl i16 %a, 2 -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %a2, i16* %clobberdst, align 1 -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap -; -; MOVBE-LABEL: 'var_load_bswap_i16_extrause' -; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1 -; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) -; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a2 = shl i16 %a, 2 -; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %a2, i16* %clobberdst, align 1 -; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap +; ALL-LABEL: 'var_load_bswap_i16_extrause' +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1 +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a2 = shl i16 %a, 2 +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %a2, i16* %clobberdst, align 1 +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap ; %a = load i16, i16* %src, align 1 %bswap = call i16 @llvm.bswap.i16(i16 %a) @@ -50,10 +43,15 @@ } define i32 @var_load_bswap_i32(i32* %src) { -; ALL-LABEL: 'var_load_bswap_i32' -; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i32, i32* %src, align 1 -; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a) -; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bswap +; NOMOVBE-LABEL: 'var_load_bswap_i32' +; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i32, i32* %src, align 1 +; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a) +; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bswap +; +; MOVBE-LABEL: 'var_load_bswap_i32' +; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i32, i32* %src, align 1 +; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a) +; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bswap ; %a = load i32, i32* %src, align 1 %bswap = call i32 @llvm.bswap.i32(i32 %a) @@ -78,16 +76,6 @@ } define i64 @var_load_bswap_i64(i64* %src) { -; X64-LABEL: 'var_load_bswap_i64' -; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i64, i64* %src, align 1 -; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a) -; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap -; -; X86-LABEL: 'var_load_bswap_i64' -; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a = load i64, i64* %src, align 1 -; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a) -; X86-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: ret i64 %bswap -; %a = load i64, i64* %src, align 1 %bswap = call i64 @llvm.bswap.i64(i64 %a) @@ -103,7 +91,7 @@ ; ; X86-LABEL: 'var_load_bswap_i64_extrause' ; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a = load i64, i64* %src, align 1 -; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a) +; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a) ; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a2 = shl i64 %a, 2 ; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 %a2, i64* %clobberdst, align 1 ; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap @@ -118,16 +106,6 @@ } define i128 @var_load_bswap_i128(i128* %src) { -; X64-LABEL: 'var_load_bswap_i128' -; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a = load i128, i128* %src, align 1 -; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) -; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap -; -; X86-LABEL: 'var_load_bswap_i128' -; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a = load i128, i128* %src, align 1 -; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) -; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap -; %a = load i128, i128* %src, align 1 %bswap = call i128 @llvm.bswap.i128(i128 %a) @@ -136,14 +114,14 @@ define i128 @var_load_bswap_i128_extrause(i128* %src, i128* %clobberdst) { ; X64-LABEL: 'var_load_bswap_i128_extrause' ; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a = load i128, i128* %src, align 1 -; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) +; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) ; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a2 = shl i128 %a, 2 ; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i128 %a2, i128* %clobberdst, align 1 ; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap ; ; X86-LABEL: 'var_load_bswap_i128_extrause' ; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a = load i128, i128* %src, align 1 -; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) +; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) ; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a2 = shl i128 %a, 2 ; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 %a2, i128* %clobberdst, align 1 ; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll @@ -42,18 +42,30 @@ ; SSE-NEXT: ret void ; ; SLM-LABEL: @abs_v8i64( -; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, 
<2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP1]], i1 false) -; SLM-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP2]], i1 false) -; SLM-NEXT: [[TMP7:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP3]], i1 false) -; SLM-NEXT: [[TMP8:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP4]], i1 false) -; SLM-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP7]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 +; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 +; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 +; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 +; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 +; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 +; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 +; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 +; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.abs.i64(i64 [[A0]], i1 false) +; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.abs.i64(i64 [[A1]], i1 false) +; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.abs.i64(i64 [[A2]], i1 false) +; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.abs.i64(i64 [[A3]], i1 false) +; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.abs.i64(i64 [[A4]], i1 false) +; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.abs.i64(i64 [[A5]], i1 false) +; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.abs.i64(i64 [[A6]], i1 false) +; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.abs.i64(i64 [[A7]], i1 false) +; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 +; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 +; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 +; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 +; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 +; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 +; SLM-NEXT: store i64 [[R6]], i64* 
getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 +; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @abs_v8i64( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll @@ -62,40 +62,31 @@ ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v8i64( -; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A0]], i64 [[B0]]) -; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A1]], i64 [[B1]]) -; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A2]], i64 [[B2]]) -; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A3]], i64 [[B3]]) -; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A4]], i64 [[B4]]) -; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A5]], i64 [[B5]]) -; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A6]], i64 [[B6]]) -; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A7]], i64 [[B7]]) -; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 
x i64]* @c64, i32 0, i64 4), align 8 -; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) +; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) +; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) +; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; +; AVX512-LABEL: @add_v8i64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) +; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8 +; AVX512-NEXT: ret void +; ; AVX1-LABEL: @add_v8i64( ; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 @@ -114,7 +105,6 @@ ; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 ; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* 
getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; AVX1-NEXT: ret void -; ; AVX2-LABEL: @add_v8i64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 @@ -125,14 +115,6 @@ ; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 ; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX2-NEXT: ret void -; -; AVX512-LABEL: @add_v8i64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 -; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8 -; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) -; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8 -; AVX512-NEXT: ret void -; ; AVX256BW-LABEL: @add_v8i64( ; AVX256BW-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX256BW-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 @@ -143,7 +125,6 @@ ; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 ; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256BW-NEXT: ret void -; %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll @@ -26,41 +26,6 @@ declare i8 @llvm.uadd.sat.i8 (i8 , i8 ) define void @add_v8i64() { -; SSE-LABEL: @add_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr 
inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A5]], i64 [[B5]]) -; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 -; SSE-NEXT: ret void -; ; AVX-LABEL: @add_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll @@ -62,40 +62,31 @@ ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v8i64( -; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x 
i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A0]], i64 [[B0]]) -; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A1]], i64 [[B1]]) -; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A2]], i64 [[B2]]) -; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A3]], i64 [[B3]]) -; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A4]], i64 [[B4]]) -; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A5]], i64 [[B5]]) -; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A6]], i64 [[B6]]) -; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A7]], i64 [[B7]]) -; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), 
align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) +; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) +; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) +; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; +; AVX512-LABEL: @sub_v8i64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) +; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8 +; AVX512-NEXT: ret void +; ; AVX1-LABEL: @sub_v8i64( ; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 @@ -114,7 +105,6 @@ ; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 ; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; AVX1-NEXT: ret void -; ; AVX2-LABEL: @sub_v8i64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 @@ -125,14 +115,6 @@ ; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 ; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX2-NEXT: ret void -; -; AVX512-LABEL: @sub_v8i64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 -; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8 -; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) -; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8 -; AVX512-NEXT: ret void -; ; AVX256BW-LABEL: @sub_v8i64( 
; AVX256BW-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX256BW-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 @@ -143,7 +125,6 @@ ; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 ; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256BW-NEXT: ret void -; %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll @@ -26,41 +26,6 @@ declare i8 @llvm.usub.sat.i8 (i8 , i8 ) define void @sub_v8i64() { -; SSE-LABEL: @sub_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A5]], i64 
[[B5]]) -; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 -; SSE-NEXT: ret void -; ; AVX-LABEL: @sub_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll b/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll @@ -22,11 +22,17 @@ declare i8 @llvm.bitreverse.i8(i8) define void @bitreverse_2i64() #0 { -; CHECK-LABEL: @bitreverse_2i64( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]]) -; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 -; CHECK-NEXT: ret void +; SSE-LABEL: @bitreverse_2i64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]]) +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: ret void +; +; XOP-LABEL: @bitreverse_2i64( +; XOP-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8 +; XOP-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]]) +; XOP-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 +; XOP-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8