diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -790,6 +790,9 @@
       return true;
     }
 
+    bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
+                                   unsigned &Cost) const override;
+
     bool isCtlzFast() const override {
       return true;
     }
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1621,6 +1621,49 @@
   return VT.isScalarInteger();
 }
 
+/// Decide whether storing one extracted vector element can be combined with
+/// the extract itself.
+///
+/// The combine is accepted only on 64-bit subtargets with VSX, for integer
+/// vectors, and only when \p Idx is a constant equal to the one element index
+/// that can be stored without conversion: for 32-bit elements (requires P8
+/// vector support) index 2 on little endian and 1 on big endian; for 64-bit
+/// elements index 1 on little endian and 0 on big endian.
+///
+/// \param VectorTy Type of the vector operand of the extract.
+/// \param Idx Index operand of the extract.
+/// \param Cost Set to 1 when the combine is accepted; untouched otherwise.
+/// \returns true if the store and the extract can be combined.
+bool PPCTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
+                                                  unsigned &Cost) const {
+  if (!Subtarget.isPPC64() || !Subtarget.hasVSX())
+    return false;
+  // Expect the extract of one constant-index element from an integer vector.
+  auto *CIdx = dyn_cast<ConstantInt>(Idx);
+  if (!CIdx)
+    return false;
+  auto *VTy = dyn_cast<VectorType>(VectorTy);
+  if (!VTy || !VTy->getScalarType()->isIntegerTy())
+    return false;
+  // The element index defines the region that can be stored without conversion.
+  // It depends on store width and endianness.
+  unsigned BitWidth = VTy->getScalarSizeInBits();
+  unsigned ElemIdx;
+  if (BitWidth == 32 && Subtarget.hasP8Vector())
+    ElemIdx = Subtarget.isLittleEndian() ? 2 : 1;
+  else if (BitWidth == 64)
+    ElemIdx = Subtarget.isLittleEndian() ? 1 : 0;
+  else
+    return false;
+  // Accept the combine only if the element index matches the index from the
+  // extract operation. equalsInt() compares the whole value, so an index
+  // constant wider than 64 bits cannot trip the getZExtValue() assertion.
+  if (!CIdx->equalsInt(ElemIdx))
+    return false;
+  Cost = 1;
+  return true;
+}
+
 const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch ((PPCISD::NodeType)Opcode) {
   case PPCISD::FIRST_NUMBER: break;
diff --git a/llvm/test/CodeGen/PowerPC/p10-fi-elim.ll b/llvm/test/CodeGen/PowerPC/p10-fi-elim.ll
--- a/llvm/test/CodeGen/PowerPC/p10-fi-elim.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-fi-elim.ll
@@ -27,38 +27,36 @@
 ; CHECK-NEXT: .cfi_def_cfa_offset 80
 ; CHECK-NEXT: .cfi_offset lr, 16
 ; CHECK-NEXT: lxv v2, 0(r3)
+; CHECK-NEXT: lbz r7, 2(r7)
 ; CHECK-NEXT: mr r9, r6
 ; CHECK-NEXT: mr r6, r5
-; CHECK-NEXT: li r12, -127
-; CHECK-NEXT: li r0, 4
-; CHECK-NEXT: stb r12, 0(r3)
-; CHECK-NEXT: li r2, 1
-; CHECK-NEXT: std r0, 0(r3)
-; CHECK-NEXT: stw r2, 0(r3)
-; CHECK-NEXT: li r11, 3
-; CHECK-NEXT: stb r11, 0(0)
-; CHECK-NEXT: mfvsrd r5, v2
-; CHECK-NEXT: vaddudm v3, v2, v2
+; CHECK-NEXT: li r5, 3
+; CHECK-NEXT: li r10, -127
+; CHECK-NEXT: li r11, 4
+; CHECK-NEXT: stb r5, 0(0)
+; CHECK-NEXT: li r12, 1
+; CHECK-NEXT: stb r10, 0(r3)
+; CHECK-NEXT: std r11, 0(r3)
+; CHECK-NEXT: stw r12, 0(r3)
+; CHECK-NEXT: vnegd v3, v2
+; CHECK-NEXT: vaddudm v4, v2, v2
 ; CHECK-NEXT: pstxv v2, 64(r1), 0
-; CHECK-NEXT: neg r5, r5
-; CHECK-NEXT: mfvsrd r10, v3
-; CHECK-NEXT: std r5, 0(r3)
-; CHECK-NEXT: lbz r5, 2(r7)
+; CHECK-NEXT: stxsd v3, 0(r3)
+; CHECK-NEXT: stb r5, 0(r3)
+; CHECK-NEXT: rlwinm r5, r7, 0, 27, 27
 ; CHECK-NEXT: mr r7, r9
-; CHECK-NEXT: stb r11, 0(r3)
-; CHECK-NEXT: stb r12, 0(r3)
-; CHECK-NEXT: std r2, 0(r3)
-; CHECK-NEXT: neg r10, r10
-; CHECK-NEXT: rlwinm r5, r5, 0, 27, 27
+; CHECK-NEXT: stb r10, 0(r3)
+; CHECK-NEXT: std r12, 0(r3)
+; CHECK-NEXT: vnegd v2, v4
 ; CHECK-NEXT: stb r5, 0(0)
 ; CHECK-NEXT: lbz r5, 2(r8)
 ; CHECK-NEXT: rlwinm r5, r5, 0, 27, 27
 ; CHECK-NEXT: stb r5, 0(r3)
 ; CHECK-NEXT: li r5, 2
-; CHECK-NEXT: std r0, 0(r3)
+; CHECK-NEXT: std r11, 0(r3)
 ; CHECK-NEXT: stw r5, 0(r3)
 ; CHECK-NEXT: mr r5, r4
-; CHECK-NEXT: std r10, 0(r3)
+; 
CHECK-NEXT: stxsd v2, 0(r3) ; CHECK-NEXT: bl foo@notoc ; CHECK-NEXT: extsw r3, r3 ; CHECK-NEXT: addi r1, r1, 80 diff --git a/llvm/test/CodeGen/PowerPC/vector-promotion.ll b/llvm/test/CodeGen/PowerPC/vector-promotion.ll --- a/llvm/test/CodeGen/PowerPC/vector-promotion.ll +++ b/llvm/test/CodeGen/PowerPC/vector-promotion.ll @@ -1,4 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -codegenprepare -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 %s -o - -S | FileCheck --check-prefix=IR-BOTH-COMMON --check-prefix=IR-NORMAL-COMMON --check-prefix=IR-NORMAL-P8P9 %s +; RUN: opt -codegenprepare -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 %s -o - -S -stress-cgp-store-extract | FileCheck --check-prefix=IR-BOTH-COMMON --check-prefix=IR-STRESS-COMMON %s +; RUN: opt -codegenprepare -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 %s -o - -S | FileCheck --check-prefix=IR-BOTH-COMMON --check-prefix=IR-NORMAL-COMMON --check-prefix=IR-NORMAL-P8P9 %s +; RUN: opt -codegenprepare -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 %s -o - -S -stress-cgp-store-extract | FileCheck --check-prefix=IR-BOTH-COMMON --check-prefix=IR-STRESS-COMMON %s +; RUN: opt -codegenprepare -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr10 %s -o - -S | FileCheck --check-prefix=IR-BOTH-COMMON --check-prefix=IR-NORMAL-COMMON --check-prefix=IR-NORMAL-P10 %s +; RUN: opt -codegenprepare -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr10 %s -o - -S -stress-cgp-store-extract | FileCheck --check-prefix=IR-BOTH-COMMON --check-prefix=IR-STRESS-COMMON %s + +; RUN: opt -codegenprepare -mtriple=powerpc64-ibm-aix -mcpu=pwr9 %s -o - -S | FileCheck --check-prefix=IR-BE-BOTH-COMMON --check-prefix=IR-BE-NORMAL-P9 %s +; RUN: opt -codegenprepare -mtriple=powerpc64-ibm-aix -mcpu=pwr9 %s -o - -S -stress-cgp-store-extract | FileCheck --check-prefix=IR-BE-BOTH-COMMON --check-prefix=IR-BE-STRESS-COMMON %s +; RUN: opt -codegenprepare 
-mtriple=powerpc64-ibm-aix -mcpu=pwr10 %s -o - -S | FileCheck --check-prefix=IR-BE-BOTH-COMMON --check-prefix=IR-BE-NORMAL-P10 %s +; RUN: opt -codegenprepare -mtriple=powerpc64-ibm-aix -mcpu=pwr10 %s -o - -S -stress-cgp-store-extract | FileCheck --check-prefix=IR-BE-BOTH-COMMON --check-prefix=IR-BE-STRESS-COMMON %s + ; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 %s -o - | FileCheck --check-prefix=ASM-P8 %s ; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 %s -o - -stress-cgp-store-extract | FileCheck --check-prefix=ASM-STRESS-P8 %s ; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 %s -o - | FileCheck --check-prefix=ASM-P9 %s @@ -11,12 +23,21 @@ ; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr10 %s -o - | FileCheck --check-prefix=ASM-P10-BE %s ; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr10 %s -o - -stress-cgp-store-extract | FileCheck --check-prefix=ASM-STRESS-P10-BE %s +; IR-BOTH-COMMON-LABEL: @chainOfInstructionsToPromote +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x i32>, ptr %addr1 +; IR-BOTH-COMMON-NEXT: [[VECTOR_OR1:%[a-zA-Z_0-9-]+]] = or <4 x i32> [[LOAD]], +; IR-BOTH-COMMON-NEXT: [[VECTOR_OR2:%[a-zA-Z_0-9-]+]] = or <4 x i32> [[VECTOR_OR1]], +; IR-BOTH-COMMON-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[VECTOR_OR2]], i32 2 +; IR-BOTH-COMMON-NEXT: store i32 [[EXTRACT]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @chainOfInstructionsToPromote(ptr %addr1, ptr %dest) { ; ASM-P8-LABEL: chainOfInstructionsToPromote: ; ASM-P8: # %bb.0: -; ASM-P8-NEXT: lwz 3, 8(3) -; ASM-P8-NEXT: ori 3, 3, 1 -; ASM-P8-NEXT: stw 3, 0(4) +; ASM-P8-NEXT: lxvd2x 0, 0, 3 +; ASM-P8-NEXT: vspltisw 2, 1 +; ASM-P8-NEXT: xxswapd 35, 0 +; ASM-P8-NEXT: xxlor 0, 35, 34 +; ASM-P8-NEXT: stfiwx 0, 0, 4 ; ASM-P8-NEXT: blr ; ; ASM-STRESS-P8-LABEL: chainOfInstructionsToPromote: @@ -30,9 +51,10 @@ ; ; ASM-P9-LABEL: chainOfInstructionsToPromote: ; ASM-P9: # %bb.0: -; ASM-P9-NEXT: lwz 3, 8(3) -; ASM-P9-NEXT: 
ori 3, 3, 1 -; ASM-P9-NEXT: stw 3, 0(4) +; ASM-P9-NEXT: lxv 0, 0(3) +; ASM-P9-NEXT: vspltisw 2, 1 +; ASM-P9-NEXT: xxlor 0, 0, 34 +; ASM-P9-NEXT: stfiwx 0, 0, 4 ; ASM-P9-NEXT: blr ; ; ASM-STRESS-P9-LABEL: chainOfInstructionsToPromote: @@ -45,9 +67,10 @@ ; ; ASM-P10-LABEL: chainOfInstructionsToPromote: ; ASM-P10: # %bb.0: -; ASM-P10-NEXT: lwz 3, 8(3) -; ASM-P10-NEXT: ori 3, 3, 1 -; ASM-P10-NEXT: stw 3, 0(4) +; ASM-P10-NEXT: lxv 0, 0(3) +; ASM-P10-NEXT: vspltisw 2, 1 +; ASM-P10-NEXT: xxlor 0, 0, 34 +; ASM-P10-NEXT: stfiwx 0, 0, 4 ; ASM-P10-NEXT: blr ; ; ASM-STRESS-P10-LABEL: chainOfInstructionsToPromote: @@ -65,6 +88,13 @@ ret void } +; IR-BE-BOTH-COMMON-LABEL: @chainOfInstructionsToPromoteBE +; IR-BE-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x i32>, ptr %addr1 +; IR-BE-BOTH-COMMON-NEXT: [[VECTOR_OR1:%[a-zA-Z_0-9-]+]] = or <4 x i32> [[LOAD]], +; IR-BE-BOTH-COMMON-NEXT: [[VECTOR_OR2:%[a-zA-Z_0-9-]+]] = or <4 x i32> [[VECTOR_OR1]], +; IR-BE-BOTH-COMMON-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[VECTOR_OR2]], i32 1 +; IR-BE-BOTH-COMMON-NEXT: store i32 [[EXTRACT]], ptr %dest +; IR-BE-BOTH-COMMON-NEXT: ret define void @chainOfInstructionsToPromoteBE(ptr %addr1, ptr %dest) { ; ASM-P9-BE-LABEL: chainOfInstructionsToPromoteBE: ; ASM-P9-BE: # %bb.0: @@ -105,6 +135,17 @@ ret void } +; IR-BOTH-COMMON-LABEL: @fdivCaseFloat +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x float>, ptr %addr1 +; Scalar version: +; IR-NORMAL-COMMON-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x float> [[LOAD]], i32 2 +; IR-NORMAL-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = fdiv float [[EXTRACT]], 7.0 +; Vector version: +; IR-STRESS-COMMON-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = fdiv <4 x float> [[LOAD]], +; IR-STRESS-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <4 x float> [[DIV]], i32 2 +; +; IR-BOTH-COMMON-NEXT: store float [[RES]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @fdivCaseFloat(ptr %addr1, ptr %dest) { ; ASM-P8-LABEL: fdivCaseFloat: ; ASM-P8: 
# %bb.0: @@ -170,6 +211,17 @@ ret void } +; IR-BOTH-COMMON-LABEL: @fdivCaseDouble +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x double>, ptr %addr1 +; Scalar version: +; IR-NORMAL-COMMON-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x double> [[LOAD]], i32 1 +; IR-NORMAL-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = fdiv double [[EXTRACT]], 7.0 +; Vector version: +; IR-STRESS-COMMON-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = fdiv <2 x double> [[LOAD]], +; IR-STRESS-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x double> [[DIV]], i32 1 +; +; IR-BOTH-COMMON-NEXT: store double [[RES]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @fdivCaseDouble(ptr %addr1, ptr %dest) { ; ASM-P8-LABEL: fdivCaseDouble: ; ASM-P8: # %bb.0: @@ -232,6 +284,17 @@ ret void } +; IR-BOTH-COMMON-LABEL: @fremCaseFloat +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x float>, ptr %addr1 +; Scalar version: +; IR-NORMAL-COMMON-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x float> [[LOAD]], i32 2 +; IR-NORMAL-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem float [[EXTRACT]], 7.0 +; Vector version: +; IR-STRESS-COMMON-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem <4 x float> [[LOAD]], +; IR-STRESS-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <4 x float> [[DIV]], i32 2 +; +; IR-BOTH-COMMON-NEXT: store float [[RES]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @fremCaseFloat(ptr %addr1, ptr %dest) { ; ASM-P8-LABEL: fremCaseFloat: ; ASM-P8: # %bb.0: @@ -377,6 +440,17 @@ ret void } +; IR-BOTH-COMMON-LABEL: @fremCaseDouble +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x double>, ptr %addr1 +; Scalar version: +; IR-NORMAL-COMMON-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x double> [[LOAD]], i32 1 +; IR-NORMAL-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem double [[EXTRACT]], 7.0 +; Vector version: +; IR-STRESS-COMMON-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem <2 x double> [[LOAD]], +; IR-STRESS-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x double> 
[[DIV]], i32 1 +; +; IR-BOTH-COMMON-NEXT: store double [[RES]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @fremCaseDouble(ptr %addr1, ptr %dest) { ; ASM-P8-LABEL: fremCaseDouble: ; ASM-P8: # %bb.0: @@ -518,6 +592,19 @@ ret void } +; IR-BOTH-COMMON-LABEL: @sdivCase32 +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x i32>, ptr %addr1 +; Scalar version: +; IR-NORMAL-P8P9: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[LOAD]], i32 2 +; IR-NORMAL-P8P9-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = sdiv i32 [[EXTRACT]], 7 +; IR-NORMAL-P10: [[DIV:%[a-zA-Z_0-9-]+]] = sdiv <4 x i32> [[LOAD]], +; IR-NORMAL-P10-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[DIV]], i32 2 +; Vector version: +; IR-STRESS-COMMON: [[DIV:%[a-zA-Z_0-9-]+]] = sdiv <4 x i32> [[LOAD]], +; IR-STRESS-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[DIV]], i32 2 +; +; IR-BOTH-COMMON: store i32 [[RES]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @sdivCase32(ptr %addr1, ptr %dest) { ; ASM-P8-LABEL: sdivCase32: ; ASM-P8: # %bb.0: @@ -575,14 +662,16 @@ ; ; ASM-P10-LABEL: sdivCase32: ; ASM-P10: # %bb.0: -; ASM-P10-NEXT: lwz 3, 8(3) -; ASM-P10-NEXT: pli 5, -1840700269 -; ASM-P10-NEXT: mulhw 5, 3, 5 -; ASM-P10-NEXT: add 3, 5, 3 -; ASM-P10-NEXT: srwi 5, 3, 31 -; ASM-P10-NEXT: srawi 3, 3, 2 -; ASM-P10-NEXT: add 3, 3, 5 -; ASM-P10-NEXT: stw 3, 0(4) +; ASM-P10-NEXT: lxv 34, 0(3) +; ASM-P10-NEXT: xxspltiw 35, -1840700269 +; ASM-P10-NEXT: vspltisw 4, 2 +; ASM-P10-NEXT: vmulhsw 3, 2, 3 +; ASM-P10-NEXT: vadduwm 2, 3, 2 +; ASM-P10-NEXT: xxspltiw 35, 31 +; ASM-P10-NEXT: vsrw 3, 2, 3 +; ASM-P10-NEXT: vsraw 2, 2, 4 +; ASM-P10-NEXT: vadduwm 2, 2, 3 +; ASM-P10-NEXT: stxsiwx 34, 0, 4 ; ASM-P10-NEXT: blr ; ; ASM-STRESS-P10-LABEL: sdivCase32: @@ -605,6 +694,19 @@ ret void } +; IR-BOTH-COMMON-LABEL: @sdivCase64 +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i64>, ptr %addr1 +; Scalar version: +; IR-NORMAL-P8P9: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i64> [[LOAD]], 
i32 1 +; IR-NORMAL-P8P9-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = sdiv i64 [[EXTRACT]], 7 +; IR-NORMAL-P10: [[DIV:%[a-zA-Z_0-9-]+]] = sdiv <2 x i64> [[LOAD]], +; IR-NORMAL-P10-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i64> [[DIV]], i32 1 +; Vector version: +; IR-STRESS-COMMON: [[DIV:%[a-zA-Z_0-9-]+]] = sdiv <2 x i64> [[LOAD]], +; IR-STRESS-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i64> [[DIV]], i32 1 +; +; IR-BOTH-COMMON: store i64 [[RES]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @sdivCase64(ptr %addr1, ptr %dest) { ; ASM-P8-LABEL: sdivCase64: ; ASM-P8: # %bb.0: @@ -672,15 +774,18 @@ ; ; ASM-P10-LABEL: sdivCase64: ; ASM-P10: # %bb.0: -; ASM-P10-NEXT: ld 3, 8(3) -; ASM-P10-NEXT: pli 5, 1227133513 -; ASM-P10-NEXT: pli 6, 613566757 -; ASM-P10-NEXT: rldimi 6, 5, 32, 0 -; ASM-P10-NEXT: mulhd 3, 3, 6 -; ASM-P10-NEXT: rldicl 5, 3, 1, 63 -; ASM-P10-NEXT: sradi 3, 3, 1 -; ASM-P10-NEXT: add 3, 3, 5 -; ASM-P10-NEXT: std 3, 0(4) +; ASM-P10-NEXT: xxsplti32dx 35, 0, 1227133513 +; ASM-P10-NEXT: lxv 34, 0(3) +; ASM-P10-NEXT: xxlxor 36, 36, 36 +; ASM-P10-NEXT: xxsplti32dx 36, 1, 63 +; ASM-P10-NEXT: xxsplti32dx 35, 1, 613566757 +; ASM-P10-NEXT: vmulhsd 2, 2, 3 +; ASM-P10-NEXT: xxlxor 35, 35, 35 +; ASM-P10-NEXT: xxsplti32dx 35, 1, 1 +; ASM-P10-NEXT: vsrd 4, 2, 4 +; ASM-P10-NEXT: vsrad 2, 2, 3 +; ASM-P10-NEXT: vaddudm 2, 2, 4 +; ASM-P10-NEXT: stxsd 2, 0(4) ; ASM-P10-NEXT: blr ; ; ASM-STRESS-P10-LABEL: sdivCase64: @@ -705,6 +810,19 @@ ret void } +; IR-BOTH-COMMON-LABEL: @sremCase64 +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i64>, ptr %addr1 +; Scalar version: +; IR-NORMAL-P8P9: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i64> [[LOAD]], i32 1 +; IR-NORMAL-P8P9-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = srem i64 [[EXTRACT]], 7 +; IR-NORMAL-P10: [[DIV:%[a-zA-Z_0-9-]+]] = srem <2 x i64> [[LOAD]], +; IR-NORMAL-P10-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i64> [[DIV]], i32 1 +; Vector version: +; IR-STRESS-COMMON: [[DIV:%[a-zA-Z_0-9-]+]] = srem 
<2 x i64> [[LOAD]], +; IR-STRESS-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i64> [[DIV]], i32 1 +; +; IR-BOTH-COMMON: store i64 [[RES]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @sremCase64(ptr %addr1, ptr %dest) { ; ASM-P8-LABEL: sremCase64: ; ASM-P8: # %bb.0: @@ -784,18 +902,23 @@ ; ; ASM-P10-LABEL: sremCase64: ; ASM-P10: # %bb.0: -; ASM-P10-NEXT: ld 3, 8(3) -; ASM-P10-NEXT: pli 5, 1227133513 -; ASM-P10-NEXT: pli 6, 613566757 -; ASM-P10-NEXT: rldimi 6, 5, 32, 0 -; ASM-P10-NEXT: mulhd 5, 3, 6 -; ASM-P10-NEXT: rldicl 6, 5, 1, 63 -; ASM-P10-NEXT: sradi 5, 5, 1 -; ASM-P10-NEXT: add 5, 5, 6 -; ASM-P10-NEXT: sldi 6, 5, 3 -; ASM-P10-NEXT: sub 5, 5, 6 -; ASM-P10-NEXT: add 3, 3, 5 -; ASM-P10-NEXT: std 3, 0(4) +; ASM-P10-NEXT: xxsplti32dx 35, 0, 1227133513 +; ASM-P10-NEXT: lxv 34, 0(3) +; ASM-P10-NEXT: xxlxor 37, 37, 37 +; ASM-P10-NEXT: xxlxor 32, 32, 32 +; ASM-P10-NEXT: xxlxor 36, 36, 36 +; ASM-P10-NEXT: xxsplti32dx 37, 1, 63 +; ASM-P10-NEXT: xxsplti32dx 32, 1, 1 +; ASM-P10-NEXT: xxsplti32dx 36, 1, 3 +; ASM-P10-NEXT: xxsplti32dx 35, 1, 613566757 +; ASM-P10-NEXT: vmulhsd 3, 2, 3 +; ASM-P10-NEXT: vsrd 5, 3, 5 +; ASM-P10-NEXT: vsrad 3, 3, 0 +; ASM-P10-NEXT: vaddudm 3, 3, 5 +; ASM-P10-NEXT: vsld 4, 3, 4 +; ASM-P10-NEXT: vsubudm 3, 3, 4 +; ASM-P10-NEXT: vaddudm 2, 2, 3 +; ASM-P10-NEXT: stxsd 2, 0(4) ; ASM-P10-NEXT: blr ; ; ASM-STRESS-P10-LABEL: sremCase64: @@ -825,6 +948,19 @@ ret void } +; IR-BOTH-COMMON-LABEL: @sremCase32 +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x i32>, ptr %addr1 +; Scalar version: +; IR-NORMAL-P8P9: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[LOAD]], i32 2 +; IR-NORMAL-P8P9-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = srem i32 [[EXTRACT]], 7 +; IR-NORMAL-P10: [[DIV:%[a-zA-Z_0-9-]+]] = srem <4 x i32> [[LOAD]], +; IR-NORMAL-P10-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[DIV]], i32 2 +; Vector version: +; IR-STRESS-COMMON: [[DIV:%[a-zA-Z_0-9-]+]] = srem <4 x i32> [[LOAD]], +; IR-STRESS-COMMON-NEXT: 
[[RES:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[DIV]], i32 2 +; +; IR-BOTH-COMMON: store i32 [[RES]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @sremCase32(ptr %addr1, ptr %dest) { ; ASM-P8-LABEL: sremCase32: ; ASM-P8: # %bb.0: @@ -894,17 +1030,20 @@ ; ; ASM-P10-LABEL: sremCase32: ; ASM-P10: # %bb.0: -; ASM-P10-NEXT: lwz 3, 8(3) -; ASM-P10-NEXT: pli 5, -1840700269 -; ASM-P10-NEXT: mulhw 5, 3, 5 -; ASM-P10-NEXT: add 5, 5, 3 -; ASM-P10-NEXT: srwi 6, 5, 31 -; ASM-P10-NEXT: srawi 5, 5, 2 -; ASM-P10-NEXT: add 5, 5, 6 -; ASM-P10-NEXT: slwi 6, 5, 3 -; ASM-P10-NEXT: sub 5, 5, 6 -; ASM-P10-NEXT: add 3, 3, 5 -; ASM-P10-NEXT: stw 3, 0(4) +; ASM-P10-NEXT: lxv 34, 0(3) +; ASM-P10-NEXT: xxspltiw 35, -1840700269 +; ASM-P10-NEXT: xxspltiw 36, 31 +; ASM-P10-NEXT: vspltisw 5, 2 +; ASM-P10-NEXT: vmulhsw 3, 2, 3 +; ASM-P10-NEXT: vadduwm 3, 3, 2 +; ASM-P10-NEXT: vsrw 4, 3, 4 +; ASM-P10-NEXT: vsraw 3, 3, 5 +; ASM-P10-NEXT: vadduwm 3, 3, 4 +; ASM-P10-NEXT: vspltisw 4, 3 +; ASM-P10-NEXT: vslw 4, 3, 4 +; ASM-P10-NEXT: vsubuwm 3, 3, 4 +; ASM-P10-NEXT: vadduwm 2, 2, 3 +; ASM-P10-NEXT: stxsiwx 34, 0, 4 ; ASM-P10-NEXT: blr ; ; ASM-STRESS-P10-LABEL: sremCase32: @@ -931,6 +1070,21 @@ ret void } +; Check that we promote with a splat constant when this is a division. +; The NORMAL mode does not promote anything as divisions are not legal. 
+; IR-BOTH-COMMON-LABEL: @udivCase64 +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i64>, ptr %addr1 +; Scalar version: +; IR-NORMAL-P8P9: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i64> [[LOAD]], i32 1 +; IR-NORMAL-P8P9-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = udiv i64 [[EXTRACT]], 7 +; IR-NORMAL-P10: [[DIV:%[a-zA-Z_0-9-]+]] = udiv <2 x i64> [[LOAD]], +; IR-NORMAL-P10-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i64> [[DIV]], i32 1 +; Vector version: +; IR-STRESS-COMMON: [[DIV:%[a-zA-Z_0-9-]+]] = udiv <2 x i64> [[LOAD]], +; IR-STRESS-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i64> [[DIV]], i32 1 +; +; IR-BOTH-COMMON: store i64 [[RES]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @udivCase64(ptr %addr1, ptr %dest) { ; ASM-P8-LABEL: udivCase64: ; ASM-P8: # %bb.0: @@ -1002,16 +1156,19 @@ ; ; ASM-P10-LABEL: udivCase64: ; ASM-P10: # %bb.0: -; ASM-P10-NEXT: ld 3, 8(3) -; ASM-P10-NEXT: pli 5, 613566756 -; ASM-P10-NEXT: pli 6, 2454267027 -; ASM-P10-NEXT: rldimi 6, 5, 32, 0 -; ASM-P10-NEXT: mulhdu 5, 3, 6 -; ASM-P10-NEXT: sub 3, 3, 5 -; ASM-P10-NEXT: rldicl 3, 3, 63, 1 -; ASM-P10-NEXT: add 3, 3, 5 -; ASM-P10-NEXT: rldicl 3, 3, 62, 2 -; ASM-P10-NEXT: std 3, 0(4) +; ASM-P10-NEXT: xxsplti32dx 35, 0, 613566756 +; ASM-P10-NEXT: lxv 34, 0(3) +; ASM-P10-NEXT: xxlxor 37, 37, 37 +; ASM-P10-NEXT: xxlxor 36, 36, 36 +; ASM-P10-NEXT: xxsplti32dx 37, 1, 1 +; ASM-P10-NEXT: xxsplti32dx 36, 1, 2 +; ASM-P10-NEXT: xxsplti32dx 35, 1, -1840700269 +; ASM-P10-NEXT: vmulhud 3, 2, 3 +; ASM-P10-NEXT: vsubudm 2, 2, 3 +; ASM-P10-NEXT: vsrd 2, 2, 5 +; ASM-P10-NEXT: vaddudm 2, 2, 3 +; ASM-P10-NEXT: vsrd 2, 2, 4 +; ASM-P10-NEXT: stxsd 2, 0(4) ; ASM-P10-NEXT: blr ; ; ASM-STRESS-P10-LABEL: udivCase64: @@ -1037,6 +1194,21 @@ ret void } +; Check that we promote with a splat constant when this is a division. +; The NORMAL mode does not promote anything as divisions are not legal. 
+; IR-BOTH-COMMON-LABEL: @udivCase32 +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x i32>, ptr %addr1 +; Scalar version: +; IR-NORMAL-P8P9: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[LOAD]], i32 2 +; IR-NORMAL-P8P9-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = udiv i32 [[EXTRACT]], 7 +; IR-NORMAL-P10: [[DIV:%[a-zA-Z_0-9-]+]] = udiv <4 x i32> [[LOAD]], +; IR-NORMAL-P10-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[DIV]], i32 2 +; Vector version: +; IR-STRESS-COMMON: [[DIV:%[a-zA-Z_0-9-]+]] = udiv <4 x i32> [[LOAD]], +; IR-STRESS-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[DIV]], i32 2 +; +; IR-BOTH-COMMON: store i32 [[RES]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @udivCase32(ptr %addr1, ptr %dest) { ; ASM-P8-LABEL: udivCase32: ; ASM-P8: # %bb.0: @@ -1094,14 +1266,16 @@ ; ; ASM-P10-LABEL: udivCase32: ; ASM-P10: # %bb.0: -; ASM-P10-NEXT: lwz 3, 8(3) -; ASM-P10-NEXT: pli 5, 613566757 -; ASM-P10-NEXT: mulhwu 5, 3, 5 -; ASM-P10-NEXT: sub 3, 3, 5 -; ASM-P10-NEXT: srwi 3, 3, 1 -; ASM-P10-NEXT: add 3, 3, 5 -; ASM-P10-NEXT: srwi 3, 3, 2 -; ASM-P10-NEXT: stw 3, 0(4) +; ASM-P10-NEXT: lxv 34, 0(3) +; ASM-P10-NEXT: xxspltiw 35, 613566757 +; ASM-P10-NEXT: vspltisw 4, 1 +; ASM-P10-NEXT: vmulhuw 3, 2, 3 +; ASM-P10-NEXT: vsubuwm 2, 2, 3 +; ASM-P10-NEXT: vsrw 2, 2, 4 +; ASM-P10-NEXT: vadduwm 2, 2, 3 +; ASM-P10-NEXT: vspltisw 3, 2 +; ASM-P10-NEXT: vsrw 2, 2, 3 +; ASM-P10-NEXT: stxsiwx 34, 0, 4 ; ASM-P10-NEXT: blr ; ; ASM-STRESS-P10-LABEL: udivCase32: @@ -1124,6 +1298,14 @@ ret void } +; Check that we do not promote when we may introduce undefined behavior +; like division by zero. 
+; IR-BOTH-COMMON-LABEL: @undefDivCase64 +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i64>, ptr %addr1 +; IR-BOTH-COMMON-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i64> [[LOAD]], i32 1 +; IR-BOTH-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = udiv i64 7, [[EXTRACT]] +; IR-BOTH-COMMON-NEXT: store i64 [[RES]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @undefDivCase64(ptr %addr1, ptr %dest) { ; ASM-P8-LABEL: undefDivCase64: ; ASM-P8: # %bb.0: @@ -1183,6 +1365,14 @@ ret void } +; Check that we do not promote when we may introduce undefined behavior +; like division by zero. +; IR-BOTH-COMMON-LABEL: @undefDivCase32 +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x i32>, ptr %addr1 +; IR-BOTH-COMMON-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[LOAD]], i32 2 +; IR-BOTH-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = udiv i32 7, [[EXTRACT]] +; IR-BOTH-COMMON-NEXT: store i32 [[RES]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @undefDivCase32(ptr %addr1, ptr %dest) { ; ASM-P8-LABEL: undefDivCase32: ; ASM-P8: # %bb.0: @@ -1238,6 +1428,14 @@ ret void } +; Check that we do not promote when we may introduce undefined behavior +; like division by zero. +; IR-BOTH-COMMON-LABEL: @undefRemCase64 +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i64>, ptr %addr1 +; IR-BOTH-COMMON-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i64> [[LOAD]], i32 1 +; IR-BOTH-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = srem i64 7, [[EXTRACT]] +; IR-BOTH-COMMON-NEXT: store i64 [[RES]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @undefRemCase64(ptr %addr1, ptr %dest) { ; ASM-P8-LABEL: undefRemCase64: ; ASM-P8: # %bb.0: @@ -1301,6 +1499,14 @@ ret void } +; Check that we do not promote when we may introduce undefined behavior +; like division by zero. 
+; IR-BOTH-COMMON-LABEL: @undefRemCase32 +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x i32>, ptr %addr1 +; IR-BOTH-COMMON-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[LOAD]], i32 2 +; IR-BOTH-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = srem i32 7, [[EXTRACT]] +; IR-BOTH-COMMON-NEXT: store i32 [[RES]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @undefRemCase32(ptr %addr1, ptr %dest) { ; ASM-P8-LABEL: undefRemCase32: ; ASM-P8: # %bb.0: @@ -1360,6 +1566,19 @@ ret void } +; IR-BOTH-COMMON-LABEL: @uremCase64 +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i64>, ptr %addr1 +; Scalar version: +; IR-NORMAL-P8P9: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i64> [[LOAD]], i32 1 +; IR-NORMAL-P8P9-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = urem i64 [[EXTRACT]], 7 +; IR-NORMAL-P10: [[DIV:%[a-zA-Z_0-9-]+]] = urem <2 x i64> [[LOAD]], +; IR-NORMAL-P10-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i64> [[DIV]], i32 1 +; Vector version: +; IR-STRESS-COMMON: [[DIV:%[a-zA-Z_0-9-]+]] = urem <2 x i64> [[LOAD]], +; IR-STRESS-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i64> [[DIV]], i32 1 +; +; IR-BOTH-COMMON: store i64 [[RES]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @uremCase64(ptr %addr1, ptr %dest) { ; ASM-P8-LABEL: uremCase64: ; ASM-P8: # %bb.0: @@ -1443,19 +1662,24 @@ ; ; ASM-P10-LABEL: uremCase64: ; ASM-P10: # %bb.0: -; ASM-P10-NEXT: ld 3, 8(3) -; ASM-P10-NEXT: pli 5, 613566756 -; ASM-P10-NEXT: pli 6, 2454267027 -; ASM-P10-NEXT: rldimi 6, 5, 32, 0 -; ASM-P10-NEXT: mulhdu 5, 3, 6 -; ASM-P10-NEXT: sub 6, 3, 5 -; ASM-P10-NEXT: rldicl 6, 6, 63, 1 -; ASM-P10-NEXT: add 5, 6, 5 -; ASM-P10-NEXT: rldicl 6, 5, 62, 2 -; ASM-P10-NEXT: rldicr 5, 5, 1, 60 -; ASM-P10-NEXT: sub 5, 6, 5 -; ASM-P10-NEXT: add 3, 3, 5 -; ASM-P10-NEXT: std 3, 0(4) +; ASM-P10-NEXT: xxsplti32dx 35, 0, 613566756 +; ASM-P10-NEXT: lxv 34, 0(3) +; ASM-P10-NEXT: xxlxor 32, 32, 32 +; ASM-P10-NEXT: xxlxor 37, 37, 37 +; ASM-P10-NEXT: xxsplti32dx 32, 1, 1 +; 
ASM-P10-NEXT: xxsplti32dx 37, 1, 3 +; ASM-P10-NEXT: xxsplti32dx 35, 1, -1840700269 +; ASM-P10-NEXT: vmulhud 3, 2, 3 +; ASM-P10-NEXT: vsubudm 4, 2, 3 +; ASM-P10-NEXT: vsrd 4, 4, 0 +; ASM-P10-NEXT: vaddudm 3, 4, 3 +; ASM-P10-NEXT: xxlxor 36, 36, 36 +; ASM-P10-NEXT: xxsplti32dx 36, 1, 2 +; ASM-P10-NEXT: vsrd 3, 3, 4 +; ASM-P10-NEXT: vsld 4, 3, 5 +; ASM-P10-NEXT: vsubudm 3, 3, 4 +; ASM-P10-NEXT: vaddudm 2, 2, 3 +; ASM-P10-NEXT: stxsd 2, 0(4) ; ASM-P10-NEXT: blr ; ; ASM-STRESS-P10-LABEL: uremCase64: @@ -1486,6 +1710,19 @@ ret void } +; IR-BOTH-COMMON-LABEL: @uremCase32 +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x i32>, ptr %addr1 +; Scalar version: +; IR-NORMAL-P8P9: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[LOAD]], i32 2 +; IR-NORMAL-P8P9-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = urem i32 [[EXTRACT]], 7 +; IR-NORMAL-P10: [[DIV:%[a-zA-Z_0-9-]+]] = urem <4 x i32> [[LOAD]], +; IR-NORMAL-P10-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[DIV]], i32 2 +; Vector version: +; IR-STRESS-COMMON: [[DIV:%[a-zA-Z_0-9-]+]] = urem <4 x i32> [[LOAD]], +; IR-STRESS-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[DIV]], i32 2 +; +; IR-BOTH-COMMON: store i32 [[RES]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @uremCase32(ptr %addr1, ptr %dest) { ; ASM-P8-LABEL: uremCase32: ; ASM-P8: # %bb.0: @@ -1555,17 +1792,20 @@ ; ; ASM-P10-LABEL: uremCase32: ; ASM-P10: # %bb.0: -; ASM-P10-NEXT: lwz 3, 8(3) -; ASM-P10-NEXT: pli 5, 613566757 -; ASM-P10-NEXT: mulhwu 5, 3, 5 -; ASM-P10-NEXT: sub 6, 3, 5 -; ASM-P10-NEXT: srwi 6, 6, 1 -; ASM-P10-NEXT: add 5, 6, 5 -; ASM-P10-NEXT: srwi 6, 5, 2 -; ASM-P10-NEXT: rlwinm 5, 5, 1, 0, 28 -; ASM-P10-NEXT: sub 5, 6, 5 -; ASM-P10-NEXT: add 3, 3, 5 -; ASM-P10-NEXT: stw 3, 0(4) +; ASM-P10-NEXT: lxv 34, 0(3) +; ASM-P10-NEXT: xxspltiw 35, 613566757 +; ASM-P10-NEXT: vspltisw 5, 1 +; ASM-P10-NEXT: vmulhuw 3, 2, 3 +; ASM-P10-NEXT: vsubuwm 4, 2, 3 +; ASM-P10-NEXT: vsrw 4, 4, 5 +; ASM-P10-NEXT: vadduwm 3, 4, 3 +; 
ASM-P10-NEXT: vspltisw 4, 2 +; ASM-P10-NEXT: vsrw 3, 3, 4 +; ASM-P10-NEXT: vspltisw 4, 3 +; ASM-P10-NEXT: vslw 4, 3, 4 +; ASM-P10-NEXT: vsubuwm 3, 3, 4 +; ASM-P10-NEXT: vadduwm 2, 2, 3 +; ASM-P10-NEXT: stxsiwx 34, 0, 4 ; ASM-P10-NEXT: blr ; ; ASM-STRESS-P10-LABEL: uremCase32: @@ -1592,6 +1832,19 @@ ret void } +; IR-BE-BOTH-COMMON-LABEL: @uremCase32BE +; IR-BE-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x i32>, ptr %addr1 +; Scalar version: +; IR-BE-NORMAL-P9: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[LOAD]], i32 1 +; IR-BE-NORMAL-P9-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = urem i32 [[EXTRACT]], 7 +; IR-BE-NORMAL-P10: [[DIV:%[a-zA-Z_0-9-]+]] = urem <4 x i32> [[LOAD]], +; IR-BE-NORMAL-P10-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[DIV]], i32 1 +; Vector version: +; IR-BE-STRESS-COMMON: [[DIV:%[a-zA-Z_0-9-]+]] = urem <4 x i32> [[LOAD]], +; IR-BE-STRESS-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[DIV]], i32 1 +; +; IR-BE-BOTH-COMMON: store i32 [[RES]], ptr %dest +; IR-BE-BOTH-COMMON-NEXT: ret define void @uremCase32BE(ptr %addr1, ptr %dest) { ; ASM-P9-BE-LABEL: uremCase32BE: ; ASM-P9-BE: # %bb.0: @@ -1665,6 +1918,17 @@ ret void } +; IR-BOTH-COMMON-LABEL: @simpleOneInstructionPromotionDouble +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x double>, ptr %addr1 +; Scalar version: +; IR-NORMAL-COMMON-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x double> [[LOAD]], i32 1 +; IR-NORMAL-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = fadd double [[EXTRACT]], 1.0 +; Vector version: +; IR-STRESS-COMMON-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = fadd <2 x double> [[LOAD]], +; IR-STRESS-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x double> [[DIV]], i32 1 +; +; IR-BOTH-COMMON-NEXT: store double [[RES]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @simpleOneInstructionPromotionDouble(ptr %addr1, ptr %dest) { ; ASM-P8-LABEL: simpleOneInstructionPromotionDouble: ; ASM-P8: # %bb.0: @@ -1728,6 +1992,17 @@ ret void } +; 
IR-BOTH-COMMON-LABEL: @simpleOneInstructionPromotionFloat +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x float>, ptr %addr1 +; Scalar version: +; IR-NORMAL-COMMON-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x float> [[LOAD]], i32 2 +; IR-NORMAL-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = fadd float [[EXTRACT]], 1.0 +; Vector version: +; IR-STRESS-COMMON-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = fadd <4 x float> [[LOAD]], +; IR-STRESS-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <4 x float> [[DIV]], i32 2 +; +; IR-BOTH-COMMON-NEXT: store float [[RES]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @simpleOneInstructionPromotionFloat(ptr %addr1, ptr %dest) { ; ASM-P8-LABEL: simpleOneInstructionPromotionFloat: ; ASM-P8: # %bb.0: @@ -1794,6 +2069,17 @@ ret void } +; IR-BOTH-COMMON-LABEL: @simpleOneInstructionPromotion64 +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i64>, ptr %addr1 +; Scalar version: +; IR-NORMAL-COMMON-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i64> [[LOAD]], i32 1 +; IR-NORMAL-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = or i64 [[EXTRACT]], 1 +; Vector version: +; IR-STRESS-COMMON-NEXT: [[VECTOR_OR:%[a-zA-Z_0-9-]+]] = or <2 x i64> [[LOAD]], +; IR-STRESS-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i64> [[VECTOR_OR]], i32 1 +; +; IR-BOTH-COMMON-NEXT: store i64 [[RES]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @simpleOneInstructionPromotion64(ptr %addr1, ptr %dest) { ; ASM-P8-LABEL: simpleOneInstructionPromotion64: ; ASM-P8: # %bb.0: @@ -1855,12 +2141,20 @@ ret void } +; IR-BOTH-COMMON-LABEL: @simpleOneInstructionPromotion32 +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x i32>, ptr %addr1 +; IR-BOTH-COMMON-NEXT: [[VECTOR_OR:%[a-zA-Z_0-9-]+]] = or <4 x i32> [[LOAD]], +; IR-BOTH-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[VECTOR_OR]], i32 2 +; IR-BOTH-COMMON-NEXT: store i32 [[RES]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @simpleOneInstructionPromotion32(ptr 
%addr1, ptr %dest) { ; ASM-P8-LABEL: simpleOneInstructionPromotion32: ; ASM-P8: # %bb.0: -; ASM-P8-NEXT: lwz 3, 8(3) -; ASM-P8-NEXT: ori 3, 3, 1 -; ASM-P8-NEXT: stw 3, 0(4) +; ASM-P8-NEXT: lxvd2x 0, 0, 3 +; ASM-P8-NEXT: vspltisw 2, 1 +; ASM-P8-NEXT: xxswapd 35, 0 +; ASM-P8-NEXT: xxlor 0, 35, 34 +; ASM-P8-NEXT: stfiwx 0, 0, 4 ; ASM-P8-NEXT: blr ; ; ASM-STRESS-P8-LABEL: simpleOneInstructionPromotion32: @@ -1874,9 +2168,10 @@ ; ; ASM-P9-LABEL: simpleOneInstructionPromotion32: ; ASM-P9: # %bb.0: -; ASM-P9-NEXT: lwz 3, 8(3) -; ASM-P9-NEXT: ori 3, 3, 1 -; ASM-P9-NEXT: stw 3, 0(4) +; ASM-P9-NEXT: lxv 0, 0(3) +; ASM-P9-NEXT: vspltisw 2, 1 +; ASM-P9-NEXT: xxlor 0, 0, 34 +; ASM-P9-NEXT: stfiwx 0, 0, 4 ; ASM-P9-NEXT: blr ; ; ASM-STRESS-P9-LABEL: simpleOneInstructionPromotion32: @@ -1889,9 +2184,10 @@ ; ; ASM-P10-LABEL: simpleOneInstructionPromotion32: ; ASM-P10: # %bb.0: -; ASM-P10-NEXT: lwz 3, 8(3) -; ASM-P10-NEXT: ori 3, 3, 1 -; ASM-P10-NEXT: stw 3, 0(4) +; ASM-P10-NEXT: lxv 0, 0(3) +; ASM-P10-NEXT: vspltisw 2, 1 +; ASM-P10-NEXT: xxlor 0, 0, 34 +; ASM-P10-NEXT: stfiwx 0, 0, 4 ; ASM-P10-NEXT: blr ; ; ASM-STRESS-P10-LABEL: simpleOneInstructionPromotion32: @@ -1908,6 +2204,21 @@ ret void } +; Check that we correctly use a splat constant when we cannot +; determine at compile time the index of the extract. +; This requires the STRESS modes, as variable indices are expensive +; to lower. 
+; IR-BOTH-COMMON-LABEL: @simpleOneInstructionPromotionVariableIdx64 +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i64>, ptr %addr1 +; Scalar version: +; IR-NORMAL-COMMON-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i64> [[LOAD]], i32 %idx +; IR-NORMAL-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = or i64 [[EXTRACT]], 1 +; Vector version: +; IR-STRESS-COMMON-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or <2 x i64> [[LOAD]], +; IR-STRESS-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i64> [[OR]], i32 %idx +; +; IR-BOTH-COMMON-NEXT: store i64 [[RES]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @simpleOneInstructionPromotionVariableIdx64(ptr %addr1, ptr %dest, i32 %idx) { ; ASM-P8-LABEL: simpleOneInstructionPromotionVariableIdx64: ; ASM-P8: # %bb.0: @@ -1990,6 +2301,21 @@ ret void } +; Check that we correctly use a splat constant when we cannot +; determine at compile time the index of the extract. +; This requires the STRESS modes, as variable indices are expensive +; to lower. +; IR-BOTH-COMMON-LABEL: @simpleOneInstructionPromotionVariableIdx32 +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x i32>, ptr %addr1 +; Scalar version: +; IR-NORMAL-COMMON-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[LOAD]], i32 %idx +; IR-NORMAL-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1 +; Vector version: +; IR-STRESS-COMMON-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or <4 x i32> [[LOAD]], +; IR-STRESS-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[OR]], i32 %idx +; +; IR-BOTH-COMMON-NEXT: store i32 [[RES]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @simpleOneInstructionPromotionVariableIdx32(ptr %addr1, ptr %dest, i32 %idx) { ; ASM-P8-LABEL: simpleOneInstructionPromotionVariableIdx32: ; ASM-P8: # %bb.0: @@ -2063,6 +2389,19 @@ ret void } +; Check that we use an undef mask for undefined behavior if the fast-math +; flag is set. 
+; IR-BOTH-COMMON-LABEL: @undefConstantFRemCaseWithFastMath +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x float>, ptr %addr1 +; Scalar version: +; IR-NORMAL-COMMON-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x float> [[LOAD]], i32 2 +; IR-NORMAL-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem nnan float [[EXTRACT]], 7.0 +; Vector version: +; IR-STRESS-COMMON-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem nnan <4 x float> [[LOAD]], +; IR-STRESS-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <4 x float> [[DIV]], i32 2 +; +; IR-BOTH-COMMON-NEXT: store float [[RES]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @undefConstantFRemCaseWithFastMath(ptr %addr1, ptr %dest) { ; ASM-P8-LABEL: undefConstantFRemCaseWithFastMath: ; ASM-P8: # %bb.0: @@ -2205,6 +2544,19 @@ ret void } +; Check that we use an undef mask for undefined behavior if the fast-math +; flag is set. +; IR-BOTH-COMMON-LABEL: @undefVectorFRemCaseWithFastMath +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x float>, ptr %addr1 +; Scalar version: +; IR-NORMAL-COMMON-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x float> [[LOAD]], i32 2 +; IR-NORMAL-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem nnan float 7.000000e+00, [[EXTRACT]] +; Vector version: +; IR-STRESS-COMMON-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem nnan <4 x float> , [[LOAD]] +; IR-STRESS-COMMON-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <4 x float> [[DIV]], i32 2 +; +; IR-BOTH-COMMON-NEXT: store float [[RES]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @undefVectorFRemCaseWithFastMath(ptr %addr1, ptr %dest) { ; ASM-P8-LABEL: undefVectorFRemCaseWithFastMath: ; ASM-P8: # %bb.0: @@ -2347,6 +2699,14 @@ ret void } +; IR-BOTH-COMMON-LABEL: @unsupportedChainInDifferentBBs +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x i32>, ptr %addr1 +; IR-BOTH-COMMON-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[LOAD]], i32 2 +; IR-BOTH-COMMON-NEXT: br i1 %bool, label %bb2, label %end +; BB2 +; IR-BOTH-COMMON: 
[[OR:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1 +; IR-BOTH-COMMON-NEXT: store i32 [[OR]], ptr %dest, align 4 +; IR-BOTH-COMMON: ret define void @unsupportedChainInDifferentBBs(ptr %addr1, ptr %dest, i1 %bool) { ; ASM-P8-LABEL: unsupportedChainInDifferentBBs: ; ASM-P8: # %bb.0: # %bb1 @@ -2419,6 +2779,12 @@ ret void } +; IR-BOTH-COMMON-LABEL: @unsupportedInstructionForPromotion +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x i32>, ptr %addr1 +; IR-BOTH-COMMON-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[LOAD]], i32 2 +; IR-BOTH-COMMON-NEXT: [[CMP:%[a-zA-Z_0-9-]+]] = icmp eq i32 [[EXTRACT]], %in2 +; IR-BOTH-COMMON-NEXT: store i1 [[CMP]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret define void @unsupportedInstructionForPromotion(ptr %addr1, i32 %in2, ptr %dest) { ; ASM-P8-LABEL: unsupportedInstructionForPromotion: ; ASM-P8: # %bb.0: @@ -2478,6 +2844,12 @@ ret void } +; IR-BOTH-COMMON-LABEL: @unsupportedMultiUses +; IR-BOTH-COMMON: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x i32>, ptr %addr1 +; IR-BOTH-COMMON-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[LOAD]], i32 2 +; IR-BOTH-COMMON-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1 +; IR-BOTH-COMMON-NEXT: store i32 [[OR]], ptr %dest +; IR-BOTH-COMMON-NEXT: ret i32 [[OR]] define i32 @unsupportedMultiUses(ptr %addr1, ptr %dest) { ; ASM-P8-LABEL: unsupportedMultiUses: ; ASM-P8: # %bb.0: