diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-add.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-add.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-add.ll
@@ -0,0 +1,1808 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr10 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR10LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr10 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR10BE
+
+;;
+;; Vectors of i8
+;;
+define dso_local i8 @v2i8(<2 x i8> %a) local_unnamed_addr #0 { ; sum of the 2 i8 lanes
+; PWR9LE-LABEL: v2i8:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vspltb v3, v2, 14
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vaddubm v2, v2, v3
+; PWR9LE-NEXT:    vextubrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i8:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vspltb v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vaddubm v2, v2, v3
+; PWR9BE-NEXT:    vextublx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i8:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vspltb v3, v2, 14
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vaddubm v2, v2, v3
+; PWR10LE-NEXT:    vextubrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i8:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vspltb v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vaddubm v2, v2, v3
+; PWR10BE-NEXT:    vextublx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %a) ; lane-wise integer add reduction
+  ret i8 %0
+}
+
+define dso_local i8 @v4i8(<4 x i8> %a) local_unnamed_addr #0 { ; sum of the 4 i8 lanes
+; PWR9LE-LABEL: v4i8:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vaddubm v2, v2, v3
+; PWR9LE-NEXT:    vspltb v3, v2, 14
+; PWR9LE-NEXT:    vaddubm v2, v2, v3
+; PWR9LE-NEXT:    vextubrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i8:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vaddubm v2, v2, v3
+; PWR9BE-NEXT:    vspltb v3, v2, 1
+; PWR9BE-NEXT:    vaddubm v2, v2, v3
+; PWR9BE-NEXT:    vextublx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i8:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vaddubm v2, v2, v3
+; PWR10LE-NEXT:    vspltb v3, v2, 14
+; PWR10LE-NEXT:    vaddubm v2, v2, v3
+; PWR10LE-NEXT:    vextubrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i8:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vaddubm v2, v2, v3
+; PWR10BE-NEXT:    vspltb v3, v2, 1
+; PWR10BE-NEXT:    vaddubm v2, v2, v3
+; PWR10BE-NEXT:    vextublx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %a) ; lane-wise integer add reduction
+  ret i8 %0
+}
+
+define dso_local i8 @v8i8(<8 x i8> %a) local_unnamed_addr #0 { ; sum of the 8 i8 lanes
+; PWR9LE-LABEL: v8i8:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vaddubm v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vaddubm v2, v2, v3
+; PWR9LE-NEXT:    vspltb v3, v2, 14
+; PWR9LE-NEXT:    vaddubm v2, v2, v3
+; PWR9LE-NEXT:    vextubrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i8:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vaddubm v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vaddubm v2, v2, v3
+; PWR9BE-NEXT:    vspltb v3, v2, 1
+; PWR9BE-NEXT:    vaddubm v2, v2, v3
+; PWR9BE-NEXT:    vextublx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i8:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vaddubm v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vaddubm v2, v2, v3
+; PWR10LE-NEXT:    vspltb v3, v2, 14
+; PWR10LE-NEXT:    vaddubm v2, v2, v3
+; PWR10LE-NEXT:    vextubrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i8:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vaddubm v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vaddubm v2, v2, v3
+; PWR10BE-NEXT:    vspltb v3, v2, 1
+; PWR10BE-NEXT:    vaddubm v2, v2, v3
+; PWR10BE-NEXT:    vextublx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a) ; lane-wise integer add reduction
+  ret i8 %0
+}
+
+define dso_local signext i8 @v16i8_sign(<16 x i8> %a) local_unnamed_addr #0 { ; sum of 16 i8 lanes, result sign-extended per ABI
+; PWR9LE-LABEL: v16i8_sign:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vaddubm v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vaddubm v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vaddubm v2, v2, v3
+; PWR9LE-NEXT:    vspltb v3, v2, 14
+; PWR9LE-NEXT:    vaddubm v2, v2, v3
+; PWR9LE-NEXT:    vextubrx r3, r3, v2
+; PWR9LE-NEXT:    extsb r3, r3
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i8_sign:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vaddubm v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vaddubm v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vaddubm v2, v2, v3
+; PWR9BE-NEXT:    vspltb v3, v2, 1
+; PWR9BE-NEXT:    vaddubm v2, v2, v3
+; PWR9BE-NEXT:    vextublx r3, r3, v2
+; PWR9BE-NEXT:    extsb r3, r3
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i8_sign:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vaddubm v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vaddubm v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vaddubm v2, v2, v3
+; PWR10LE-NEXT:    vspltb v3, v2, 14
+; PWR10LE-NEXT:    vaddubm v2, v2, v3
+; PWR10LE-NEXT:    vextubrx r3, r3, v2
+; PWR10LE-NEXT:    extsb r3, r3
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i8_sign:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vaddubm v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vaddubm v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vaddubm v2, v2, v3
+; PWR10BE-NEXT:    vspltb v3, v2, 1
+; PWR10BE-NEXT:    vaddubm v2, v2, v3
+; PWR10BE-NEXT:    vextublx r3, r3, v2
+; PWR10BE-NEXT:    extsb r3, r3
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a) ; lane-wise integer add reduction
+  ret i8 %0
+}
+
+define dso_local zeroext i8 @v16i8_zero(<16 x i8> %a) local_unnamed_addr #0 { ; sum of 16 i8 lanes, result zero-extended per ABI
+; PWR9LE-LABEL: v16i8_zero:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vaddubm v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vaddubm v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vaddubm v2, v2, v3
+; PWR9LE-NEXT:    vspltb v3, v2, 14
+; PWR9LE-NEXT:    vaddubm v2, v2, v3
+; PWR9LE-NEXT:    vextubrx r3, r3, v2
+; PWR9LE-NEXT:    clrldi r3, r3, 56
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i8_zero:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vaddubm v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vaddubm v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vaddubm v2, v2, v3
+; PWR9BE-NEXT:    vspltb v3, v2, 1
+; PWR9BE-NEXT:    vaddubm v2, v2, v3
+; PWR9BE-NEXT:    vextublx r3, r3, v2
+; PWR9BE-NEXT:    clrldi r3, r3, 56
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i8_zero:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vaddubm v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vaddubm v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vaddubm v2, v2, v3
+; PWR10LE-NEXT:    vspltb v3, v2, 14
+; PWR10LE-NEXT:    vaddubm v2, v2, v3
+; PWR10LE-NEXT:    vextubrx r3, r3, v2
+; PWR10LE-NEXT:    clrldi r3, r3, 56
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i8_zero:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vaddubm v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vaddubm v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vaddubm v2, v2, v3
+; PWR10BE-NEXT:    vspltb v3, v2, 1
+; PWR10BE-NEXT:    vaddubm v2, v2, v3
+; PWR10BE-NEXT:    vextublx r3, r3, v2
+; PWR10BE-NEXT:    clrldi r3, r3, 56
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a) ; lane-wise integer add reduction
+  ret i8 %0
+}
+
+define dso_local i8 @v32i8(<32 x i8> %a) local_unnamed_addr #0 { ; sum of 32 i8 lanes (input spans two VRs)
+; PWR9LE-LABEL: v32i8:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vaddubm v2, v2, v3
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vaddubm v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vaddubm v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vaddubm v2, v2, v3
+; PWR9LE-NEXT:    vspltb v3, v2, 14
+; PWR9LE-NEXT:    vaddubm v2, v2, v3
+; PWR9LE-NEXT:    vextubrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v32i8:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vaddubm v2, v2, v3
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vaddubm v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vaddubm v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vaddubm v2, v2, v3
+; PWR9BE-NEXT:    vspltb v3, v2, 1
+; PWR9BE-NEXT:    vaddubm v2, v2, v3
+; PWR9BE-NEXT:    vextublx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v32i8:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vaddubm v2, v2, v3
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vaddubm v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vaddubm v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vaddubm v2, v2, v3
+; PWR10LE-NEXT:    vspltb v3, v2, 14
+; PWR10LE-NEXT:    vaddubm v2, v2, v3
+; PWR10LE-NEXT:    vextubrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v32i8:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vaddubm v2, v2, v3
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vaddubm v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vaddubm v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vaddubm v2, v2, v3
+; PWR10BE-NEXT:    vspltb v3, v2, 1
+; PWR10BE-NEXT:    vaddubm v2, v2, v3
+; PWR10BE-NEXT:    vextublx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %a) ; lane-wise integer add reduction
+  ret i8 %0
+}
+
+declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>) #0
+declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>) #0
+declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>) #0
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) #0
+declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>) #0
+
+;;
+;; Vectors of i16
+;;
+define dso_local i16 @v2i16(<2 x i16> %a) local_unnamed_addr #0 { ; sum of the 2 i16 lanes
+; PWR9LE-LABEL: v2i16:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vadduhm v2, v2, v3
+; PWR9LE-NEXT:    vextuhrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i16:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vadduhm v2, v2, v3
+; PWR9BE-NEXT:    vextuhlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i16:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vadduhm v2, v2, v3
+; PWR10LE-NEXT:    vextuhrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i16:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vadduhm v2, v2, v3
+; PWR10BE-NEXT:    vextuhlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a) ; lane-wise integer add reduction
+  ret i16 %0
+}
+
+define dso_local i16 @v4i16(<4 x i16> %a) local_unnamed_addr #0 { ; sum of the 4 i16 lanes
+; PWR9LE-LABEL: v4i16:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vadduhm v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vadduhm v2, v2, v3
+; PWR9LE-NEXT:    vextuhrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i16:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vadduhm v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vadduhm v2, v2, v3
+; PWR9BE-NEXT:    vextuhlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i16:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vadduhm v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vadduhm v2, v2, v3
+; PWR10LE-NEXT:    vextuhrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i16:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vadduhm v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vadduhm v2, v2, v3
+; PWR10BE-NEXT:    vextuhlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a) ; lane-wise integer add reduction
+  ret i16 %0
+}
+
+define dso_local i16 @v8i16(<8 x i16> %a) local_unnamed_addr #0 { ; sum of the 8 i16 lanes
+; PWR9LE-LABEL: v8i16:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vadduhm v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vadduhm v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vadduhm v2, v2, v3
+; PWR9LE-NEXT:    vextuhrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i16:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vadduhm v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vadduhm v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vadduhm v2, v2, v3
+; PWR9BE-NEXT:    vextuhlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i16:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vadduhm v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vadduhm v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vadduhm v2, v2, v3
+; PWR10LE-NEXT:    vextuhrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i16:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vadduhm v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vadduhm v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vadduhm v2, v2, v3
+; PWR10BE-NEXT:    vextuhlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a) ; lane-wise integer add reduction
+  ret i16 %0
+}
+
+define dso_local zeroext i16 @v16i16(<16 x i16> %a) local_unnamed_addr #0 { ; sum of 16 i16 lanes (two VRs), result zero-extended per ABI
+; PWR9LE-LABEL: v16i16:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vadduhm v2, v2, v3
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vadduhm v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vadduhm v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vadduhm v2, v2, v3
+; PWR9LE-NEXT:    vextuhrx r3, r3, v2
+; PWR9LE-NEXT:    clrldi r3, r3, 48
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i16:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vadduhm v2, v2, v3
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vadduhm v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vadduhm v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vadduhm v2, v2, v3
+; PWR9BE-NEXT:    vextuhlx r3, r3, v2
+; PWR9BE-NEXT:    clrldi r3, r3, 48
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i16:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vadduhm v2, v2, v3
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vadduhm v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vadduhm v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vadduhm v2, v2, v3
+; PWR10LE-NEXT:    vextuhrx r3, r3, v2
+; PWR10LE-NEXT:    clrldi r3, r3, 48
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i16:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vadduhm v2, v2, v3
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vadduhm v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vadduhm v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vadduhm v2, v2, v3
+; PWR10BE-NEXT:    vextuhlx r3, r3, v2
+; PWR10BE-NEXT:    clrldi r3, r3, 48
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a) ; lane-wise integer add reduction
+  ret i16 %0
+}
+
+define dso_local signext i16 @v16i8tov16i16_sign(<16 x i8> %a) local_unnamed_addr #0 { ; sext i8 lanes to i16 before summing
+; PWR9LE-LABEL: v16i8tov16i16_sign:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vmrghb v3, v2, v2
+; PWR9LE-NEXT:    vspltish v4, 8
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmrglb v2, v2, v2
+; PWR9LE-NEXT:    vslh v3, v3, v4
+; PWR9LE-NEXT:    vslh v2, v2, v4
+; PWR9LE-NEXT:    vsrah v3, v3, v4
+; PWR9LE-NEXT:    vsrah v2, v2, v4
+; PWR9LE-NEXT:    vadduhm v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vadduhm v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vadduhm v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vadduhm v2, v2, v3
+; PWR9LE-NEXT:    vextuhrx r3, r3, v2
+; PWR9LE-NEXT:    extsh r3, r3
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i8tov16i16_sign:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vmrglb v3, v2, v2
+; PWR9BE-NEXT:    vspltish v4, 8
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmrghb v2, v2, v2
+; PWR9BE-NEXT:    vslh v3, v3, v4
+; PWR9BE-NEXT:    vslh v2, v2, v4
+; PWR9BE-NEXT:    vsrah v3, v3, v4
+; PWR9BE-NEXT:    vsrah v2, v2, v4
+; PWR9BE-NEXT:    vadduhm v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vadduhm v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vadduhm v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vadduhm v2, v2, v3
+; PWR9BE-NEXT:    vextuhlx r3, r3, v2
+; PWR9BE-NEXT:    extsh r3, r3
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i8tov16i16_sign:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vmrghb v3, v2, v2
+; PWR10LE-NEXT:    xxspltiw v4, 524296
+; PWR10LE-NEXT:    vmrglb v2, v2, v2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vslh v3, v3, v4
+; PWR10LE-NEXT:    vslh v2, v2, v4
+; PWR10LE-NEXT:    vsrah v3, v3, v4
+; PWR10LE-NEXT:    vsrah v2, v2, v4
+; PWR10LE-NEXT:    vadduhm v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vadduhm v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vadduhm v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vadduhm v2, v2, v3
+; PWR10LE-NEXT:    vextuhrx r3, r3, v2
+; PWR10LE-NEXT:    extsh r3, r3
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i8tov16i16_sign:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vmrglb v3, v2, v2
+; PWR10BE-NEXT:    xxspltiw v4, 524296
+; PWR10BE-NEXT:    vmrghb v2, v2, v2
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vslh v3, v3, v4
+; PWR10BE-NEXT:    vslh v2, v2, v4
+; PWR10BE-NEXT:    vsrah v3, v3, v4
+; PWR10BE-NEXT:    vsrah v2, v2, v4
+; PWR10BE-NEXT:    vadduhm v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vadduhm v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vadduhm v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vadduhm v2, v2, v3
+; PWR10BE-NEXT:    vextuhlx r3, r3, v2
+; PWR10BE-NEXT:    extsh r3, r3
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = sext <16 x i8> %a to <16 x i16> ; widen each lane with sign extension
+  %1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %0) ; lane-wise integer add reduction
+  ret i16 %1
+}
+
+define dso_local zeroext i16 @v16i8tov16i16_zero(<16 x i8> %a) local_unnamed_addr #0 { ; zext i8 lanes to i16 before summing
+; PWR9LE-LABEL: v16i8tov16i16_zero:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxlxor v3, v3, v3
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmrghb v4, v3, v2
+; PWR9LE-NEXT:    vmrglb v2, v3, v2
+; PWR9LE-NEXT:    vadduhm v2, v2, v4
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vadduhm v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vadduhm v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vadduhm v2, v2, v3
+; PWR9LE-NEXT:    vextuhrx r3, r3, v2
+; PWR9LE-NEXT:    clrldi r3, r3, 48
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i8tov16i16_zero:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxlxor v3, v3, v3
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmrglb v4, v3, v2
+; PWR9BE-NEXT:    vmrghb v2, v3, v2
+; PWR9BE-NEXT:    vadduhm v2, v2, v4
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vadduhm v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vadduhm v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vadduhm v2, v2, v3
+; PWR9BE-NEXT:    vextuhlx r3, r3, v2
+; PWR9BE-NEXT:    clrldi r3, r3, 48
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i8tov16i16_zero:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxlxor v3, v3, v3
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vmrghb v4, v3, v2
+; PWR10LE-NEXT:    vmrglb v2, v3, v2
+; PWR10LE-NEXT:    vadduhm v2, v2, v4
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vadduhm v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vadduhm v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vadduhm v2, v2, v3
+; PWR10LE-NEXT:    vextuhrx r3, r3, v2
+; PWR10LE-NEXT:    clrldi r3, r3, 48
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i8tov16i16_zero:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxlxor v3, v3, v3
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vmrglb v4, v3, v2
+; PWR10BE-NEXT:    vmrghb v2, v3, v2
+; PWR10BE-NEXT:    vadduhm v2, v2, v4
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vadduhm v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vadduhm v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vadduhm v2, v2, v3
+; PWR10BE-NEXT:    vextuhlx r3, r3, v2
+; PWR10BE-NEXT:    clrldi r3, r3, 48
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = zext <16 x i8> %a to <16 x i16> ; widen each lane with zero extension
+  %1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %0) ; lane-wise integer add reduction
+  ret i16 %1
+}
+
+declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>) #0
+declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>) #0
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) #0
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) #0
+
+;;
+;; Vectors of i32
+;;
+define dso_local zeroext i32 @v2i32(<2 x i32> %a) local_unnamed_addr #0 { ; sum of the 2 i32 lanes
+; PWR9LE-LABEL: v2i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vadduwm v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vadduwm v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vadduwm v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vadduwm v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a) ; lane-wise integer add reduction
+  ret i32 %0
+}
+
+define dso_local zeroext i32 @v4i32(<4 x i32> %a) local_unnamed_addr #0 { ; sum of the 4 i32 lanes
+; PWR9LE-LABEL: v4i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vadduwm v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vadduwm v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vadduwm v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vadduwm v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vadduwm v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vadduwm v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vadduwm v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vadduwm v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a) ; lane-wise integer add reduction
+  ret i32 %0
+}
+
+define dso_local zeroext i32 @v8i32(<8 x i32> %a) local_unnamed_addr #0 { ; sum of 8 i32 lanes (two VRs)
+; PWR9LE-LABEL: v8i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vadduwm v2, v2, v3
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vadduwm v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vadduwm v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vadduwm v2, v2, v3
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vadduwm v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vadduwm v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vadduwm v2, v2, v3
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vadduwm v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vadduwm v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vadduwm v2, v2, v3
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vadduwm v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vadduwm v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a) ; lane-wise integer add reduction
+  ret i32 %0
+}
+
+define dso_local zeroext i32 @v16i32(<16 x i32> %a) local_unnamed_addr #0 { ; sum of 16 i32 lanes (four VRs)
+; PWR9LE-LABEL: v16i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vadduwm v3, v3, v5
+; PWR9LE-NEXT:    vadduwm v2, v2, v4
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vadduwm v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vadduwm v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vadduwm v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vadduwm v3, v3, v5
+; PWR9BE-NEXT:    vadduwm v2, v2, v4
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vadduwm v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vadduwm v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vadduwm v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vadduwm v3, v3, v5
+; PWR10LE-NEXT:    vadduwm v2, v2, v4
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vadduwm v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vadduwm v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vadduwm v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vadduwm v3, v3, v5
+; PWR10BE-NEXT:    vadduwm v2, v2, v4
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vadduwm v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vadduwm v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vadduwm v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a) ; lane-wise integer add reduction
+  ret i32 %0
+}
+
+define dso_local zeroext i32 @v32i32(<32 x i32> %a) local_unnamed_addr #0 { ; sum of 32 i32 lanes (eight VRs)
+; PWR9LE-LABEL: v32i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vadduwm v4, v4, v8
+; PWR9LE-NEXT:    vadduwm v2, v2, v6
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vadduwm v5, v5, v9
+; PWR9LE-NEXT:    vadduwm v3, v3, v7
+; PWR9LE-NEXT:    vadduwm v3, v3, v5
+; PWR9LE-NEXT:    vadduwm v2, v2, v4
+; PWR9LE-NEXT:    vadduwm v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vadduwm v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vadduwm v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v32i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vadduwm v4, v4, v8
+; PWR9BE-NEXT:    vadduwm v2, v2, v6
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vadduwm v5, v5, v9
+; PWR9BE-NEXT:    vadduwm v3, v3, v7
+; PWR9BE-NEXT:    vadduwm v3, v3, v5
+; PWR9BE-NEXT:    vadduwm v2, v2, v4
+; PWR9BE-NEXT:    vadduwm v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vadduwm v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vadduwm v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v32i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vadduwm v4, v4, v8
+; PWR10LE-NEXT:    vadduwm v2, v2, v6
+; PWR10LE-NEXT:    vadduwm v5, v5, v9
+; PWR10LE-NEXT:    vadduwm v3, v3, v7
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vadduwm v3, v3, v5
+; PWR10LE-NEXT:    vadduwm v2, v2, v4
+; PWR10LE-NEXT:    vadduwm v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vadduwm v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vadduwm v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v32i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vadduwm v4, v4, v8
+; PWR10BE-NEXT:    vadduwm v2, v2, v6
+; PWR10BE-NEXT:    vadduwm v5, v5, v9
+; PWR10BE-NEXT:    vadduwm v3, v3, v7
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vadduwm v3, v3, v5
+; PWR10BE-NEXT:    vadduwm v2, v2, v4
+; PWR10BE-NEXT:    vadduwm v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vadduwm v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vadduwm v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %a) ; lane-wise integer add reduction
+  ret i32 %0
+}
+
+define dso_local signext i32 @v16i8tov16i32_sign(<16 x i8> %a) local_unnamed_addr #0 { ; sign-extend i8 lanes to i32, then reduce-add to a signext i32
+; PWR9LE-LABEL: v16i8tov16i32_sign:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    addis r3, r2, .LCPI17_0@toc@ha
+; PWR9LE-NEXT:    addi r3, r3, .LCPI17_0@toc@l
+; PWR9LE-NEXT:    lxv v3, 0(r3)
+; PWR9LE-NEXT:    addis r3, r2, .LCPI17_1@toc@ha
+; PWR9LE-NEXT:    addi r3, r3, .LCPI17_1@toc@l
+; PWR9LE-NEXT:    lxv v4, 0(r3)
+; PWR9LE-NEXT:    addis r3, r2, .LCPI17_2@toc@ha
+; PWR9LE-NEXT:    vperm v3, v2, v2, v3
+; PWR9LE-NEXT:    addi r3, r3, .LCPI17_2@toc@l
+; PWR9LE-NEXT:    lxv v5, 0(r3)
+; PWR9LE-NEXT:    addis r3, r2, .LCPI17_3@toc@ha
+; PWR9LE-NEXT:    vextsb2w v3, v3
+; PWR9LE-NEXT:    vperm v4, v2, v2, v4
+; PWR9LE-NEXT:    addi r3, r3, .LCPI17_3@toc@l
+; PWR9LE-NEXT:    lxv v0, 0(r3)
+; PWR9LE-NEXT:    vextsb2w v4, v4
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vperm v5, v2, v2, v5
+; PWR9LE-NEXT:    vadduwm v3, v4, v3
+; PWR9LE-NEXT:    vextsb2w v5, v5
+; PWR9LE-NEXT:    vperm v2, v2, v2, v0
+; PWR9LE-NEXT:    vextsb2w v2, v2
+; PWR9LE-NEXT:    vadduwm v2, v2, v5
+; PWR9LE-NEXT:    vadduwm v2, v3, v2
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vadduwm v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vadduwm v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    extsw r3, r3
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i8tov16i32_sign:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    addis r3, r2, .LCPI17_0@toc@ha
+; PWR9BE-NEXT:    addi r3, r3, .LCPI17_0@toc@l
+; PWR9BE-NEXT:    lxv v3, 0(r3)
+; PWR9BE-NEXT:    addis r3, r2, .LCPI17_1@toc@ha
+; PWR9BE-NEXT:    addi r3, r3, .LCPI17_1@toc@l
+; PWR9BE-NEXT:    lxv v4, 0(r3)
+; PWR9BE-NEXT:    addis r3, r2, .LCPI17_2@toc@ha
+; PWR9BE-NEXT:    vperm v3, v2, v2, v3
+; PWR9BE-NEXT:    addi r3, r3, .LCPI17_2@toc@l
+; PWR9BE-NEXT:    lxv v5, 0(r3)
+; PWR9BE-NEXT:    addis r3, r2, .LCPI17_3@toc@ha
+; PWR9BE-NEXT:    vextsb2w v3, v3
+; PWR9BE-NEXT:    vperm v4, v2, v2, v4
+; PWR9BE-NEXT:    addi r3, r3, .LCPI17_3@toc@l
+; PWR9BE-NEXT:    lxv v0, 0(r3)
+; PWR9BE-NEXT:    vextsb2w v4, v4
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vperm v5, v2, v2, v5
+; PWR9BE-NEXT:    vadduwm v3, v4, v3
+; PWR9BE-NEXT:    vextsb2w v5, v5
+; PWR9BE-NEXT:    vperm v2, v2, v2, v0
+; PWR9BE-NEXT:    vextsb2w v2, v2
+; PWR9BE-NEXT:    vadduwm v2, v2, v5
+; PWR9BE-NEXT:    vadduwm v2, v3, v2
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vadduwm v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vadduwm v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    extsw r3, r3
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i8tov16i32_sign:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    plxv v3, .LCPI17_0@PCREL(0), 1
+; PWR10LE-NEXT:    plxv v4, .LCPI17_1@PCREL(0), 1
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vperm v3, v2, v2, v3
+; PWR10LE-NEXT:    plxv v5, .LCPI17_2@PCREL(0), 1
+; PWR10LE-NEXT:    plxv v0, .LCPI17_3@PCREL(0), 1
+; PWR10LE-NEXT:    vperm v4, v2, v2, v4
+; PWR10LE-NEXT:    vperm v5, v2, v2, v5
+; PWR10LE-NEXT:    vperm v2, v2, v2, v0
+; PWR10LE-NEXT:    vextsb2w v3, v3
+; PWR10LE-NEXT:    vextsb2w v4, v4
+; PWR10LE-NEXT:    vextsb2w v5, v5
+; PWR10LE-NEXT:    vextsb2w v2, v2
+; PWR10LE-NEXT:    vadduwm v2, v2, v5
+; PWR10LE-NEXT:    vadduwm v3, v4, v3
+; PWR10LE-NEXT:    vadduwm v2, v3, v2
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vadduwm v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vadduwm v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    extsw r3, r3
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i8tov16i32_sign:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    addis r3, r2, .LCPI17_0@toc@ha
+; PWR10BE-NEXT:    addi r3, r3, .LCPI17_0@toc@l
+; PWR10BE-NEXT:    lxv v3, 0(r3)
+; PWR10BE-NEXT:    addis r3, r2, .LCPI17_1@toc@ha
+; PWR10BE-NEXT:    addi r3, r3, .LCPI17_1@toc@l
+; PWR10BE-NEXT:    lxv v4, 0(r3)
+; PWR10BE-NEXT:    addis r3, r2, .LCPI17_2@toc@ha
+; PWR10BE-NEXT:    addi r3, r3, .LCPI17_2@toc@l
+; PWR10BE-NEXT:    vperm v3, v2, v2, v3
+; PWR10BE-NEXT:    lxv v5, 0(r3)
+; PWR10BE-NEXT:    addis r3, r2, .LCPI17_3@toc@ha
+; PWR10BE-NEXT:    vextsb2w v3, v3
+; PWR10BE-NEXT:    addi r3, r3, .LCPI17_3@toc@l
+; PWR10BE-NEXT:    vperm v4, v2, v2, v4
+; PWR10BE-NEXT:    lxv v0, 0(r3)
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vextsb2w v4, v4
+; PWR10BE-NEXT:    vperm v5, v2, v2, v5
+; PWR10BE-NEXT:    vadduwm v3, v4, v3
+; PWR10BE-NEXT:    vextsb2w v5, v5
+; PWR10BE-NEXT:    vperm v2, v2, v2, v0
+; PWR10BE-NEXT:    vextsb2w v2, v2
+; PWR10BE-NEXT:    vadduwm v2, v2, v5
+; PWR10BE-NEXT:    vadduwm v2, v3, v2
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vadduwm v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vadduwm v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    extsw r3, r3
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = sext <16 x i8> %a to <16 x i32> ; widen each byte lane with sign extension (vperm + vextsb2w in the expected asm)
+  %1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %0) ; sum all 16 widened lanes
+  ret i32 %1
+}
+
+define dso_local zeroext i32 @v16i8tov16i32_zero(<16 x i8> %a) local_unnamed_addr #0 { ; zero-extend i8 lanes to i32, then reduce-add to a zeroext i32
+; PWR9LE-LABEL: v16i8tov16i32_zero:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    addis r3, r2, .LCPI18_0@toc@ha
+; PWR9LE-NEXT:    xxlxor v4, v4, v4
+; PWR9LE-NEXT:    addi r3, r3, .LCPI18_0@toc@l
+; PWR9LE-NEXT:    lxv v3, 0(r3)
+; PWR9LE-NEXT:    addis r3, r2, .LCPI18_1@toc@ha
+; PWR9LE-NEXT:    addi r3, r3, .LCPI18_1@toc@l
+; PWR9LE-NEXT:    lxv v5, 0(r3)
+; PWR9LE-NEXT:    addis r3, r2, .LCPI18_2@toc@ha
+; PWR9LE-NEXT:    vperm v3, v4, v2, v3
+; PWR9LE-NEXT:    addi r3, r3, .LCPI18_2@toc@l
+; PWR9LE-NEXT:    lxv v0, 0(r3)
+; PWR9LE-NEXT:    addis r3, r2, .LCPI18_3@toc@ha
+; PWR9LE-NEXT:    vperm v5, v4, v2, v5
+; PWR9LE-NEXT:    addi r3, r3, .LCPI18_3@toc@l
+; PWR9LE-NEXT:    lxv v1, 0(r3)
+; PWR9LE-NEXT:    vadduwm v3, v5, v3
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vperm v0, v4, v2, v0
+; PWR9LE-NEXT:    vperm v2, v4, v2, v1
+; PWR9LE-NEXT:    vadduwm v2, v2, v0
+; PWR9LE-NEXT:    vadduwm v2, v3, v2
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vadduwm v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vadduwm v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i8tov16i32_zero:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    addis r3, r2, .LCPI18_0@toc@ha
+; PWR9BE-NEXT:    xxlxor v4, v4, v4
+; PWR9BE-NEXT:    addi r3, r3, .LCPI18_0@toc@l
+; PWR9BE-NEXT:    lxv v3, 0(r3)
+; PWR9BE-NEXT:    addis r3, r2, .LCPI18_1@toc@ha
+; PWR9BE-NEXT:    addi r3, r3, .LCPI18_1@toc@l
+; PWR9BE-NEXT:    lxv v5, 0(r3)
+; PWR9BE-NEXT:    addis r3, r2, .LCPI18_2@toc@ha
+; PWR9BE-NEXT:    vperm v3, v4, v2, v3
+; PWR9BE-NEXT:    addi r3, r3, .LCPI18_2@toc@l
+; PWR9BE-NEXT:    lxv v0, 0(r3)
+; PWR9BE-NEXT:    addis r3, r2, .LCPI18_3@toc@ha
+; PWR9BE-NEXT:    vperm v5, v4, v2, v5
+; PWR9BE-NEXT:    addi r3, r3, .LCPI18_3@toc@l
+; PWR9BE-NEXT:    lxv v1, 0(r3)
+; PWR9BE-NEXT:    vadduwm v3, v5, v3
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vperm v0, v4, v2, v0
+; PWR9BE-NEXT:    vperm v2, v4, v2, v1
+; PWR9BE-NEXT:    vadduwm v2, v2, v0
+; PWR9BE-NEXT:    vadduwm v2, v3, v2
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vadduwm v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vadduwm v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i8tov16i32_zero:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    plxv v3, .LCPI18_0@PCREL(0), 1
+; PWR10LE-NEXT:    plxv v5, .LCPI18_1@PCREL(0), 1
+; PWR10LE-NEXT:    xxlxor v4, v4, v4
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vperm v3, v4, v2, v3
+; PWR10LE-NEXT:    plxv v0, .LCPI18_2@PCREL(0), 1
+; PWR10LE-NEXT:    plxv v1, .LCPI18_3@PCREL(0), 1
+; PWR10LE-NEXT:    vperm v5, v4, v2, v5
+; PWR10LE-NEXT:    vperm v0, v4, v2, v0
+; PWR10LE-NEXT:    vperm v2, v4, v2, v1
+; PWR10LE-NEXT:    vadduwm v2, v2, v0
+; PWR10LE-NEXT:    vadduwm v3, v5, v3
+; PWR10LE-NEXT:    vadduwm v2, v3, v2
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vadduwm v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vadduwm v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i8tov16i32_zero:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    addis r3, r2, .LCPI18_0@toc@ha
+; PWR10BE-NEXT:    xxlxor v4, v4, v4
+; PWR10BE-NEXT:    addi r3, r3, .LCPI18_0@toc@l
+; PWR10BE-NEXT:    lxv v3, 0(r3)
+; PWR10BE-NEXT:    addis r3, r2, .LCPI18_1@toc@ha
+; PWR10BE-NEXT:    addi r3, r3, .LCPI18_1@toc@l
+; PWR10BE-NEXT:    lxv v5, 0(r3)
+; PWR10BE-NEXT:    addis r3, r2, .LCPI18_2@toc@ha
+; PWR10BE-NEXT:    addi r3, r3, .LCPI18_2@toc@l
+; PWR10BE-NEXT:    vperm v3, v4, v2, v3
+; PWR10BE-NEXT:    lxv v0, 0(r3)
+; PWR10BE-NEXT:    addis r3, r2, .LCPI18_3@toc@ha
+; PWR10BE-NEXT:    addi r3, r3, .LCPI18_3@toc@l
+; PWR10BE-NEXT:    vperm v5, v4, v2, v5
+; PWR10BE-NEXT:    lxv v1, 0(r3)
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vadduwm v3, v5, v3
+; PWR10BE-NEXT:    vperm v0, v4, v2, v0
+; PWR10BE-NEXT:    vperm v2, v4, v2, v1
+; PWR10BE-NEXT:    vadduwm v2, v2, v0
+; PWR10BE-NEXT:    vadduwm v2, v3, v2
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vadduwm v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vadduwm v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = zext <16 x i8> %a to <16 x i32> ; widen each byte lane with zeros (vperm against the xxlxor-zeroed v4 in the expected asm)
+  %1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %0) ; sum all 16 widened lanes
+  ret i32 %1
+}
+
+declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) #0
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #0
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) #0
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) #0
+declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>) #0
+
+;;
+;; Vectors of i64
+;;
+define dso_local i64 @v2i64(<2 x i64> %a) local_unnamed_addr #0 { ; reduce-add of <2 x i64>: swap halves, add, move result doubleword to r3
+; PWR9LE-LABEL: v2i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vaddudm v2, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vaddudm v2, v2, v3
+; PWR9BE-NEXT:    mfvsrd r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vaddudm v2, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vaddudm v2, v2, v3
+; PWR10BE-NEXT:    mfvsrd r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a) ; sum of both i64 lanes (LE reads low dword via mfvsrld, BE high via mfvsrd)
+  ret i64 %0
+}
+
+define dso_local i64 @v4i64(<4 x i64> %a) local_unnamed_addr #0 { ; reduce-add of <4 x i64>: add the two input registers, then swap+add the halves
+; PWR9LE-LABEL: v4i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vaddudm v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vaddudm v2, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vaddudm v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vaddudm v2, v2, v3
+; PWR9BE-NEXT:    mfvsrd r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vaddudm v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vaddudm v2, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vaddudm v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vaddudm v2, v2, v3
+; PWR10BE-NEXT:    mfvsrd r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a) ; sum of all four i64 lanes
+  ret i64 %0
+}
+
+define dso_local i64 @v8i64(<8 x i64> %a) local_unnamed_addr #0 { ; reduce-add of <8 x i64>: tree of vaddudm over four input registers, then swap+add
+; PWR9LE-LABEL: v8i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vaddudm v3, v3, v5
+; PWR9LE-NEXT:    vaddudm v2, v2, v4
+; PWR9LE-NEXT:    vaddudm v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vaddudm v2, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vaddudm v3, v3, v5
+; PWR9BE-NEXT:    vaddudm v2, v2, v4
+; PWR9BE-NEXT:    vaddudm v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vaddudm v2, v2, v3
+; PWR9BE-NEXT:    mfvsrd r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vaddudm v3, v3, v5
+; PWR10LE-NEXT:    vaddudm v2, v2, v4
+; PWR10LE-NEXT:    vaddudm v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vaddudm v2, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vaddudm v3, v3, v5
+; PWR10BE-NEXT:    vaddudm v2, v2, v4
+; PWR10BE-NEXT:    vaddudm v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vaddudm v2, v2, v3
+; PWR10BE-NEXT:    mfvsrd r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a) ; sum of all eight i64 lanes
+  ret i64 %0
+}
+
+define dso_local i64 @v16i64(<16 x i64> %a) local_unnamed_addr #0 { ; reduce-add of <16 x i64>: tree of vaddudm over eight input registers, then swap+add
+; PWR9LE-LABEL: v16i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vaddudm v4, v4, v8
+; PWR9LE-NEXT:    vaddudm v2, v2, v6
+; PWR9LE-NEXT:    vaddudm v5, v5, v9
+; PWR9LE-NEXT:    vaddudm v3, v3, v7
+; PWR9LE-NEXT:    vaddudm v3, v3, v5
+; PWR9LE-NEXT:    vaddudm v2, v2, v4
+; PWR9LE-NEXT:    vaddudm v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vaddudm v2, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vaddudm v4, v4, v8
+; PWR9BE-NEXT:    vaddudm v2, v2, v6
+; PWR9BE-NEXT:    vaddudm v5, v5, v9
+; PWR9BE-NEXT:    vaddudm v3, v3, v7
+; PWR9BE-NEXT:    vaddudm v3, v3, v5
+; PWR9BE-NEXT:    vaddudm v2, v2, v4
+; PWR9BE-NEXT:    vaddudm v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vaddudm v2, v2, v3
+; PWR9BE-NEXT:    mfvsrd r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vaddudm v4, v4, v8
+; PWR10LE-NEXT:    vaddudm v5, v5, v9
+; PWR10LE-NEXT:    vaddudm v3, v3, v7
+; PWR10LE-NEXT:    vaddudm v3, v3, v5
+; PWR10LE-NEXT:    vaddudm v2, v2, v6
+; PWR10LE-NEXT:    vaddudm v2, v2, v4
+; PWR10LE-NEXT:    vaddudm v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vaddudm v2, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vaddudm v4, v4, v8
+; PWR10BE-NEXT:    vaddudm v5, v5, v9
+; PWR10BE-NEXT:    vaddudm v3, v3, v7
+; PWR10BE-NEXT:    vaddudm v3, v3, v5
+; PWR10BE-NEXT:    vaddudm v2, v2, v6
+; PWR10BE-NEXT:    vaddudm v2, v2, v4
+; PWR10BE-NEXT:    vaddudm v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vaddudm v2, v2, v3
+; PWR10BE-NEXT:    mfvsrd r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a) ; sum of all sixteen i64 lanes
+  ret i64 %0
+}
+
+define dso_local i64 @v16i8tov16i64_sign(<16 x i8> %a) local_unnamed_addr #0 { ; sign-extend i8 lanes to i64 (8 vperm/vextsb2d pairs), then reduce-add
+; PWR9LE-LABEL: v16i8tov16i64_sign:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    addis r3, r2, .LCPI23_0@toc@ha
+; PWR9LE-NEXT:    addi r3, r3, .LCPI23_0@toc@l
+; PWR9LE-NEXT:    lxv v3, 0(r3)
+; PWR9LE-NEXT:    addis r3, r2, .LCPI23_1@toc@ha
+; PWR9LE-NEXT:    addi r3, r3, .LCPI23_1@toc@l
+; PWR9LE-NEXT:    lxv v4, 0(r3)
+; PWR9LE-NEXT:    addis r3, r2, .LCPI23_2@toc@ha
+; PWR9LE-NEXT:    vperm v3, v2, v2, v3
+; PWR9LE-NEXT:    addi r3, r3, .LCPI23_2@toc@l
+; PWR9LE-NEXT:    lxv v5, 0(r3)
+; PWR9LE-NEXT:    addis r3, r2, .LCPI23_3@toc@ha
+; PWR9LE-NEXT:    vextsb2d v3, v3
+; PWR9LE-NEXT:    vperm v4, v2, v2, v4
+; PWR9LE-NEXT:    addi r3, r3, .LCPI23_3@toc@l
+; PWR9LE-NEXT:    lxv v0, 0(r3)
+; PWR9LE-NEXT:    addis r3, r2, .LCPI23_4@toc@ha
+; PWR9LE-NEXT:    vextsb2d v4, v4
+; PWR9LE-NEXT:    vperm v5, v2, v2, v5
+; PWR9LE-NEXT:    addi r3, r3, .LCPI23_4@toc@l
+; PWR9LE-NEXT:    vaddudm v3, v4, v3
+; PWR9LE-NEXT:    lxv v1, 0(r3)
+; PWR9LE-NEXT:    addis r3, r2, .LCPI23_5@toc@ha
+; PWR9LE-NEXT:    vextsb2d v5, v5
+; PWR9LE-NEXT:    vperm v0, v2, v2, v0
+; PWR9LE-NEXT:    addi r3, r3, .LCPI23_5@toc@l
+; PWR9LE-NEXT:    lxv v6, 0(r3)
+; PWR9LE-NEXT:    addis r3, r2, .LCPI23_6@toc@ha
+; PWR9LE-NEXT:    vperm v1, v2, v2, v1
+; PWR9LE-NEXT:    vextsb2d v0, v0
+; PWR9LE-NEXT:    addi r3, r3, .LCPI23_6@toc@l
+; PWR9LE-NEXT:    vaddudm v5, v0, v5
+; PWR9LE-NEXT:    lxv v7, 0(r3)
+; PWR9LE-NEXT:    addis r3, r2, .LCPI23_7@toc@ha
+; PWR9LE-NEXT:    vperm v6, v2, v2, v6
+; PWR9LE-NEXT:    vextsb2d v1, v1
+; PWR9LE-NEXT:    vaddudm v3, v3, v5
+; PWR9LE-NEXT:    addi r3, r3, .LCPI23_7@toc@l
+; PWR9LE-NEXT:    lxv v8, 0(r3)
+; PWR9LE-NEXT:    vextsb2d v6, v6
+; PWR9LE-NEXT:    vperm v7, v2, v2, v7
+; PWR9LE-NEXT:    vaddudm v1, v6, v1
+; PWR9LE-NEXT:    vextsb2d v7, v7
+; PWR9LE-NEXT:    vperm v2, v2, v2, v8
+; PWR9LE-NEXT:    vextsb2d v2, v2
+; PWR9LE-NEXT:    vaddudm v2, v2, v7
+; PWR9LE-NEXT:    vaddudm v2, v1, v2
+; PWR9LE-NEXT:    vaddudm v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vaddudm v2, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i8tov16i64_sign:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    addis r3, r2, .LCPI23_0@toc@ha
+; PWR9BE-NEXT:    addi r3, r3, .LCPI23_0@toc@l
+; PWR9BE-NEXT:    lxv v3, 0(r3)
+; PWR9BE-NEXT:    addis r3, r2, .LCPI23_1@toc@ha
+; PWR9BE-NEXT:    addi r3, r3, .LCPI23_1@toc@l
+; PWR9BE-NEXT:    lxv v4, 0(r3)
+; PWR9BE-NEXT:    addis r3, r2, .LCPI23_2@toc@ha
+; PWR9BE-NEXT:    vperm v3, v2, v2, v3
+; PWR9BE-NEXT:    addi r3, r3, .LCPI23_2@toc@l
+; PWR9BE-NEXT:    lxv v5, 0(r3)
+; PWR9BE-NEXT:    addis r3, r2, .LCPI23_3@toc@ha
+; PWR9BE-NEXT:    vextsb2d v3, v3
+; PWR9BE-NEXT:    vperm v4, v2, v2, v4
+; PWR9BE-NEXT:    addi r3, r3, .LCPI23_3@toc@l
+; PWR9BE-NEXT:    lxv v0, 0(r3)
+; PWR9BE-NEXT:    addis r3, r2, .LCPI23_4@toc@ha
+; PWR9BE-NEXT:    vextsb2d v4, v4
+; PWR9BE-NEXT:    vperm v5, v2, v2, v5
+; PWR9BE-NEXT:    addi r3, r3, .LCPI23_4@toc@l
+; PWR9BE-NEXT:    vaddudm v3, v4, v3
+; PWR9BE-NEXT:    lxv v1, 0(r3)
+; PWR9BE-NEXT:    addis r3, r2, .LCPI23_5@toc@ha
+; PWR9BE-NEXT:    vextsb2d v5, v5
+; PWR9BE-NEXT:    vperm v0, v2, v2, v0
+; PWR9BE-NEXT:    addi r3, r3, .LCPI23_5@toc@l
+; PWR9BE-NEXT:    lxv v6, 0(r3)
+; PWR9BE-NEXT:    addis r3, r2, .LCPI23_6@toc@ha
+; PWR9BE-NEXT:    vperm v1, v2, v2, v1
+; PWR9BE-NEXT:    vextsb2d v0, v0
+; PWR9BE-NEXT:    addi r3, r3, .LCPI23_6@toc@l
+; PWR9BE-NEXT:    vaddudm v5, v0, v5
+; PWR9BE-NEXT:    lxv v7, 0(r3)
+; PWR9BE-NEXT:    addis r3, r2, .LCPI23_7@toc@ha
+; PWR9BE-NEXT:    vperm v6, v2, v2, v6
+; PWR9BE-NEXT:    vextsb2d v1, v1
+; PWR9BE-NEXT:    vaddudm v3, v3, v5
+; PWR9BE-NEXT:    addi r3, r3, .LCPI23_7@toc@l
+; PWR9BE-NEXT:    lxv v8, 0(r3)
+; PWR9BE-NEXT:    vextsb2d v6, v6
+; PWR9BE-NEXT:    vperm v7, v2, v2, v7
+; PWR9BE-NEXT:    vaddudm v1, v6, v1
+; PWR9BE-NEXT:    vextsb2d v7, v7
+; PWR9BE-NEXT:    vperm v2, v2, v2, v8
+; PWR9BE-NEXT:    vextsb2d v2, v2
+; PWR9BE-NEXT:    vaddudm v2, v2, v7
+; PWR9BE-NEXT:    vaddudm v2, v1, v2
+; PWR9BE-NEXT:    vaddudm v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vaddudm v2, v2, v3
+; PWR9BE-NEXT:    mfvsrd r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i8tov16i64_sign:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    plxv v3, .LCPI23_0@PCREL(0), 1
+; PWR10LE-NEXT:    plxv v4, .LCPI23_1@PCREL(0), 1
+; PWR10LE-NEXT:    vperm v3, v2, v2, v3
+; PWR10LE-NEXT:    plxv v5, .LCPI23_2@PCREL(0), 1
+; PWR10LE-NEXT:    plxv v0, .LCPI23_3@PCREL(0), 1
+; PWR10LE-NEXT:    plxv v1, .LCPI23_4@PCREL(0), 1
+; PWR10LE-NEXT:    plxv v6, .LCPI23_5@PCREL(0), 1
+; PWR10LE-NEXT:    plxv v7, .LCPI23_6@PCREL(0), 1
+; PWR10LE-NEXT:    plxv v8, .LCPI23_7@PCREL(0), 1
+; PWR10LE-NEXT:    vperm v4, v2, v2, v4
+; PWR10LE-NEXT:    vperm v5, v2, v2, v5
+; PWR10LE-NEXT:    vperm v0, v2, v2, v0
+; PWR10LE-NEXT:    vperm v1, v2, v2, v1
+; PWR10LE-NEXT:    vperm v6, v2, v2, v6
+; PWR10LE-NEXT:    vperm v7, v2, v2, v7
+; PWR10LE-NEXT:    vperm v2, v2, v2, v8
+; PWR10LE-NEXT:    vextsb2d v5, v5
+; PWR10LE-NEXT:    vextsb2d v0, v0
+; PWR10LE-NEXT:    vextsb2d v7, v7
+; PWR10LE-NEXT:    vextsb2d v2, v2
+; PWR10LE-NEXT:    vextsb2d v3, v3
+; PWR10LE-NEXT:    vextsb2d v4, v4
+; PWR10LE-NEXT:    vextsb2d v1, v1
+; PWR10LE-NEXT:    vextsb2d v6, v6
+; PWR10LE-NEXT:    vaddudm v2, v2, v7
+; PWR10LE-NEXT:    vaddudm v5, v0, v5
+; PWR10LE-NEXT:    vaddudm v3, v4, v3
+; PWR10LE-NEXT:    vaddudm v3, v3, v5
+; PWR10LE-NEXT:    vaddudm v4, v6, v1
+; PWR10LE-NEXT:    vaddudm v2, v4, v2
+; PWR10LE-NEXT:    vaddudm v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vaddudm v2, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i8tov16i64_sign:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    addis r3, r2, .LCPI23_0@toc@ha
+; PWR10BE-NEXT:    addi r3, r3, .LCPI23_0@toc@l
+; PWR10BE-NEXT:    lxv v3, 0(r3)
+; PWR10BE-NEXT:    addis r3, r2, .LCPI23_1@toc@ha
+; PWR10BE-NEXT:    addi r3, r3, .LCPI23_1@toc@l
+; PWR10BE-NEXT:    lxv v4, 0(r3)
+; PWR10BE-NEXT:    addis r3, r2, .LCPI23_2@toc@ha
+; PWR10BE-NEXT:    addi r3, r3, .LCPI23_2@toc@l
+; PWR10BE-NEXT:    vperm v3, v2, v2, v3
+; PWR10BE-NEXT:    lxv v5, 0(r3)
+; PWR10BE-NEXT:    addis r3, r2, .LCPI23_3@toc@ha
+; PWR10BE-NEXT:    vextsb2d v3, v3
+; PWR10BE-NEXT:    addi r3, r3, .LCPI23_3@toc@l
+; PWR10BE-NEXT:    vperm v4, v2, v2, v4
+; PWR10BE-NEXT:    lxv v0, 0(r3)
+; PWR10BE-NEXT:    addis r3, r2, .LCPI23_4@toc@ha
+; PWR10BE-NEXT:    vextsb2d v4, v4
+; PWR10BE-NEXT:    addi r3, r3, .LCPI23_4@toc@l
+; PWR10BE-NEXT:    vperm v5, v2, v2, v5
+; PWR10BE-NEXT:    lxv v1, 0(r3)
+; PWR10BE-NEXT:    addis r3, r2, .LCPI23_5@toc@ha
+; PWR10BE-NEXT:    vextsb2d v5, v5
+; PWR10BE-NEXT:    addi r3, r3, .LCPI23_5@toc@l
+; PWR10BE-NEXT:    vperm v0, v2, v2, v0
+; PWR10BE-NEXT:    lxv v6, 0(r3)
+; PWR10BE-NEXT:    addis r3, r2, .LCPI23_6@toc@ha
+; PWR10BE-NEXT:    vextsb2d v0, v0
+; PWR10BE-NEXT:    addi r3, r3, .LCPI23_6@toc@l
+; PWR10BE-NEXT:    vperm v1, v2, v2, v1
+; PWR10BE-NEXT:    vaddudm v5, v0, v5
+; PWR10BE-NEXT:    vaddudm v3, v4, v3
+; PWR10BE-NEXT:    vaddudm v3, v3, v5
+; PWR10BE-NEXT:    lxv v7, 0(r3)
+; PWR10BE-NEXT:    addis r3, r2, .LCPI23_7@toc@ha
+; PWR10BE-NEXT:    vextsb2d v1, v1
+; PWR10BE-NEXT:    addi r3, r3, .LCPI23_7@toc@l
+; PWR10BE-NEXT:    vperm v6, v2, v2, v6
+; PWR10BE-NEXT:    lxv v8, 0(r3)
+; PWR10BE-NEXT:    vextsb2d v6, v6
+; PWR10BE-NEXT:    vperm v7, v2, v2, v7
+; PWR10BE-NEXT:    vextsb2d v7, v7
+; PWR10BE-NEXT:    vperm v2, v2, v2, v8
+; PWR10BE-NEXT:    vextsb2d v2, v2
+; PWR10BE-NEXT:    vaddudm v2, v2, v7
+; PWR10BE-NEXT:    vaddudm v4, v6, v1
+; PWR10BE-NEXT:    vaddudm v2, v4, v2
+; PWR10BE-NEXT:    vaddudm v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vaddudm v2, v2, v3
+; PWR10BE-NEXT:    mfvsrd r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = sext <16 x i8> %a to <16 x i64> ; widen each byte lane with sign extension (eight <2 x i64> pieces in the expected asm)
+  %1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %0) ; sum all 16 widened lanes
+  ret i64 %1
+}
+
+define dso_local i64 @v16i8tov16i64_zero(<16 x i8> %a) local_unnamed_addr #0 { ; zero-extend i8 lanes to i64 (8 vperm against a zeroed register), then reduce-add
+; PWR9LE-LABEL: v16i8tov16i64_zero:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    addis r3, r2, .LCPI24_0@toc@ha
+; PWR9LE-NEXT:    xxlxor v4, v4, v4
+; PWR9LE-NEXT:    addi r3, r3, .LCPI24_0@toc@l
+; PWR9LE-NEXT:    lxv v3, 0(r3)
+; PWR9LE-NEXT:    addis r3, r2, .LCPI24_1@toc@ha
+; PWR9LE-NEXT:    addi r3, r3, .LCPI24_1@toc@l
+; PWR9LE-NEXT:    lxv v5, 0(r3)
+; PWR9LE-NEXT:    addis r3, r2, .LCPI24_2@toc@ha
+; PWR9LE-NEXT:    vperm v3, v4, v2, v3
+; PWR9LE-NEXT:    addi r3, r3, .LCPI24_2@toc@l
+; PWR9LE-NEXT:    lxv v0, 0(r3)
+; PWR9LE-NEXT:    addis r3, r2, .LCPI24_3@toc@ha
+; PWR9LE-NEXT:    vperm v5, v4, v2, v5
+; PWR9LE-NEXT:    addi r3, r3, .LCPI24_3@toc@l
+; PWR9LE-NEXT:    lxv v1, 0(r3)
+; PWR9LE-NEXT:    addis r3, r2, .LCPI24_4@toc@ha
+; PWR9LE-NEXT:    vaddudm v3, v5, v3
+; PWR9LE-NEXT:    vperm v0, v4, v2, v0
+; PWR9LE-NEXT:    addi r3, r3, .LCPI24_4@toc@l
+; PWR9LE-NEXT:    lxv v6, 0(r3)
+; PWR9LE-NEXT:    addis r3, r2, .LCPI24_5@toc@ha
+; PWR9LE-NEXT:    vperm v1, v4, v2, v1
+; PWR9LE-NEXT:    addi r3, r3, .LCPI24_5@toc@l
+; PWR9LE-NEXT:    lxv v7, 0(r3)
+; PWR9LE-NEXT:    addis r3, r2, .LCPI24_6@toc@ha
+; PWR9LE-NEXT:    vaddudm v0, v1, v0
+; PWR9LE-NEXT:    vperm v6, v4, v2, v6
+; PWR9LE-NEXT:    addi r3, r3, .LCPI24_6@toc@l
+; PWR9LE-NEXT:    lxv v8, 0(r3)
+; PWR9LE-NEXT:    addis r3, r2, .LCPI24_7@toc@ha
+; PWR9LE-NEXT:    vaddudm v3, v3, v0
+; PWR9LE-NEXT:    vperm v7, v4, v2, v7
+; PWR9LE-NEXT:    addi r3, r3, .LCPI24_7@toc@l
+; PWR9LE-NEXT:    lxv v9, 0(r3)
+; PWR9LE-NEXT:    vperm v8, v4, v2, v8
+; PWR9LE-NEXT:    vperm v2, v4, v2, v9
+; PWR9LE-NEXT:    vaddudm v4, v7, v6
+; PWR9LE-NEXT:    vaddudm v2, v2, v8
+; PWR9LE-NEXT:    vaddudm v2, v4, v2
+; PWR9LE-NEXT:    vaddudm v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vaddudm v2, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i8tov16i64_zero:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    addis r3, r2, .LCPI24_0@toc@ha
+; PWR9BE-NEXT:    xxlxor v4, v4, v4
+; PWR9BE-NEXT:    addi r3, r3, .LCPI24_0@toc@l
+; PWR9BE-NEXT:    lxv v3, 0(r3)
+; PWR9BE-NEXT:    addis r3, r2, .LCPI24_1@toc@ha
+; PWR9BE-NEXT:    addi r3, r3, .LCPI24_1@toc@l
+; PWR9BE-NEXT:    lxv v5, 0(r3)
+; PWR9BE-NEXT:    addis r3, r2, .LCPI24_2@toc@ha
+; PWR9BE-NEXT:    vperm v3, v4, v2, v3
+; PWR9BE-NEXT:    addi r3, r3, .LCPI24_2@toc@l
+; PWR9BE-NEXT:    lxv v0, 0(r3)
+; PWR9BE-NEXT:    addis r3, r2, .LCPI24_3@toc@ha
+; PWR9BE-NEXT:    vperm v5, v4, v2, v5
+; PWR9BE-NEXT:    addi r3, r3, .LCPI24_3@toc@l
+; PWR9BE-NEXT:    lxv v1, 0(r3)
+; PWR9BE-NEXT:    addis r3, r2, .LCPI24_4@toc@ha
+; PWR9BE-NEXT:    vaddudm v3, v5, v3
+; PWR9BE-NEXT:    vperm v0, v4, v2, v0
+; PWR9BE-NEXT:    addi r3, r3, .LCPI24_4@toc@l
+; PWR9BE-NEXT:    lxv v6, 0(r3)
+; PWR9BE-NEXT:    addis r3, r2, .LCPI24_5@toc@ha
+; PWR9BE-NEXT:    vperm v1, v4, v2, v1
+; PWR9BE-NEXT:    addi r3, r3, .LCPI24_5@toc@l
+; PWR9BE-NEXT:    lxv v7, 0(r3)
+; PWR9BE-NEXT:    addis r3, r2, .LCPI24_6@toc@ha
+; PWR9BE-NEXT:    vaddudm v0, v1, v0
+; PWR9BE-NEXT:    vperm v6, v4, v2, v6
+; PWR9BE-NEXT:    addi r3, r3, .LCPI24_6@toc@l
+; PWR9BE-NEXT:    lxv v8, 0(r3)
+; PWR9BE-NEXT:    addis r3, r2, .LCPI24_7@toc@ha
+; PWR9BE-NEXT:    vaddudm v3, v3, v0
+; PWR9BE-NEXT:    vperm v7, v4, v2, v7
+; PWR9BE-NEXT:    addi r3, r3, .LCPI24_7@toc@l
+; PWR9BE-NEXT:    lxv v9, 0(r3)
+; PWR9BE-NEXT:    vperm v8, v4, v2, v8
+; PWR9BE-NEXT:    vperm v2, v4, v2, v9
+; PWR9BE-NEXT:    vaddudm v4, v7, v6
+; PWR9BE-NEXT:    vaddudm v2, v2, v8
+; PWR9BE-NEXT:    vaddudm v2, v4, v2
+; PWR9BE-NEXT:    vaddudm v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vaddudm v2, v2, v3
+; PWR9BE-NEXT:    mfvsrd r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i8tov16i64_zero:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    plxv v3, .LCPI24_0@PCREL(0), 1
+; PWR10LE-NEXT:    plxv v5, .LCPI24_1@PCREL(0), 1
+; PWR10LE-NEXT:    xxlxor v4, v4, v4
+; PWR10LE-NEXT:    vperm v3, v4, v2, v3
+; PWR10LE-NEXT:    plxv v0, .LCPI24_2@PCREL(0), 1
+; PWR10LE-NEXT:    plxv v1, .LCPI24_3@PCREL(0), 1
+; PWR10LE-NEXT:    plxv v6, .LCPI24_4@PCREL(0), 1
+; PWR10LE-NEXT:    plxv v7, .LCPI24_5@PCREL(0), 1
+; PWR10LE-NEXT:    plxv v8, .LCPI24_6@PCREL(0), 1
+; PWR10LE-NEXT:    plxv v9, .LCPI24_7@PCREL(0), 1
+; PWR10LE-NEXT:    vperm v5, v4, v2, v5
+; PWR10LE-NEXT:    vperm v0, v4, v2, v0
+; PWR10LE-NEXT:    vperm v1, v4, v2, v1
+; PWR10LE-NEXT:    vperm v6, v4, v2, v6
+; PWR10LE-NEXT:    vperm v7, v4, v2, v7
+; PWR10LE-NEXT:    vperm v8, v4, v2, v8
+; PWR10LE-NEXT:    vperm v2, v4, v2, v9
+; PWR10LE-NEXT:    vaddudm v2, v2, v8
+; PWR10LE-NEXT:    vaddudm v4, v1, v0
+; PWR10LE-NEXT:    vaddudm v3, v5, v3
+; PWR10LE-NEXT:    vaddudm v3, v3, v4
+; PWR10LE-NEXT:    vaddudm v4, v7, v6
+; PWR10LE-NEXT:    vaddudm v2, v4, v2
+; PWR10LE-NEXT:    vaddudm v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vaddudm v2, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i8tov16i64_zero:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    addis r3, r2, .LCPI24_0@toc@ha
+; PWR10BE-NEXT:    xxlxor v4, v4, v4
+; PWR10BE-NEXT:    addi r3, r3, .LCPI24_0@toc@l
+; PWR10BE-NEXT:    lxv v3, 0(r3)
+; PWR10BE-NEXT:    addis r3, r2, .LCPI24_1@toc@ha
+; PWR10BE-NEXT:    addi r3, r3, .LCPI24_1@toc@l
+; PWR10BE-NEXT:    lxv v5, 0(r3)
+; PWR10BE-NEXT:    addis r3, r2, .LCPI24_2@toc@ha
+; PWR10BE-NEXT:    addi r3, r3, .LCPI24_2@toc@l
+; PWR10BE-NEXT:    vperm v3, v4, v2, v3
+; PWR10BE-NEXT:    lxv v0, 0(r3)
+; PWR10BE-NEXT:    addis r3, r2, .LCPI24_3@toc@ha
+; PWR10BE-NEXT:    addi r3, r3, .LCPI24_3@toc@l
+; PWR10BE-NEXT:    vperm v5, v4, v2, v5
+; PWR10BE-NEXT:    lxv v1, 0(r3)
+; PWR10BE-NEXT:    addis r3, r2, .LCPI24_4@toc@ha
+; PWR10BE-NEXT:    addi r3, r3, .LCPI24_4@toc@l
+; PWR10BE-NEXT:    vperm v0, v4, v2, v0
+; PWR10BE-NEXT:    lxv v6, 0(r3)
+; PWR10BE-NEXT:    addis r3, r2, .LCPI24_5@toc@ha
+; PWR10BE-NEXT:    addi r3, r3, .LCPI24_5@toc@l
+; PWR10BE-NEXT:    vperm v1, v4, v2, v1
+; PWR10BE-NEXT:    lxv v7, 0(r3)
+; PWR10BE-NEXT:    addis r3, r2, .LCPI24_6@toc@ha
+; PWR10BE-NEXT:    addi r3, r3, .LCPI24_6@toc@l
+; PWR10BE-NEXT:    vperm v6, v4, v2, v6
+; PWR10BE-NEXT:    lxv v8, 0(r3)
+; PWR10BE-NEXT:    addis r3, r2, .LCPI24_7@toc@ha
+; PWR10BE-NEXT:    addi r3, r3, .LCPI24_7@toc@l
+; PWR10BE-NEXT:    vperm v7, v4, v2, v7
+; PWR10BE-NEXT:    lxv v9, 0(r3)
+; PWR10BE-NEXT:    vperm v8, v4, v2, v8
+; PWR10BE-NEXT:    vperm v2, v4, v2, v9
+; PWR10BE-NEXT:    vaddudm v4, v1, v0
+; PWR10BE-NEXT:    vaddudm v3, v5, v3
+; PWR10BE-NEXT:    vaddudm v3, v3, v4
+; PWR10BE-NEXT:    vaddudm v2, v2, v8
+; PWR10BE-NEXT:    vaddudm v4, v7, v6
+; PWR10BE-NEXT:    vaddudm v2, v4, v2
+; PWR10BE-NEXT:    vaddudm v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vaddudm v2, v2, v3
+; PWR10BE-NEXT:    mfvsrd r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = zext <16 x i8> %a to <16 x i64> ; widen each byte lane with zeros (vperm against the xxlxor-zeroed v4 in the expected asm)
+  %1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %0) ; sum all 16 widened lanes
+  ret i64 %1
+}
+
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) #0
+declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #0
+declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) #0
+declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) #0
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-and.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-and.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-and.ll
@@ -0,0 +1,390 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr10 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR10LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr10 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR10BE
+
+;;
+;; Vectors of type i32
+;;
+define dso_local i32 @v2i32(<2 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxspltw vs0, v2, 2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    xxland v2, v2, vs0
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxspltw vs0, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    xxland v2, v2, vs0
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxspltw vs0, v2, 2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    xxland v2, v2, vs0
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxspltw vs0, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    xxland v2, v2, vs0
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a)
+  ret i32 %0
+}
+
+define dso_local i32 @v4i32(<4 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    xxland vs0, v2, v3
+; PWR9LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR9LE-NEXT:    xxland v2, vs0, vs1
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    xxland vs0, v2, v3
+; PWR9BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR9BE-NEXT:    xxland v2, vs0, vs1
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    xxland vs0, v2, v3
+; PWR10LE-NEXT:    xxspltw vs0, vs0, 2
+; PWR10LE-NEXT:    xxeval v2, v2, v3, vs0, 1
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    xxland vs0, v2, v3
+; PWR10BE-NEXT:    xxspltw vs0, vs0, 1
+; PWR10BE-NEXT:    xxeval v2, v2, v3, vs0, 1
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
+  ret i32 %0
+}
+
+define dso_local i32 @v8i32(<8 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxland vs0, v2, v3
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xxland vs0, vs0, v2
+; PWR9LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR9LE-NEXT:    xxland v2, vs0, vs1
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxland vs0, v2, v3
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xxland vs0, vs0, v2
+; PWR9BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR9BE-NEXT:    xxland v2, vs0, vs1
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxland vs0, v2, v3
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    xxswapd v4, vs0
+; PWR10LE-NEXT:    xxeval vs1, v2, v3, v4, 1
+; PWR10LE-NEXT:    xxspltw vs1, vs1, 2
+; PWR10LE-NEXT:    xxeval v2, vs0, v4, vs1, 1
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxland vs0, v2, v3
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    xxswapd v4, vs0
+; PWR10BE-NEXT:    xxeval vs1, v2, v3, v4, 1
+; PWR10BE-NEXT:    xxspltw vs1, vs1, 1
+; PWR10BE-NEXT:    xxeval v2, vs0, v4, vs1, 1
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %a)
+  ret i32 %0
+}
+
+define dso_local i32 @v16i32(<16 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxland vs0, v3, v5
+; PWR9LE-NEXT:    xxland vs1, v2, v4
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    xxland vs0, vs1, vs0
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xxland vs0, vs0, v2
+; PWR9LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR9LE-NEXT:    xxland v2, vs0, vs1
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxland vs0, v3, v5
+; PWR9BE-NEXT:    xxland vs1, v2, v4
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    xxland vs0, vs1, vs0
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xxland vs0, vs0, v2
+; PWR9BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR9BE-NEXT:    xxland v2, vs0, vs1
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxland vs1, v2, v4
+; PWR10LE-NEXT:    xxland vs0, v3, v5
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    xxeval vs2, vs1, v3, v5, 1
+; PWR10LE-NEXT:    xxswapd v2, vs2
+; PWR10LE-NEXT:    xxeval vs0, vs1, vs0, v2, 1
+; PWR10LE-NEXT:    xxspltw vs0, vs0, 2
+; PWR10LE-NEXT:    xxeval v2, vs2, v2, vs0, 1
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxland vs1, v2, v4
+; PWR10BE-NEXT:    xxland vs0, v3, v5
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    xxeval vs2, vs1, v3, v5, 1
+; PWR10BE-NEXT:    xxswapd v2, vs2
+; PWR10BE-NEXT:    xxeval vs0, vs1, vs0, v2, 1
+; PWR10BE-NEXT:    xxspltw vs0, vs0, 1
+; PWR10BE-NEXT:    xxeval v2, vs2, v2, vs0, 1
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %a)
+  ret i32 %0
+}
+
+declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>) #0
+declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>) #0
+declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>) #0
+declare i32 @llvm.vector.reduce.and.v16i32(<16 x i32>) #0
+
+;;
+;; Vectors of type i64
+;;
+define dso_local i64 @v2i64(<2 x i64> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    xxland vs0, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    xxland vs0, v2, v3
+; PWR9BE-NEXT:    mffprd r3, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    xxland vs0, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    xxland vs0, v2, v3
+; PWR10BE-NEXT:    mffprd r3, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a)
+  ret i64 %0
+}
+
+define dso_local i64 @v4i64(<4 x i64> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxland vs0, v2, v3
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xxland vs0, vs0, v2
+; PWR9LE-NEXT:    mfvsrld r3, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxland vs0, v2, v3
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xxland vs0, vs0, v2
+; PWR9BE-NEXT:    mffprd r3, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxland vs0, v2, v3
+; PWR10LE-NEXT:    xxswapd v4, vs0
+; PWR10LE-NEXT:    xxeval vs0, v2, v3, v4, 1
+; PWR10LE-NEXT:    mfvsrld r3, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxland vs0, v2, v3
+; PWR10BE-NEXT:    xxswapd v4, vs0
+; PWR10BE-NEXT:    xxeval vs0, v2, v3, v4, 1
+; PWR10BE-NEXT:    mffprd r3, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %a)
+  ret i64 %0
+}
+
+define dso_local i64 @v8i64(<8 x i64> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxland vs0, v3, v5
+; PWR9LE-NEXT:    xxland vs1, v2, v4
+; PWR9LE-NEXT:    xxland vs0, vs1, vs0
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xxland vs0, vs0, v2
+; PWR9LE-NEXT:    mfvsrld r3, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxland vs0, v3, v5
+; PWR9BE-NEXT:    xxland vs1, v2, v4
+; PWR9BE-NEXT:    xxland vs0, vs1, vs0
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xxland vs0, vs0, v2
+; PWR9BE-NEXT:    mffprd r3, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxland vs1, v2, v4
+; PWR10LE-NEXT:    xxland vs0, v3, v5
+; PWR10LE-NEXT:    xxeval vs2, vs1, v3, v5, 1
+; PWR10LE-NEXT:    xxswapd v2, vs2
+; PWR10LE-NEXT:    xxeval vs0, vs1, vs0, v2, 1
+; PWR10LE-NEXT:    mfvsrld r3, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxland vs1, v2, v4
+; PWR10BE-NEXT:    xxland vs0, v3, v5
+; PWR10BE-NEXT:    xxeval vs2, vs1, v3, v5, 1
+; PWR10BE-NEXT:    xxswapd v2, vs2
+; PWR10BE-NEXT:    xxeval vs0, vs1, vs0, v2, 1
+; PWR10BE-NEXT:    mffprd r3, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %a)
+  ret i64 %0
+}
+
+define dso_local i64 @v16i64(<16 x i64> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxland vs0, v4, v8
+; PWR9LE-NEXT:    xxland vs1, v2, v6
+; PWR9LE-NEXT:    xxland vs2, v5, v9
+; PWR9LE-NEXT:    xxland vs3, v3, v7
+; PWR9LE-NEXT:    xxland vs2, vs3, vs2
+; PWR9LE-NEXT:    xxland vs0, vs1, vs0
+; PWR9LE-NEXT:    xxland vs0, vs0, vs2
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xxland vs0, vs0, v2
+; PWR9LE-NEXT:    mfvsrld r3, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxland vs0, v4, v8
+; PWR9BE-NEXT:    xxland vs1, v2, v6
+; PWR9BE-NEXT:    xxland vs2, v5, v9
+; PWR9BE-NEXT:    xxland vs3, v3, v7
+; PWR9BE-NEXT:    xxland vs2, vs3, vs2
+; PWR9BE-NEXT:    xxland vs0, vs1, vs0
+; PWR9BE-NEXT:    xxland vs0, vs0, vs2
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xxland vs0, vs0, v2
+; PWR9BE-NEXT:    mffprd r3, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxland vs1, v2, v6
+; PWR10LE-NEXT:    xxland vs0, v5, v9
+; PWR10LE-NEXT:    xxland vs2, v3, v7
+; PWR10LE-NEXT:    xxeval vs1, vs1, v4, v8, 1
+; PWR10LE-NEXT:    xxeval vs3, vs2, v5, v9, 1
+; PWR10LE-NEXT:    xxeval vs0, vs1, vs2, vs0, 1
+; PWR10LE-NEXT:    xxswapd v2, vs0
+; PWR10LE-NEXT:    xxeval vs0, vs1, vs3, v2, 1
+; PWR10LE-NEXT:    mfvsrld r3, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxland vs1, v2, v6
+; PWR10BE-NEXT:    xxland vs0, v5, v9
+; PWR10BE-NEXT:    xxland vs2, v3, v7
+; PWR10BE-NEXT:    xxeval vs1, vs1, v4, v8, 1
+; PWR10BE-NEXT:    xxeval vs3, vs2, v5, v9, 1
+; PWR10BE-NEXT:    xxeval vs0, vs1, vs2, vs0, 1
+; PWR10BE-NEXT:    xxswapd v2, vs0
+; PWR10BE-NEXT:    xxeval vs0, vs1, vs3, v2, 1
+; PWR10BE-NEXT:    mffprd r3, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> %a)
+  ret i64 %0
+}
+
+declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>) #0
+declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>) #0
+declare i64 @llvm.vector.reduce.and.v8i64(<8 x i64>) #0
+declare i64 @llvm.vector.reduce.and.v16i64(<16 x i64>) #0
+
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll
@@ -0,0 +1,4247 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mattr=-paired-vector-memops -mcpu=pwr10 -mtriple=powerpc64le < %s | \
+; RUN:   FileCheck %s --check-prefix=PWR10LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mattr=-paired-vector-memops -mcpu=pwr10 -mtriple=powerpc64 < %s | \
+; RUN:   FileCheck %s --check-prefix=PWR10BE
+
+;;
+;; Vectors of f32
+;;
+define dso_local float @v2f32(<2 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2f32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR9LE-NEXT:    xxswapd vs1, v2
+; PWR9LE-NEXT:    xscvspdpn f0, vs0
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f1, f0, f1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2f32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9BE-NEXT:    xscvspdpn f0, v2
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2f32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR10LE-NEXT:    xxswapd vs1, v2
+; PWR10LE-NEXT:    xscvspdpn f0, vs0
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f1, f0, f1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2f32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10BE-NEXT:    xscvspdpn f0, v2
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> %a)
+  ret float %0
+}
+
+define dso_local float @v2f32_b(<2 x float> %a, float %b) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2f32_b:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR9LE-NEXT:    xscvspdpn f0, vs0
+; PWR9LE-NEXT:    xsaddsp f0, f1, f0
+; PWR9LE-NEXT:    xxswapd vs1, v2
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f1, f0, f1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2f32_b:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xscvspdpn f0, v2
+; PWR9BE-NEXT:    xsaddsp f0, f1, f0
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2f32_b:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR10LE-NEXT:    xscvspdpn f0, vs0
+; PWR10LE-NEXT:    xsaddsp f0, f1, f0
+; PWR10LE-NEXT:    xxswapd vs1, v2
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f1, f0, f1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2f32_b:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xscvspdpn f0, v2
+; PWR10BE-NEXT:    xsaddsp f0, f1, f0
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fadd.v2f32(float %b, <2 x float> %a)
+  ret float %0
+}
+
+define dso_local float @v2f32_fast(<2 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2f32_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxspltw vs0, v2, 2
+; PWR9LE-NEXT:    xvaddsp vs0, v2, vs0
+; PWR9LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2f32_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxspltw vs0, v2, 1
+; PWR9BE-NEXT:    xvaddsp vs0, v2, vs0
+; PWR9BE-NEXT:    xscvspdpn f1, vs0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2f32_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxspltw vs0, v2, 2
+; PWR10LE-NEXT:    xvaddsp vs0, v2, vs0
+; PWR10LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2f32_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxspltw vs0, v2, 1
+; PWR10BE-NEXT:    xvaddsp vs0, v2, vs0
+; PWR10BE-NEXT:    xscvspdpn f1, vs0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> %a)
+  ret float %0
+}
+
+define dso_local float @v4f32(<4 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4f32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR9LE-NEXT:    xxswapd vs1, v2
+; PWR9LE-NEXT:    xscvspdpn f0, vs0
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v2
+; PWR9LE-NEXT:    xsaddsp f1, f0, f1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4f32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9BE-NEXT:    xscvspdpn f0, v2
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v2
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4f32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR10LE-NEXT:    xxswapd vs1, v2
+; PWR10LE-NEXT:    xscvspdpn f0, vs0
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v2
+; PWR10LE-NEXT:    xsaddsp f1, f0, f1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4f32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10BE-NEXT:    xscvspdpn f0, v2
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v2
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %a)
+  ret float %0
+}
+
+define dso_local float @v4f32_b(<4 x float> %a, float %b) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4f32_b:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR9LE-NEXT:    xscvspdpn f0, vs0
+; PWR9LE-NEXT:    xsaddsp f0, f1, f0
+; PWR9LE-NEXT:    xxswapd vs1, v2
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v2
+; PWR9LE-NEXT:    xsaddsp f1, f0, f1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4f32_b:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xscvspdpn f0, v2
+; PWR9BE-NEXT:    xsaddsp f0, f1, f0
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v2
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4f32_b:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR10LE-NEXT:    xscvspdpn f0, vs0
+; PWR10LE-NEXT:    xsaddsp f0, f1, f0
+; PWR10LE-NEXT:    xxswapd vs1, v2
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v2
+; PWR10LE-NEXT:    xsaddsp f1, f0, f1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4f32_b:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xscvspdpn f0, v2
+; PWR10BE-NEXT:    xsaddsp f0, f1, f0
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v2
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fadd.v4f32(float %b, <4 x float> %a)
+  ret float %0
+}
+
+define dso_local float @v4f32_fast(<4 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4f32_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    xvaddsp vs0, v2, v3
+; PWR9LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR9LE-NEXT:    xvaddsp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4f32_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    xvaddsp vs0, v2, v3
+; PWR9BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR9BE-NEXT:    xvaddsp vs0, vs0, vs1
+; PWR9BE-NEXT:    xscvspdpn f1, vs0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4f32_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    xvaddsp vs0, v2, v3
+; PWR10LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR10LE-NEXT:    xvaddsp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4f32_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    xvaddsp vs0, v2, v3
+; PWR10BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR10BE-NEXT:    xvaddsp vs0, vs0, vs1
+; PWR10BE-NEXT:    xscvspdpn f1, vs0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %a)
+  ret float %0
+}
+
+define dso_local float @v8f32(<8 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8f32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR9LE-NEXT:    xxswapd vs1, v2
+; PWR9LE-NEXT:    xscvspdpn f0, vs0
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v2
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v3
+; PWR9LE-NEXT:    xsaddsp f1, f0, f1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8f32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9BE-NEXT:    xscvspdpn f0, v2
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v2
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xscvspdpn f1, v3
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8f32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR10LE-NEXT:    xxswapd vs1, v2
+; PWR10LE-NEXT:    xscvspdpn f0, vs0
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v2
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v3
+; PWR10LE-NEXT:    xsaddsp f1, f0, f1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8f32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10BE-NEXT:    xscvspdpn f0, v2
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v2
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xscvspdpn f1, v3
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %a)
+  ret float %0
+}
+
+define dso_local float @v8f32_b(<8 x float> %a, float %b) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8f32_b:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR9LE-NEXT:    xscvspdpn f0, vs0
+; PWR9LE-NEXT:    xsaddsp f0, f1, f0
+; PWR9LE-NEXT:    xxswapd vs1, v2
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v2
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v3
+; PWR9LE-NEXT:    xsaddsp f1, f0, f1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8f32_b:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xscvspdpn f0, v2
+; PWR9BE-NEXT:    xsaddsp f0, f1, f0
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v2
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xscvspdpn f1, v3
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8f32_b:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR10LE-NEXT:    xscvspdpn f0, vs0
+; PWR10LE-NEXT:    xsaddsp f0, f1, f0
+; PWR10LE-NEXT:    xxswapd vs1, v2
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v2
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v3
+; PWR10LE-NEXT:    xsaddsp f1, f0, f1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8f32_b:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xscvspdpn f0, v2
+; PWR10BE-NEXT:    xsaddsp f0, f1, f0
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v2
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xscvspdpn f1, v3
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fadd.v8f32(float %b, <8 x float> %a)
+  ret float %0
+}
+
+; Fast-math v8f32 fadd reduction (identity start -0.0): reassociation permits a
+; log2 tree of vector xvaddsp adds instead of an ordered scalar chain.
+define dso_local float @v8f32_fast(<8 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8f32_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvaddsp vs0, v2, v3
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xvaddsp vs0, vs0, v2
+; PWR9LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR9LE-NEXT:    xvaddsp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8f32_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvaddsp vs0, v2, v3
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xvaddsp vs0, vs0, v2
+; PWR9BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR9BE-NEXT:    xvaddsp vs0, vs0, vs1
+; PWR9BE-NEXT:    xscvspdpn f1, vs0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8f32_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvaddsp vs0, v2, v3
+; PWR10LE-NEXT:    xxswapd v2, vs0
+; PWR10LE-NEXT:    xvaddsp vs0, vs0, v2
+; PWR10LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR10LE-NEXT:    xvaddsp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8f32_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvaddsp vs0, v2, v3
+; PWR10BE-NEXT:    xxswapd v2, vs0
+; PWR10BE-NEXT:    xvaddsp vs0, vs0, v2
+; PWR10BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR10BE-NEXT:    xvaddsp vs0, vs0, vs1
+; PWR10BE-NEXT:    xscvspdpn f1, vs0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %a)
+  ret float %0
+}
+
+; Strict (ordered) v16f32 fadd reduction with -0.0 start: no fast-math flags, so
+; each of the 16 lanes is extracted (xxsldwi/xxswapd + xscvspdpn) and summed
+; in order with scalar xsaddsp.
+define dso_local float @v16f32(<16 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16f32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR9LE-NEXT:    xxswapd vs1, v2
+; PWR9LE-NEXT:    xscvspdpn f0, vs0
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v2
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v3
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v4, v4, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v4
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v4, v4, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v4
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v5, v5, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v5
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v5, v5, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v5
+; PWR9LE-NEXT:    xsaddsp f1, f0, f1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16f32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9BE-NEXT:    xscvspdpn f0, v2
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v2
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xscvspdpn f1, v3
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xscvspdpn f1, v4
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v4, v4, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v4
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v4, v4, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xscvspdpn f1, v5
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v5, v5, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v5
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v5, v5, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16f32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR10LE-NEXT:    xxswapd vs1, v2
+; PWR10LE-NEXT:    xscvspdpn f0, vs0
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v2
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v3
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v4, v4, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v4
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v4, v4, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v4
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v5, v5, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v5
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v5, v5, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v5
+; PWR10LE-NEXT:    xsaddsp f1, f0, f1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16f32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10BE-NEXT:    xscvspdpn f0, v2
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v2
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xscvspdpn f1, v3
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xscvspdpn f1, v4
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v4, v4, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v4
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v4, v4, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xscvspdpn f1, v5
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v5, v5, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v5
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v5, v5, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> %a)
+  ret float %0
+}
+
+; Same ordered v16f32 reduction, but seeded with scalar %b (in f1), so the
+; first xsaddsp folds %b with the first extracted lane.
+define dso_local float @v16f32_b(<16 x float> %a, float %b) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16f32_b:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR9LE-NEXT:    xscvspdpn f0, vs0
+; PWR9LE-NEXT:    xsaddsp f0, f1, f0
+; PWR9LE-NEXT:    xxswapd vs1, v2
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v2
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v3
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v4, v4, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v4
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v4, v4, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v4
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v5, v5, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v5
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v5, v5, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsaddsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v5
+; PWR9LE-NEXT:    xsaddsp f1, f0, f1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16f32_b:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xscvspdpn f0, v2
+; PWR9BE-NEXT:    xsaddsp f0, f1, f0
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v2
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xscvspdpn f1, v3
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xscvspdpn f1, v4
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v4, v4, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v4
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v4, v4, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xscvspdpn f1, v5
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v5, v5, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v5
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v5, v5, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsaddsp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16f32_b:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR10LE-NEXT:    xscvspdpn f0, vs0
+; PWR10LE-NEXT:    xsaddsp f0, f1, f0
+; PWR10LE-NEXT:    xxswapd vs1, v2
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v2
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v3
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v4, v4, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v4
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v4, v4, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v4
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v5, v5, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v5
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v5, v5, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsaddsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v5
+; PWR10LE-NEXT:    xsaddsp f1, f0, f1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16f32_b:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xscvspdpn f0, v2
+; PWR10BE-NEXT:    xsaddsp f0, f1, f0
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v2
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xscvspdpn f1, v3
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xscvspdpn f1, v4
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v4, v4, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v4
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v4, v4, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xscvspdpn f1, v5
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v5, v5, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v5
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v5, v5, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsaddsp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fadd.v16f32(float %b, <16 x float> %a)
+  ret float %0
+}
+
+; Fast-math v16f32 fadd reduction: pairwise xvaddsp tree across the four input
+; registers, then a within-register shuffle/add tree down to one lane.
+define dso_local float @v16f32_fast(<16 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16f32_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvaddsp vs0, v3, v5
+; PWR9LE-NEXT:    xvaddsp vs1, v2, v4
+; PWR9LE-NEXT:    xvaddsp vs0, vs1, vs0
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xvaddsp vs0, vs0, v2
+; PWR9LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR9LE-NEXT:    xvaddsp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16f32_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvaddsp vs0, v3, v5
+; PWR9BE-NEXT:    xvaddsp vs1, v2, v4
+; PWR9BE-NEXT:    xvaddsp vs0, vs1, vs0
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xvaddsp vs0, vs0, v2
+; PWR9BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR9BE-NEXT:    xvaddsp vs0, vs0, vs1
+; PWR9BE-NEXT:    xscvspdpn f1, vs0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16f32_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvaddsp vs0, v3, v5
+; PWR10LE-NEXT:    xvaddsp vs1, v2, v4
+; PWR10LE-NEXT:    xvaddsp vs0, vs1, vs0
+; PWR10LE-NEXT:    xxswapd v2, vs0
+; PWR10LE-NEXT:    xvaddsp vs0, vs0, v2
+; PWR10LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR10LE-NEXT:    xvaddsp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16f32_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvaddsp vs0, v3, v5
+; PWR10BE-NEXT:    xvaddsp vs1, v2, v4
+; PWR10BE-NEXT:    xvaddsp vs0, vs1, vs0
+; PWR10BE-NEXT:    xxswapd v2, vs0
+; PWR10BE-NEXT:    xvaddsp vs0, vs0, v2
+; PWR10BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR10BE-NEXT:    xvaddsp vs0, vs0, vs1
+; PWR10BE-NEXT:    xscvspdpn f1, vs0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> %a)
+  ret float %0
+}
+
+declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>) #0
+declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>) #0
+declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>) #0
+declare float @llvm.vector.reduce.fadd.v16f32(float, <16 x float>) #0
+
+;;
+;; Vectors of f64
+;;
+; Ordered v2f64 fadd reduction (-0.0 start): swap to get the other lane, one
+; scalar xsadddp; LE/BE differ only in which lane comes first.
+define dso_local double @v2f64(<2 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2f64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs0, v2
+; PWR9LE-NEXT:    xsadddp f1, f0, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2f64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd vs0, v2
+; PWR9BE-NEXT:    xsadddp f1, v2, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2f64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs0, v2
+; PWR10LE-NEXT:    xsadddp f1, f0, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2f64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd vs0, v2
+; PWR10BE-NEXT:    xsadddp f1, v2, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %a)
+  ret double %0
+}
+
+; Ordered v2f64 reduction seeded with %b (in f1): %b is added to the first
+; lane before the second.
+define dso_local double @v2f64_b(<2 x double> %a, double %b) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2f64_b:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs0, v2
+; PWR9LE-NEXT:    xsadddp f0, f1, f0
+; PWR9LE-NEXT:    xsadddp f1, f0, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2f64_b:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xsadddp f0, f1, v2
+; PWR9BE-NEXT:    xxswapd vs1, v2
+; PWR9BE-NEXT:    xsadddp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2f64_b:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs0, v2
+; PWR10LE-NEXT:    xsadddp f0, f1, f0
+; PWR10LE-NEXT:    xsadddp f1, f0, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2f64_b:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xsadddp f0, f1, v2
+; PWR10BE-NEXT:    xxswapd vs1, v2
+; PWR10BE-NEXT:    xsadddp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fadd.v2f64(double %b, <2 x double> %a)
+  ret double %0
+}
+
+; Fast-math v2f64 reduction: one vector xvadddp of the lanes; LE needs an extra
+; swap to place the result in doubleword 0 (f1).
+define dso_local double @v2f64_fast(<2 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2f64_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs0, v2
+; PWR9LE-NEXT:    xvadddp vs0, v2, vs0
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2f64_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd vs0, v2
+; PWR9BE-NEXT:    xvadddp vs1, v2, vs0
+; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2f64_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs0, v2
+; PWR10LE-NEXT:    xvadddp vs0, v2, vs0
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2f64_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd vs0, v2
+; PWR10BE-NEXT:    xvadddp vs1, v2, vs0
+; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %a)
+  ret double %0
+}
+
+; Ordered v4f64 fadd reduction (-0.0 start): four lanes summed strictly in
+; order with scalar xsadddp; lane order differs between LE and BE.
+define dso_local double @v4f64(<4 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4f64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs0, v2
+; PWR9LE-NEXT:    xxswapd vs1, v3
+; PWR9LE-NEXT:    xsadddp f0, f0, v2
+; PWR9LE-NEXT:    xsadddp f0, f0, f1
+; PWR9LE-NEXT:    xsadddp f1, f0, v3
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4f64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd vs0, v2
+; PWR9BE-NEXT:    xxswapd vs1, v3
+; PWR9BE-NEXT:    xsadddp f0, v2, f0
+; PWR9BE-NEXT:    xsadddp f0, f0, v3
+; PWR9BE-NEXT:    xsadddp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4f64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs0, v2
+; PWR10LE-NEXT:    xxswapd vs1, v3
+; PWR10LE-NEXT:    xsadddp f0, f0, v2
+; PWR10LE-NEXT:    xsadddp f0, f0, f1
+; PWR10LE-NEXT:    xsadddp f1, f0, v3
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4f64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd vs0, v2
+; PWR10BE-NEXT:    xxswapd vs1, v3
+; PWR10BE-NEXT:    xsadddp f0, v2, f0
+; PWR10BE-NEXT:    xsadddp f0, f0, v3
+; PWR10BE-NEXT:    xsadddp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> %a)
+  ret double %0
+}
+
+; Ordered v4f64 reduction seeded with %b (in f1); %b folds with the first lane
+; and the remaining lanes chain in order.
+define dso_local double @v4f64_b(<4 x double> %a, double %b) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4f64_b:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs0, v2
+; PWR9LE-NEXT:    xsadddp f0, f1, f0
+; PWR9LE-NEXT:    xxswapd vs1, v3
+; PWR9LE-NEXT:    xsadddp f0, f0, v2
+; PWR9LE-NEXT:    xsadddp f0, f0, f1
+; PWR9LE-NEXT:    xsadddp f1, f0, v3
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4f64_b:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xsadddp f0, f1, v2
+; PWR9BE-NEXT:    xxswapd vs1, v2
+; PWR9BE-NEXT:    xsadddp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v3
+; PWR9BE-NEXT:    xsadddp f0, f0, v3
+; PWR9BE-NEXT:    xsadddp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4f64_b:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs0, v2
+; PWR10LE-NEXT:    xsadddp f0, f1, f0
+; PWR10LE-NEXT:    xxswapd vs1, v3
+; PWR10LE-NEXT:    xsadddp f0, f0, v2
+; PWR10LE-NEXT:    xsadddp f0, f0, f1
+; PWR10LE-NEXT:    xsadddp f1, f0, v3
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4f64_b:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xsadddp f0, f1, v2
+; PWR10BE-NEXT:    xxswapd vs1, v2
+; PWR10BE-NEXT:    xsadddp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v3
+; PWR10BE-NEXT:    xsadddp f0, f0, v3
+; PWR10BE-NEXT:    xsadddp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fadd.v4f64(double %b, <4 x double> %a)
+  ret double %0
+}
+
+; Fast-math v4f64 reduction: xvadddp tree (cross-register, then cross-lane);
+; LE adds a final swap to move the sum into f1.
+define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4f64_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvadddp vs0, v2, v3
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xvadddp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4f64_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvadddp vs0, v2, v3
+; PWR9BE-NEXT:    xxswapd vs1, vs0
+; PWR9BE-NEXT:    xvadddp vs1, vs0, vs1
+; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4f64_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvadddp vs0, v2, v3
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xvadddp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4f64_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvadddp vs0, v2, v3
+; PWR10BE-NEXT:    xxswapd vs1, vs0
+; PWR10BE-NEXT:    xvadddp vs1, vs0, vs1
+; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> %a)
+  ret double %0
+}
+
+; Ordered v8f64 fadd reduction (-0.0 start): eight lanes across v2-v5 summed
+; strictly in order with scalar xsadddp.
+define dso_local double @v8f64(<8 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8f64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs0, v2
+; PWR9LE-NEXT:    xxswapd vs1, v3
+; PWR9LE-NEXT:    xsadddp f0, f0, v2
+; PWR9LE-NEXT:    xsadddp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v4
+; PWR9LE-NEXT:    xsadddp f0, f0, v3
+; PWR9LE-NEXT:    xsadddp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v5
+; PWR9LE-NEXT:    xsadddp f0, f0, v4
+; PWR9LE-NEXT:    xsadddp f0, f0, f1
+; PWR9LE-NEXT:    xsadddp f1, f0, v5
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8f64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd vs0, v2
+; PWR9BE-NEXT:    xxswapd vs1, v3
+; PWR9BE-NEXT:    xsadddp f0, v2, f0
+; PWR9BE-NEXT:    xsadddp f0, f0, v3
+; PWR9BE-NEXT:    xsadddp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v4
+; PWR9BE-NEXT:    xsadddp f0, f0, v4
+; PWR9BE-NEXT:    xsadddp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v5
+; PWR9BE-NEXT:    xsadddp f0, f0, v5
+; PWR9BE-NEXT:    xsadddp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8f64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs0, v2
+; PWR10LE-NEXT:    xxswapd vs1, v3
+; PWR10LE-NEXT:    xsadddp f0, f0, v2
+; PWR10LE-NEXT:    xsadddp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v4
+; PWR10LE-NEXT:    xsadddp f0, f0, v3
+; PWR10LE-NEXT:    xsadddp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v5
+; PWR10LE-NEXT:    xsadddp f0, f0, v4
+; PWR10LE-NEXT:    xsadddp f0, f0, f1
+; PWR10LE-NEXT:    xsadddp f1, f0, v5
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8f64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd vs0, v2
+; PWR10BE-NEXT:    xxswapd vs1, v3
+; PWR10BE-NEXT:    xsadddp f0, v2, f0
+; PWR10BE-NEXT:    xsadddp f0, f0, v3
+; PWR10BE-NEXT:    xsadddp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v4
+; PWR10BE-NEXT:    xsadddp f0, f0, v4
+; PWR10BE-NEXT:    xsadddp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v5
+; PWR10BE-NEXT:    xsadddp f0, f0, v5
+; PWR10BE-NEXT:    xsadddp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> %a)
+  ret double %0
+}
+
+; Ordered v8f64 reduction seeded with %b (in f1); same strict chain as v8f64
+; with %b folded into the first add.
+define dso_local double @v8f64_b(<8 x double> %a, double %b) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8f64_b:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs0, v2
+; PWR9LE-NEXT:    xsadddp f0, f1, f0
+; PWR9LE-NEXT:    xxswapd vs1, v3
+; PWR9LE-NEXT:    xsadddp f0, f0, v2
+; PWR9LE-NEXT:    xsadddp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v4
+; PWR9LE-NEXT:    xsadddp f0, f0, v3
+; PWR9LE-NEXT:    xsadddp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v5
+; PWR9LE-NEXT:    xsadddp f0, f0, v4
+; PWR9LE-NEXT:    xsadddp f0, f0, f1
+; PWR9LE-NEXT:    xsadddp f1, f0, v5
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8f64_b:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xsadddp f0, f1, v2
+; PWR9BE-NEXT:    xxswapd vs1, v2
+; PWR9BE-NEXT:    xsadddp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v3
+; PWR9BE-NEXT:    xsadddp f0, f0, v3
+; PWR9BE-NEXT:    xsadddp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v4
+; PWR9BE-NEXT:    xsadddp f0, f0, v4
+; PWR9BE-NEXT:    xsadddp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v5
+; PWR9BE-NEXT:    xsadddp f0, f0, v5
+; PWR9BE-NEXT:    xsadddp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8f64_b:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs0, v2
+; PWR10LE-NEXT:    xsadddp f0, f1, f0
+; PWR10LE-NEXT:    xxswapd vs1, v3
+; PWR10LE-NEXT:    xsadddp f0, f0, v2
+; PWR10LE-NEXT:    xsadddp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v4
+; PWR10LE-NEXT:    xsadddp f0, f0, v3
+; PWR10LE-NEXT:    xsadddp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v5
+; PWR10LE-NEXT:    xsadddp f0, f0, v4
+; PWR10LE-NEXT:    xsadddp f0, f0, f1
+; PWR10LE-NEXT:    xsadddp f1, f0, v5
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8f64_b:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xsadddp f0, f1, v2
+; PWR10BE-NEXT:    xxswapd vs1, v2
+; PWR10BE-NEXT:    xsadddp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v3
+; PWR10BE-NEXT:    xsadddp f0, f0, v3
+; PWR10BE-NEXT:    xsadddp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v4
+; PWR10BE-NEXT:    xsadddp f0, f0, v4
+; PWR10BE-NEXT:    xsadddp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v5
+; PWR10BE-NEXT:    xsadddp f0, f0, v5
+; PWR10BE-NEXT:    xsadddp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fadd.v8f64(double %b, <8 x double> %a)
+  ret double %0
+}
+
+; Fast-math v8f64 reduction: pairwise xvadddp across registers, then a final
+; cross-lane add; LE adds a swap to place the result in f1.
+define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8f64_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvadddp vs0, v3, v5
+; PWR9LE-NEXT:    xvadddp vs1, v2, v4
+; PWR9LE-NEXT:    xvadddp vs0, vs1, vs0
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xvadddp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8f64_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvadddp vs0, v3, v5
+; PWR9BE-NEXT:    xvadddp vs1, v2, v4
+; PWR9BE-NEXT:    xvadddp vs0, vs1, vs0
+; PWR9BE-NEXT:    xxswapd vs1, vs0
+; PWR9BE-NEXT:    xvadddp vs1, vs0, vs1
+; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8f64_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvadddp vs0, v3, v5
+; PWR10LE-NEXT:    xvadddp vs1, v2, v4
+; PWR10LE-NEXT:    xvadddp vs0, vs1, vs0
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xvadddp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8f64_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvadddp vs0, v3, v5
+; PWR10BE-NEXT:    xvadddp vs1, v2, v4
+; PWR10BE-NEXT:    xvadddp vs0, vs1, vs0
+; PWR10BE-NEXT:    xxswapd vs1, vs0
+; PWR10BE-NEXT:    xvadddp vs1, vs0, vs1
+; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> %a)
+  ret double %0
+}
+
+; Strict (ordered) fadd reduction of <16 x double> with a -0.0 start value.
+; Without reassociation flags llc must keep the adds in source order, so we
+; expect a serial scalar xsadddp chain over v2..v9, with xxswapd used to get
+; the other doubleword of each vector register (element order differs LE/BE).
+define dso_local double @v16f64(<16 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16f64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs0, v2
+; PWR9LE-NEXT:    xxswapd vs1, v3
+; PWR9LE-NEXT:    xsadddp f0, f0, v2
+; PWR9LE-NEXT:    xsadddp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v4
+; PWR9LE-NEXT:    xsadddp f0, f0, v3
+; PWR9LE-NEXT:    xsadddp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v5
+; PWR9LE-NEXT:    xsadddp f0, f0, v4
+; PWR9LE-NEXT:    xsadddp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v6
+; PWR9LE-NEXT:    xsadddp f0, f0, v5
+; PWR9LE-NEXT:    xsadddp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v7
+; PWR9LE-NEXT:    xsadddp f0, f0, v6
+; PWR9LE-NEXT:    xsadddp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v8
+; PWR9LE-NEXT:    xsadddp f0, f0, v7
+; PWR9LE-NEXT:    xsadddp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v9
+; PWR9LE-NEXT:    xsadddp f0, f0, v8
+; PWR9LE-NEXT:    xsadddp f0, f0, f1
+; PWR9LE-NEXT:    xsadddp f1, f0, v9
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16f64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd vs0, v2
+; PWR9BE-NEXT:    xxswapd vs1, v3
+; PWR9BE-NEXT:    xsadddp f0, v2, f0
+; PWR9BE-NEXT:    xsadddp f0, f0, v3
+; PWR9BE-NEXT:    xsadddp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v4
+; PWR9BE-NEXT:    xsadddp f0, f0, v4
+; PWR9BE-NEXT:    xsadddp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v5
+; PWR9BE-NEXT:    xsadddp f0, f0, v5
+; PWR9BE-NEXT:    xsadddp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v6
+; PWR9BE-NEXT:    xsadddp f0, f0, v6
+; PWR9BE-NEXT:    xsadddp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v7
+; PWR9BE-NEXT:    xsadddp f0, f0, v7
+; PWR9BE-NEXT:    xsadddp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v8
+; PWR9BE-NEXT:    xsadddp f0, f0, v8
+; PWR9BE-NEXT:    xsadddp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v9
+; PWR9BE-NEXT:    xsadddp f0, f0, v9
+; PWR9BE-NEXT:    xsadddp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16f64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs0, v2
+; PWR10LE-NEXT:    xxswapd vs1, v3
+; PWR10LE-NEXT:    xsadddp f0, f0, v2
+; PWR10LE-NEXT:    xsadddp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v4
+; PWR10LE-NEXT:    xsadddp f0, f0, v3
+; PWR10LE-NEXT:    xsadddp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v5
+; PWR10LE-NEXT:    xsadddp f0, f0, v4
+; PWR10LE-NEXT:    xsadddp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v6
+; PWR10LE-NEXT:    xsadddp f0, f0, v5
+; PWR10LE-NEXT:    xsadddp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v7
+; PWR10LE-NEXT:    xsadddp f0, f0, v6
+; PWR10LE-NEXT:    xsadddp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v8
+; PWR10LE-NEXT:    xsadddp f0, f0, v7
+; PWR10LE-NEXT:    xsadddp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v9
+; PWR10LE-NEXT:    xsadddp f0, f0, v8
+; PWR10LE-NEXT:    xsadddp f0, f0, f1
+; PWR10LE-NEXT:    xsadddp f1, f0, v9
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16f64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd vs0, v2
+; PWR10BE-NEXT:    xxswapd vs1, v3
+; PWR10BE-NEXT:    xsadddp f0, v2, f0
+; PWR10BE-NEXT:    xsadddp f0, f0, v3
+; PWR10BE-NEXT:    xsadddp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v4
+; PWR10BE-NEXT:    xsadddp f0, f0, v4
+; PWR10BE-NEXT:    xsadddp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v5
+; PWR10BE-NEXT:    xsadddp f0, f0, v5
+; PWR10BE-NEXT:    xsadddp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v6
+; PWR10BE-NEXT:    xsadddp f0, f0, v6
+; PWR10BE-NEXT:    xsadddp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v7
+; PWR10BE-NEXT:    xsadddp f0, f0, v7
+; PWR10BE-NEXT:    xsadddp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v8
+; PWR10BE-NEXT:    xsadddp f0, f0, v8
+; PWR10BE-NEXT:    xsadddp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v9
+; PWR10BE-NEXT:    xsadddp f0, f0, v9
+; PWR10BE-NEXT:    xsadddp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fadd.v16f64(double -0.000000e+00, <16 x double> %a)
+  ret double %0
+}
+
+; Same ordered <16 x double> reduction, but with a dynamic start value %b
+; (arriving in f1) instead of -0.0: %b must seed the serial xsadddp chain
+; before the first vector element is added.
+define dso_local double @v16f64_b(<16 x double> %a, double %b) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16f64_b:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs0, v2
+; PWR9LE-NEXT:    xsadddp f0, f1, f0
+; PWR9LE-NEXT:    xxswapd vs1, v3
+; PWR9LE-NEXT:    xsadddp f0, f0, v2
+; PWR9LE-NEXT:    xsadddp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v4
+; PWR9LE-NEXT:    xsadddp f0, f0, v3
+; PWR9LE-NEXT:    xsadddp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v5
+; PWR9LE-NEXT:    xsadddp f0, f0, v4
+; PWR9LE-NEXT:    xsadddp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v6
+; PWR9LE-NEXT:    xsadddp f0, f0, v5
+; PWR9LE-NEXT:    xsadddp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v7
+; PWR9LE-NEXT:    xsadddp f0, f0, v6
+; PWR9LE-NEXT:    xsadddp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v8
+; PWR9LE-NEXT:    xsadddp f0, f0, v7
+; PWR9LE-NEXT:    xsadddp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v9
+; PWR9LE-NEXT:    xsadddp f0, f0, v8
+; PWR9LE-NEXT:    xsadddp f0, f0, f1
+; PWR9LE-NEXT:    xsadddp f1, f0, v9
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16f64_b:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xsadddp f0, f1, v2
+; PWR9BE-NEXT:    xxswapd vs1, v2
+; PWR9BE-NEXT:    xsadddp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v3
+; PWR9BE-NEXT:    xsadddp f0, f0, v3
+; PWR9BE-NEXT:    xsadddp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v4
+; PWR9BE-NEXT:    xsadddp f0, f0, v4
+; PWR9BE-NEXT:    xsadddp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v5
+; PWR9BE-NEXT:    xsadddp f0, f0, v5
+; PWR9BE-NEXT:    xsadddp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v6
+; PWR9BE-NEXT:    xsadddp f0, f0, v6
+; PWR9BE-NEXT:    xsadddp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v7
+; PWR9BE-NEXT:    xsadddp f0, f0, v7
+; PWR9BE-NEXT:    xsadddp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v8
+; PWR9BE-NEXT:    xsadddp f0, f0, v8
+; PWR9BE-NEXT:    xsadddp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v9
+; PWR9BE-NEXT:    xsadddp f0, f0, v9
+; PWR9BE-NEXT:    xsadddp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16f64_b:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs0, v2
+; PWR10LE-NEXT:    xsadddp f0, f1, f0
+; PWR10LE-NEXT:    xxswapd vs1, v3
+; PWR10LE-NEXT:    xsadddp f0, f0, v2
+; PWR10LE-NEXT:    xsadddp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v4
+; PWR10LE-NEXT:    xsadddp f0, f0, v3
+; PWR10LE-NEXT:    xsadddp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v5
+; PWR10LE-NEXT:    xsadddp f0, f0, v4
+; PWR10LE-NEXT:    xsadddp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v6
+; PWR10LE-NEXT:    xsadddp f0, f0, v5
+; PWR10LE-NEXT:    xsadddp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v7
+; PWR10LE-NEXT:    xsadddp f0, f0, v6
+; PWR10LE-NEXT:    xsadddp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v8
+; PWR10LE-NEXT:    xsadddp f0, f0, v7
+; PWR10LE-NEXT:    xsadddp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v9
+; PWR10LE-NEXT:    xsadddp f0, f0, v8
+; PWR10LE-NEXT:    xsadddp f0, f0, f1
+; PWR10LE-NEXT:    xsadddp f1, f0, v9
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16f64_b:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xsadddp f0, f1, v2
+; PWR10BE-NEXT:    xxswapd vs1, v2
+; PWR10BE-NEXT:    xsadddp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v3
+; PWR10BE-NEXT:    xsadddp f0, f0, v3
+; PWR10BE-NEXT:    xsadddp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v4
+; PWR10BE-NEXT:    xsadddp f0, f0, v4
+; PWR10BE-NEXT:    xsadddp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v5
+; PWR10BE-NEXT:    xsadddp f0, f0, v5
+; PWR10BE-NEXT:    xsadddp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v6
+; PWR10BE-NEXT:    xsadddp f0, f0, v6
+; PWR10BE-NEXT:    xsadddp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v7
+; PWR10BE-NEXT:    xsadddp f0, f0, v7
+; PWR10BE-NEXT:    xsadddp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v8
+; PWR10BE-NEXT:    xsadddp f0, f0, v8
+; PWR10BE-NEXT:    xsadddp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v9
+; PWR10BE-NEXT:    xsadddp f0, f0, v9
+; PWR10BE-NEXT:    xsadddp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fadd.v16f64(double %b, <16 x double> %a)
+  ret double %0
+}
+
+; Fast-math <16 x double> reduction: with reassociation allowed, llc should
+; build a pairwise xvadddp vector tree (8 -> 4 -> 2 -> 1 registers) plus one
+; final xxswapd+xvadddp horizontal step, instead of a serial scalar chain.
+; The trailing "# kill" comment reflects extracting the scalar result to f1.
+define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16f64_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvadddp vs0, v4, v8
+; PWR9LE-NEXT:    xvadddp vs1, v2, v6
+; PWR9LE-NEXT:    xvadddp vs2, v5, v9
+; PWR9LE-NEXT:    xvadddp vs3, v3, v7
+; PWR9LE-NEXT:    xvadddp vs2, vs3, vs2
+; PWR9LE-NEXT:    xvadddp vs0, vs1, vs0
+; PWR9LE-NEXT:    xvadddp vs0, vs0, vs2
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xvadddp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16f64_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvadddp vs0, v4, v8
+; PWR9BE-NEXT:    xvadddp vs1, v2, v6
+; PWR9BE-NEXT:    xvadddp vs2, v5, v9
+; PWR9BE-NEXT:    xvadddp vs3, v3, v7
+; PWR9BE-NEXT:    xvadddp vs2, vs3, vs2
+; PWR9BE-NEXT:    xvadddp vs0, vs1, vs0
+; PWR9BE-NEXT:    xvadddp vs0, vs0, vs2
+; PWR9BE-NEXT:    xxswapd vs1, vs0
+; PWR9BE-NEXT:    xvadddp vs1, vs0, vs1
+; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16f64_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvadddp vs0, v4, v8
+; PWR10LE-NEXT:    xvadddp vs1, v2, v6
+; PWR10LE-NEXT:    xvadddp vs2, v5, v9
+; PWR10LE-NEXT:    xvadddp vs3, v3, v7
+; PWR10LE-NEXT:    xvadddp vs2, vs3, vs2
+; PWR10LE-NEXT:    xvadddp vs0, vs1, vs0
+; PWR10LE-NEXT:    xvadddp vs0, vs0, vs2
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xvadddp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16f64_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvadddp vs0, v4, v8
+; PWR10BE-NEXT:    xvadddp vs1, v2, v6
+; PWR10BE-NEXT:    xvadddp vs2, v5, v9
+; PWR10BE-NEXT:    xvadddp vs3, v3, v7
+; PWR10BE-NEXT:    xvadddp vs2, vs3, vs2
+; PWR10BE-NEXT:    xvadddp vs0, vs1, vs0
+; PWR10BE-NEXT:    xvadddp vs0, vs0, vs2
+; PWR10BE-NEXT:    xxswapd vs1, vs0
+; PWR10BE-NEXT:    xvadddp vs1, vs0, vs1
+; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast double @llvm.vector.reduce.fadd.v16f64(double -0.000000e+00, <16 x double> %a)
+  ret double %0
+}
+
+; Ordered <32 x double> reduction. Only 12 vector registers (v2..v13) carry
+; arguments; the last four <2 x double> pieces are loaded with lxv from the
+; caller's parameter save area (note the differing LE vs BE stack offsets:
+; 224..272 on LE, 240..288 on BE). The adds still form one serial chain.
+define dso_local double @v32f64(<32 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v32f64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs4, v2
+; PWR9LE-NEXT:    xxswapd vs5, v3
+; PWR9LE-NEXT:    lxv vs3, 224(r1)
+; PWR9LE-NEXT:    lxv vs2, 240(r1)
+; PWR9LE-NEXT:    lxv vs1, 256(r1)
+; PWR9LE-NEXT:    lxv vs0, 272(r1)
+; PWR9LE-NEXT:    xsadddp f4, f4, v2
+; PWR9LE-NEXT:    xsadddp f4, f4, f5
+; PWR9LE-NEXT:    xxswapd vs5, v4
+; PWR9LE-NEXT:    xsadddp f4, f4, v3
+; PWR9LE-NEXT:    xsadddp f4, f4, f5
+; PWR9LE-NEXT:    xxswapd vs5, v5
+; PWR9LE-NEXT:    xsadddp f4, f4, v4
+; PWR9LE-NEXT:    xsadddp f4, f4, f5
+; PWR9LE-NEXT:    xxswapd vs5, v6
+; PWR9LE-NEXT:    xsadddp f4, f4, v5
+; PWR9LE-NEXT:    xsadddp f4, f4, f5
+; PWR9LE-NEXT:    xxswapd vs5, v7
+; PWR9LE-NEXT:    xsadddp f4, f4, v6
+; PWR9LE-NEXT:    xsadddp f4, f4, f5
+; PWR9LE-NEXT:    xxswapd vs5, v8
+; PWR9LE-NEXT:    xsadddp f4, f4, v7
+; PWR9LE-NEXT:    xsadddp f4, f4, f5
+; PWR9LE-NEXT:    xxswapd vs5, v9
+; PWR9LE-NEXT:    xsadddp f4, f4, v8
+; PWR9LE-NEXT:    xsadddp f4, f4, f5
+; PWR9LE-NEXT:    xxswapd vs5, v10
+; PWR9LE-NEXT:    xsadddp f4, f4, v9
+; PWR9LE-NEXT:    xsadddp f4, f4, f5
+; PWR9LE-NEXT:    xxswapd vs5, v11
+; PWR9LE-NEXT:    xsadddp f4, f4, v10
+; PWR9LE-NEXT:    xsadddp f4, f4, f5
+; PWR9LE-NEXT:    xxswapd vs5, v12
+; PWR9LE-NEXT:    xsadddp f4, f4, v11
+; PWR9LE-NEXT:    xsadddp f4, f4, f5
+; PWR9LE-NEXT:    xxswapd vs5, v13
+; PWR9LE-NEXT:    xsadddp f4, f4, v12
+; PWR9LE-NEXT:    xsadddp f4, f4, f5
+; PWR9LE-NEXT:    xxswapd vs5, vs3
+; PWR9LE-NEXT:    xsadddp f4, f4, v13
+; PWR9LE-NEXT:    xsadddp f4, f4, f5
+; PWR9LE-NEXT:    xsadddp f3, f4, f3
+; PWR9LE-NEXT:    xxswapd vs4, vs2
+; PWR9LE-NEXT:    xsadddp f3, f3, f4
+; PWR9LE-NEXT:    xsadddp f2, f3, f2
+; PWR9LE-NEXT:    xxswapd vs3, vs1
+; PWR9LE-NEXT:    xsadddp f2, f2, f3
+; PWR9LE-NEXT:    xsadddp f1, f2, f1
+; PWR9LE-NEXT:    xxswapd vs2, vs0
+; PWR9LE-NEXT:    xsadddp f1, f1, f2
+; PWR9LE-NEXT:    xsadddp f1, f1, f0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v32f64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd vs4, v2
+; PWR9BE-NEXT:    xxswapd vs5, v3
+; PWR9BE-NEXT:    lxv vs3, 240(r1)
+; PWR9BE-NEXT:    lxv vs2, 256(r1)
+; PWR9BE-NEXT:    lxv vs1, 272(r1)
+; PWR9BE-NEXT:    lxv vs0, 288(r1)
+; PWR9BE-NEXT:    xsadddp f4, v2, f4
+; PWR9BE-NEXT:    xsadddp f4, f4, v3
+; PWR9BE-NEXT:    xsadddp f4, f4, f5
+; PWR9BE-NEXT:    xxswapd vs5, v4
+; PWR9BE-NEXT:    xsadddp f4, f4, v4
+; PWR9BE-NEXT:    xsadddp f4, f4, f5
+; PWR9BE-NEXT:    xxswapd vs5, v5
+; PWR9BE-NEXT:    xsadddp f4, f4, v5
+; PWR9BE-NEXT:    xsadddp f4, f4, f5
+; PWR9BE-NEXT:    xxswapd vs5, v6
+; PWR9BE-NEXT:    xsadddp f4, f4, v6
+; PWR9BE-NEXT:    xsadddp f4, f4, f5
+; PWR9BE-NEXT:    xxswapd vs5, v7
+; PWR9BE-NEXT:    xsadddp f4, f4, v7
+; PWR9BE-NEXT:    xsadddp f4, f4, f5
+; PWR9BE-NEXT:    xxswapd vs5, v8
+; PWR9BE-NEXT:    xsadddp f4, f4, v8
+; PWR9BE-NEXT:    xsadddp f4, f4, f5
+; PWR9BE-NEXT:    xxswapd vs5, v9
+; PWR9BE-NEXT:    xsadddp f4, f4, v9
+; PWR9BE-NEXT:    xsadddp f4, f4, f5
+; PWR9BE-NEXT:    xxswapd vs5, v10
+; PWR9BE-NEXT:    xsadddp f4, f4, v10
+; PWR9BE-NEXT:    xsadddp f4, f4, f5
+; PWR9BE-NEXT:    xxswapd vs5, v11
+; PWR9BE-NEXT:    xsadddp f4, f4, v11
+; PWR9BE-NEXT:    xsadddp f4, f4, f5
+; PWR9BE-NEXT:    xxswapd vs5, v12
+; PWR9BE-NEXT:    xsadddp f4, f4, v12
+; PWR9BE-NEXT:    xsadddp f4, f4, f5
+; PWR9BE-NEXT:    xxswapd vs5, v13
+; PWR9BE-NEXT:    xsadddp f4, f4, v13
+; PWR9BE-NEXT:    xsadddp f4, f4, f5
+; PWR9BE-NEXT:    xsadddp f4, f4, f3
+; PWR9BE-NEXT:    xxswapd vs3, vs3
+; PWR9BE-NEXT:    xsadddp f3, f4, f3
+; PWR9BE-NEXT:    xsadddp f3, f3, f2
+; PWR9BE-NEXT:    xxswapd vs2, vs2
+; PWR9BE-NEXT:    xsadddp f2, f3, f2
+; PWR9BE-NEXT:    xsadddp f2, f2, f1
+; PWR9BE-NEXT:    xxswapd vs1, vs1
+; PWR9BE-NEXT:    xsadddp f1, f2, f1
+; PWR9BE-NEXT:    xsadddp f1, f1, f0
+; PWR9BE-NEXT:    xxswapd vs0, vs0
+; PWR9BE-NEXT:    xsadddp f1, f1, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v32f64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs4, v2
+; PWR10LE-NEXT:    xxswapd vs5, v3
+; PWR10LE-NEXT:    lxv vs3, 224(r1)
+; PWR10LE-NEXT:    lxv vs2, 240(r1)
+; PWR10LE-NEXT:    xsadddp f4, f4, v2
+; PWR10LE-NEXT:    lxv vs1, 256(r1)
+; PWR10LE-NEXT:    lxv vs0, 272(r1)
+; PWR10LE-NEXT:    xsadddp f4, f4, f5
+; PWR10LE-NEXT:    xxswapd vs5, v4
+; PWR10LE-NEXT:    xsadddp f4, f4, v3
+; PWR10LE-NEXT:    xsadddp f4, f4, f5
+; PWR10LE-NEXT:    xxswapd vs5, v5
+; PWR10LE-NEXT:    xsadddp f4, f4, v4
+; PWR10LE-NEXT:    xsadddp f4, f4, f5
+; PWR10LE-NEXT:    xxswapd vs5, v6
+; PWR10LE-NEXT:    xsadddp f4, f4, v5
+; PWR10LE-NEXT:    xsadddp f4, f4, f5
+; PWR10LE-NEXT:    xxswapd vs5, v7
+; PWR10LE-NEXT:    xsadddp f4, f4, v6
+; PWR10LE-NEXT:    xsadddp f4, f4, f5
+; PWR10LE-NEXT:    xxswapd vs5, v8
+; PWR10LE-NEXT:    xsadddp f4, f4, v7
+; PWR10LE-NEXT:    xsadddp f4, f4, f5
+; PWR10LE-NEXT:    xxswapd vs5, v9
+; PWR10LE-NEXT:    xsadddp f4, f4, v8
+; PWR10LE-NEXT:    xsadddp f4, f4, f5
+; PWR10LE-NEXT:    xxswapd vs5, v10
+; PWR10LE-NEXT:    xsadddp f4, f4, v9
+; PWR10LE-NEXT:    xsadddp f4, f4, f5
+; PWR10LE-NEXT:    xxswapd vs5, v11
+; PWR10LE-NEXT:    xsadddp f4, f4, v10
+; PWR10LE-NEXT:    xsadddp f4, f4, f5
+; PWR10LE-NEXT:    xxswapd vs5, v12
+; PWR10LE-NEXT:    xsadddp f4, f4, v11
+; PWR10LE-NEXT:    xsadddp f4, f4, f5
+; PWR10LE-NEXT:    xxswapd vs5, v13
+; PWR10LE-NEXT:    xsadddp f4, f4, v12
+; PWR10LE-NEXT:    xsadddp f4, f4, f5
+; PWR10LE-NEXT:    xxswapd vs5, vs3
+; PWR10LE-NEXT:    xsadddp f4, f4, v13
+; PWR10LE-NEXT:    xsadddp f4, f4, f5
+; PWR10LE-NEXT:    xsadddp f3, f4, f3
+; PWR10LE-NEXT:    xxswapd vs4, vs2
+; PWR10LE-NEXT:    xsadddp f3, f3, f4
+; PWR10LE-NEXT:    xsadddp f2, f3, f2
+; PWR10LE-NEXT:    xxswapd vs3, vs1
+; PWR10LE-NEXT:    xsadddp f2, f2, f3
+; PWR10LE-NEXT:    xsadddp f1, f2, f1
+; PWR10LE-NEXT:    xxswapd vs2, vs0
+; PWR10LE-NEXT:    xsadddp f1, f1, f2
+; PWR10LE-NEXT:    xsadddp f1, f1, f0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v32f64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd vs4, v2
+; PWR10BE-NEXT:    xxswapd vs5, v3
+; PWR10BE-NEXT:    lxv vs3, 240(r1)
+; PWR10BE-NEXT:    lxv vs2, 256(r1)
+; PWR10BE-NEXT:    xsadddp f4, v2, f4
+; PWR10BE-NEXT:    lxv vs1, 272(r1)
+; PWR10BE-NEXT:    lxv vs0, 288(r1)
+; PWR10BE-NEXT:    xsadddp f4, f4, v3
+; PWR10BE-NEXT:    xsadddp f4, f4, f5
+; PWR10BE-NEXT:    xxswapd vs5, v4
+; PWR10BE-NEXT:    xsadddp f4, f4, v4
+; PWR10BE-NEXT:    xsadddp f4, f4, f5
+; PWR10BE-NEXT:    xxswapd vs5, v5
+; PWR10BE-NEXT:    xsadddp f4, f4, v5
+; PWR10BE-NEXT:    xsadddp f4, f4, f5
+; PWR10BE-NEXT:    xxswapd vs5, v6
+; PWR10BE-NEXT:    xsadddp f4, f4, v6
+; PWR10BE-NEXT:    xsadddp f4, f4, f5
+; PWR10BE-NEXT:    xxswapd vs5, v7
+; PWR10BE-NEXT:    xsadddp f4, f4, v7
+; PWR10BE-NEXT:    xsadddp f4, f4, f5
+; PWR10BE-NEXT:    xxswapd vs5, v8
+; PWR10BE-NEXT:    xsadddp f4, f4, v8
+; PWR10BE-NEXT:    xsadddp f4, f4, f5
+; PWR10BE-NEXT:    xxswapd vs5, v9
+; PWR10BE-NEXT:    xsadddp f4, f4, v9
+; PWR10BE-NEXT:    xsadddp f4, f4, f5
+; PWR10BE-NEXT:    xxswapd vs5, v10
+; PWR10BE-NEXT:    xsadddp f4, f4, v10
+; PWR10BE-NEXT:    xsadddp f4, f4, f5
+; PWR10BE-NEXT:    xxswapd vs5, v11
+; PWR10BE-NEXT:    xsadddp f4, f4, v11
+; PWR10BE-NEXT:    xsadddp f4, f4, f5
+; PWR10BE-NEXT:    xxswapd vs5, v12
+; PWR10BE-NEXT:    xsadddp f4, f4, v12
+; PWR10BE-NEXT:    xsadddp f4, f4, f5
+; PWR10BE-NEXT:    xxswapd vs5, v13
+; PWR10BE-NEXT:    xsadddp f4, f4, v13
+; PWR10BE-NEXT:    xsadddp f4, f4, f5
+; PWR10BE-NEXT:    xsadddp f4, f4, f3
+; PWR10BE-NEXT:    xxswapd vs3, vs3
+; PWR10BE-NEXT:    xsadddp f3, f4, f3
+; PWR10BE-NEXT:    xsadddp f3, f3, f2
+; PWR10BE-NEXT:    xxswapd vs2, vs2
+; PWR10BE-NEXT:    xsadddp f2, f3, f2
+; PWR10BE-NEXT:    xsadddp f2, f2, f1
+; PWR10BE-NEXT:    xxswapd vs1, vs1
+; PWR10BE-NEXT:    xsadddp f1, f2, f1
+; PWR10BE-NEXT:    xsadddp f1, f1, f0
+; PWR10BE-NEXT:    xxswapd vs0, vs0
+; PWR10BE-NEXT:    xsadddp f1, f1, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fadd.v32f64(double -0.000000e+00, <32 x double> %a)
+  ret double %0
+}
+
+; Ordered <32 x double> reduction with start value %b in f1. The chain
+; accumulates directly into f1; the four stack-passed vector pieces are
+; again loaded with lxv (LE offsets 224..272, BE offsets 240..288).
+define dso_local double @v32f64_b(<32 x double> %a, double %b) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v32f64_b:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs5, v2
+; PWR9LE-NEXT:    lxv vs4, 224(r1)
+; PWR9LE-NEXT:    lxv vs3, 240(r1)
+; PWR9LE-NEXT:    lxv vs2, 256(r1)
+; PWR9LE-NEXT:    lxv vs0, 272(r1)
+; PWR9LE-NEXT:    xsadddp f1, f1, f5
+; PWR9LE-NEXT:    xxswapd vs5, v3
+; PWR9LE-NEXT:    xsadddp f1, f1, v2
+; PWR9LE-NEXT:    xsadddp f1, f1, f5
+; PWR9LE-NEXT:    xxswapd vs5, v4
+; PWR9LE-NEXT:    xsadddp f1, f1, v3
+; PWR9LE-NEXT:    xsadddp f1, f1, f5
+; PWR9LE-NEXT:    xxswapd vs5, v5
+; PWR9LE-NEXT:    xsadddp f1, f1, v4
+; PWR9LE-NEXT:    xsadddp f1, f1, f5
+; PWR9LE-NEXT:    xxswapd vs5, v6
+; PWR9LE-NEXT:    xsadddp f1, f1, v5
+; PWR9LE-NEXT:    xsadddp f1, f1, f5
+; PWR9LE-NEXT:    xxswapd vs5, v7
+; PWR9LE-NEXT:    xsadddp f1, f1, v6
+; PWR9LE-NEXT:    xsadddp f1, f1, f5
+; PWR9LE-NEXT:    xxswapd vs5, v8
+; PWR9LE-NEXT:    xsadddp f1, f1, v7
+; PWR9LE-NEXT:    xsadddp f1, f1, f5
+; PWR9LE-NEXT:    xxswapd vs5, v9
+; PWR9LE-NEXT:    xsadddp f1, f1, v8
+; PWR9LE-NEXT:    xsadddp f1, f1, f5
+; PWR9LE-NEXT:    xxswapd vs5, v10
+; PWR9LE-NEXT:    xsadddp f1, f1, v9
+; PWR9LE-NEXT:    xsadddp f1, f1, f5
+; PWR9LE-NEXT:    xxswapd vs5, v11
+; PWR9LE-NEXT:    xsadddp f1, f1, v10
+; PWR9LE-NEXT:    xsadddp f1, f1, f5
+; PWR9LE-NEXT:    xxswapd vs5, v12
+; PWR9LE-NEXT:    xsadddp f1, f1, v11
+; PWR9LE-NEXT:    xsadddp f1, f1, f5
+; PWR9LE-NEXT:    xxswapd vs5, v13
+; PWR9LE-NEXT:    xsadddp f1, f1, v12
+; PWR9LE-NEXT:    xsadddp f1, f1, f5
+; PWR9LE-NEXT:    xxswapd vs5, vs4
+; PWR9LE-NEXT:    xsadddp f1, f1, v13
+; PWR9LE-NEXT:    xsadddp f1, f1, f5
+; PWR9LE-NEXT:    xsadddp f1, f1, f4
+; PWR9LE-NEXT:    xxswapd vs4, vs3
+; PWR9LE-NEXT:    xsadddp f1, f1, f4
+; PWR9LE-NEXT:    xsadddp f1, f1, f3
+; PWR9LE-NEXT:    xxswapd vs3, vs2
+; PWR9LE-NEXT:    xsadddp f1, f1, f3
+; PWR9LE-NEXT:    xsadddp f1, f1, f2
+; PWR9LE-NEXT:    xxswapd vs2, vs0
+; PWR9LE-NEXT:    xsadddp f1, f1, f2
+; PWR9LE-NEXT:    xsadddp f1, f1, f0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v32f64_b:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xsadddp f1, f1, v2
+; PWR9BE-NEXT:    xxswapd vs5, v2
+; PWR9BE-NEXT:    lxv vs4, 240(r1)
+; PWR9BE-NEXT:    lxv vs3, 256(r1)
+; PWR9BE-NEXT:    lxv vs2, 272(r1)
+; PWR9BE-NEXT:    lxv vs0, 288(r1)
+; PWR9BE-NEXT:    xsadddp f1, f1, f5
+; PWR9BE-NEXT:    xxswapd vs5, v3
+; PWR9BE-NEXT:    xsadddp f1, f1, v3
+; PWR9BE-NEXT:    xsadddp f1, f1, f5
+; PWR9BE-NEXT:    xxswapd vs5, v4
+; PWR9BE-NEXT:    xsadddp f1, f1, v4
+; PWR9BE-NEXT:    xsadddp f1, f1, f5
+; PWR9BE-NEXT:    xxswapd vs5, v5
+; PWR9BE-NEXT:    xsadddp f1, f1, v5
+; PWR9BE-NEXT:    xsadddp f1, f1, f5
+; PWR9BE-NEXT:    xxswapd vs5, v6
+; PWR9BE-NEXT:    xsadddp f1, f1, v6
+; PWR9BE-NEXT:    xsadddp f1, f1, f5
+; PWR9BE-NEXT:    xxswapd vs5, v7
+; PWR9BE-NEXT:    xsadddp f1, f1, v7
+; PWR9BE-NEXT:    xsadddp f1, f1, f5
+; PWR9BE-NEXT:    xxswapd vs5, v8
+; PWR9BE-NEXT:    xsadddp f1, f1, v8
+; PWR9BE-NEXT:    xsadddp f1, f1, f5
+; PWR9BE-NEXT:    xxswapd vs5, v9
+; PWR9BE-NEXT:    xsadddp f1, f1, v9
+; PWR9BE-NEXT:    xsadddp f1, f1, f5
+; PWR9BE-NEXT:    xxswapd vs5, v10
+; PWR9BE-NEXT:    xsadddp f1, f1, v10
+; PWR9BE-NEXT:    xsadddp f1, f1, f5
+; PWR9BE-NEXT:    xxswapd vs5, v11
+; PWR9BE-NEXT:    xsadddp f1, f1, v11
+; PWR9BE-NEXT:    xsadddp f1, f1, f5
+; PWR9BE-NEXT:    xxswapd vs5, v12
+; PWR9BE-NEXT:    xsadddp f1, f1, v12
+; PWR9BE-NEXT:    xsadddp f1, f1, f5
+; PWR9BE-NEXT:    xxswapd vs5, v13
+; PWR9BE-NEXT:    xsadddp f1, f1, v13
+; PWR9BE-NEXT:    xsadddp f1, f1, f5
+; PWR9BE-NEXT:    xsadddp f1, f1, f4
+; PWR9BE-NEXT:    xxswapd vs4, vs4
+; PWR9BE-NEXT:    xsadddp f1, f1, f4
+; PWR9BE-NEXT:    xsadddp f1, f1, f3
+; PWR9BE-NEXT:    xxswapd vs3, vs3
+; PWR9BE-NEXT:    xsadddp f1, f1, f3
+; PWR9BE-NEXT:    xsadddp f1, f1, f2
+; PWR9BE-NEXT:    xxswapd vs2, vs2
+; PWR9BE-NEXT:    xsadddp f1, f1, f2
+; PWR9BE-NEXT:    xsadddp f1, f1, f0
+; PWR9BE-NEXT:    xxswapd vs0, vs0
+; PWR9BE-NEXT:    xsadddp f1, f1, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v32f64_b:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs5, v2
+; PWR10LE-NEXT:    lxv vs4, 224(r1)
+; PWR10LE-NEXT:    lxv vs3, 240(r1)
+; PWR10LE-NEXT:    xsadddp f1, f1, f5
+; PWR10LE-NEXT:    xxswapd vs5, v3
+; PWR10LE-NEXT:    lxv vs2, 256(r1)
+; PWR10LE-NEXT:    lxv vs0, 272(r1)
+; PWR10LE-NEXT:    xsadddp f1, f1, v2
+; PWR10LE-NEXT:    xsadddp f1, f1, f5
+; PWR10LE-NEXT:    xxswapd vs5, v4
+; PWR10LE-NEXT:    xsadddp f1, f1, v3
+; PWR10LE-NEXT:    xsadddp f1, f1, f5
+; PWR10LE-NEXT:    xxswapd vs5, v5
+; PWR10LE-NEXT:    xsadddp f1, f1, v4
+; PWR10LE-NEXT:    xsadddp f1, f1, f5
+; PWR10LE-NEXT:    xxswapd vs5, v6
+; PWR10LE-NEXT:    xsadddp f1, f1, v5
+; PWR10LE-NEXT:    xsadddp f1, f1, f5
+; PWR10LE-NEXT:    xxswapd vs5, v7
+; PWR10LE-NEXT:    xsadddp f1, f1, v6
+; PWR10LE-NEXT:    xsadddp f1, f1, f5
+; PWR10LE-NEXT:    xxswapd vs5, v8
+; PWR10LE-NEXT:    xsadddp f1, f1, v7
+; PWR10LE-NEXT:    xsadddp f1, f1, f5
+; PWR10LE-NEXT:    xxswapd vs5, v9
+; PWR10LE-NEXT:    xsadddp f1, f1, v8
+; PWR10LE-NEXT:    xsadddp f1, f1, f5
+; PWR10LE-NEXT:    xxswapd vs5, v10
+; PWR10LE-NEXT:    xsadddp f1, f1, v9
+; PWR10LE-NEXT:    xsadddp f1, f1, f5
+; PWR10LE-NEXT:    xxswapd vs5, v11
+; PWR10LE-NEXT:    xsadddp f1, f1, v10
+; PWR10LE-NEXT:    xsadddp f1, f1, f5
+; PWR10LE-NEXT:    xxswapd vs5, v12
+; PWR10LE-NEXT:    xsadddp f1, f1, v11
+; PWR10LE-NEXT:    xsadddp f1, f1, f5
+; PWR10LE-NEXT:    xxswapd vs5, v13
+; PWR10LE-NEXT:    xsadddp f1, f1, v12
+; PWR10LE-NEXT:    xsadddp f1, f1, f5
+; PWR10LE-NEXT:    xxswapd vs5, vs4
+; PWR10LE-NEXT:    xsadddp f1, f1, v13
+; PWR10LE-NEXT:    xsadddp f1, f1, f5
+; PWR10LE-NEXT:    xsadddp f1, f1, f4
+; PWR10LE-NEXT:    xxswapd vs4, vs3
+; PWR10LE-NEXT:    xsadddp f1, f1, f4
+; PWR10LE-NEXT:    xsadddp f1, f1, f3
+; PWR10LE-NEXT:    xxswapd vs3, vs2
+; PWR10LE-NEXT:    xsadddp f1, f1, f3
+; PWR10LE-NEXT:    xsadddp f1, f1, f2
+; PWR10LE-NEXT:    xxswapd vs2, vs0
+; PWR10LE-NEXT:    xsadddp f1, f1, f2
+; PWR10LE-NEXT:    xsadddp f1, f1, f0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v32f64_b:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xsadddp f1, f1, v2
+; PWR10BE-NEXT:    xxswapd vs5, v2
+; PWR10BE-NEXT:    lxv vs4, 240(r1)
+; PWR10BE-NEXT:    lxv vs3, 256(r1)
+; PWR10BE-NEXT:    xsadddp f1, f1, f5
+; PWR10BE-NEXT:    xxswapd vs5, v3
+; PWR10BE-NEXT:    lxv vs2, 272(r1)
+; PWR10BE-NEXT:    lxv vs0, 288(r1)
+; PWR10BE-NEXT:    xsadddp f1, f1, v3
+; PWR10BE-NEXT:    xsadddp f1, f1, f5
+; PWR10BE-NEXT:    xxswapd vs5, v4
+; PWR10BE-NEXT:    xsadddp f1, f1, v4
+; PWR10BE-NEXT:    xsadddp f1, f1, f5
+; PWR10BE-NEXT:    xxswapd vs5, v5
+; PWR10BE-NEXT:    xsadddp f1, f1, v5
+; PWR10BE-NEXT:    xsadddp f1, f1, f5
+; PWR10BE-NEXT:    xxswapd vs5, v6
+; PWR10BE-NEXT:    xsadddp f1, f1, v6
+; PWR10BE-NEXT:    xsadddp f1, f1, f5
+; PWR10BE-NEXT:    xxswapd vs5, v7
+; PWR10BE-NEXT:    xsadddp f1, f1, v7
+; PWR10BE-NEXT:    xsadddp f1, f1, f5
+; PWR10BE-NEXT:    xxswapd vs5, v8
+; PWR10BE-NEXT:    xsadddp f1, f1, v8
+; PWR10BE-NEXT:    xsadddp f1, f1, f5
+; PWR10BE-NEXT:    xxswapd vs5, v9
+; PWR10BE-NEXT:    xsadddp f1, f1, v9
+; PWR10BE-NEXT:    xsadddp f1, f1, f5
+; PWR10BE-NEXT:    xxswapd vs5, v10
+; PWR10BE-NEXT:    xsadddp f1, f1, v10
+; PWR10BE-NEXT:    xsadddp f1, f1, f5
+; PWR10BE-NEXT:    xxswapd vs5, v11
+; PWR10BE-NEXT:    xsadddp f1, f1, v11
+; PWR10BE-NEXT:    xsadddp f1, f1, f5
+; PWR10BE-NEXT:    xxswapd vs5, v12
+; PWR10BE-NEXT:    xsadddp f1, f1, v12
+; PWR10BE-NEXT:    xsadddp f1, f1, f5
+; PWR10BE-NEXT:    xxswapd vs5, v13
+; PWR10BE-NEXT:    xsadddp f1, f1, v13
+; PWR10BE-NEXT:    xsadddp f1, f1, f5
+; PWR10BE-NEXT:    xsadddp f1, f1, f4
+; PWR10BE-NEXT:    xxswapd vs4, vs4
+; PWR10BE-NEXT:    xsadddp f1, f1, f4
+; PWR10BE-NEXT:    xsadddp f1, f1, f3
+; PWR10BE-NEXT:    xxswapd vs3, vs3
+; PWR10BE-NEXT:    xsadddp f1, f1, f3
+; PWR10BE-NEXT:    xsadddp f1, f1, f2
+; PWR10BE-NEXT:    xxswapd vs2, vs2
+; PWR10BE-NEXT:    xsadddp f1, f1, f2
+; PWR10BE-NEXT:    xsadddp f1, f1, f0
+; PWR10BE-NEXT:    xxswapd vs0, vs0
+; PWR10BE-NEXT:    xsadddp f1, f1, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fadd.v32f64(double %b, <32 x double> %a)
+  ret double %0
+}
+
+; Fast-math <32 x double> reduction: the stack-passed halves are folded into
+; the register-passed ones with xvadddp, then a pairwise vector tree reduces
+; to one register, finished by xxswapd + xvadddp and extraction into f1.
+define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v32f64_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    lxv vs0, 256(r1)
+; PWR9LE-NEXT:    lxv vs1, 224(r1)
+; PWR9LE-NEXT:    lxv vs2, 272(r1)
+; PWR9LE-NEXT:    lxv vs3, 240(r1)
+; PWR9LE-NEXT:    xvadddp vs4, v3, v11
+; PWR9LE-NEXT:    xvadddp vs5, v5, v13
+; PWR9LE-NEXT:    xvadddp vs6, v2, v10
+; PWR9LE-NEXT:    xvadddp vs7, v4, v12
+; PWR9LE-NEXT:    xvadddp vs3, v7, vs3
+; PWR9LE-NEXT:    xvadddp vs2, v9, vs2
+; PWR9LE-NEXT:    xvadddp vs1, v6, vs1
+; PWR9LE-NEXT:    xvadddp vs0, v8, vs0
+; PWR9LE-NEXT:    xvadddp vs0, vs7, vs0
+; PWR9LE-NEXT:    xvadddp vs1, vs6, vs1
+; PWR9LE-NEXT:    xvadddp vs2, vs5, vs2
+; PWR9LE-NEXT:    xvadddp vs3, vs4, vs3
+; PWR9LE-NEXT:    xvadddp vs2, vs3, vs2
+; PWR9LE-NEXT:    xvadddp vs0, vs1, vs0
+; PWR9LE-NEXT:    xvadddp vs0, vs0, vs2
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xvadddp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v32f64_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    lxv vs0, 272(r1)
+; PWR9BE-NEXT:    lxv vs1, 240(r1)
+; PWR9BE-NEXT:    lxv vs2, 288(r1)
+; PWR9BE-NEXT:    lxv vs3, 256(r1)
+; PWR9BE-NEXT:    xvadddp vs4, v3, v11
+; PWR9BE-NEXT:    xvadddp vs5, v5, v13
+; PWR9BE-NEXT:    xvadddp vs6, v2, v10
+; PWR9BE-NEXT:    xvadddp vs7, v4, v12
+; PWR9BE-NEXT:    xvadddp vs3, v7, vs3
+; PWR9BE-NEXT:    xvadddp vs2, v9, vs2
+; PWR9BE-NEXT:    xvadddp vs1, v6, vs1
+; PWR9BE-NEXT:    xvadddp vs0, v8, vs0
+; PWR9BE-NEXT:    xvadddp vs0, vs7, vs0
+; PWR9BE-NEXT:    xvadddp vs1, vs6, vs1
+; PWR9BE-NEXT:    xvadddp vs2, vs5, vs2
+; PWR9BE-NEXT:    xvadddp vs3, vs4, vs3
+; PWR9BE-NEXT:    xvadddp vs2, vs3, vs2
+; PWR9BE-NEXT:    xvadddp vs0, vs1, vs0
+; PWR9BE-NEXT:    xvadddp vs0, vs0, vs2
+; PWR9BE-NEXT:    xxswapd vs1, vs0
+; PWR9BE-NEXT:    xvadddp vs1, vs0, vs1
+; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v32f64_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    lxv vs0, 256(r1)
+; PWR10LE-NEXT:    lxv vs1, 224(r1)
+; PWR10LE-NEXT:    xvadddp vs4, v3, v11
+; PWR10LE-NEXT:    xvadddp vs5, v5, v13
+; PWR10LE-NEXT:    xvadddp vs6, v2, v10
+; PWR10LE-NEXT:    xvadddp vs7, v4, v12
+; PWR10LE-NEXT:    xvadddp vs1, v6, vs1
+; PWR10LE-NEXT:    lxv vs2, 272(r1)
+; PWR10LE-NEXT:    lxv vs3, 240(r1)
+; PWR10LE-NEXT:    xvadddp vs3, v7, vs3
+; PWR10LE-NEXT:    xvadddp vs2, v9, vs2
+; PWR10LE-NEXT:    xvadddp vs0, v8, vs0
+; PWR10LE-NEXT:    xvadddp vs0, vs7, vs0
+; PWR10LE-NEXT:    xvadddp vs1, vs6, vs1
+; PWR10LE-NEXT:    xvadddp vs2, vs5, vs2
+; PWR10LE-NEXT:    xvadddp vs3, vs4, vs3
+; PWR10LE-NEXT:    xvadddp vs2, vs3, vs2
+; PWR10LE-NEXT:    xvadddp vs0, vs1, vs0
+; PWR10LE-NEXT:    xvadddp vs0, vs0, vs2
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xvadddp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v32f64_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    lxv vs0, 272(r1)
+; PWR10BE-NEXT:    lxv vs1, 240(r1)
+; PWR10BE-NEXT:    xvadddp vs4, v3, v11
+; PWR10BE-NEXT:    xvadddp vs5, v5, v13
+; PWR10BE-NEXT:    xvadddp vs6, v2, v10
+; PWR10BE-NEXT:    xvadddp vs7, v4, v12
+; PWR10BE-NEXT:    xvadddp vs1, v6, vs1
+; PWR10BE-NEXT:    lxv vs2, 288(r1)
+; PWR10BE-NEXT:    lxv vs3, 256(r1)
+; PWR10BE-NEXT:    xvadddp vs3, v7, vs3
+; PWR10BE-NEXT:    xvadddp vs2, v9, vs2
+; PWR10BE-NEXT:    xvadddp vs0, v8, vs0
+; PWR10BE-NEXT:    xvadddp vs0, vs7, vs0
+; PWR10BE-NEXT:    xvadddp vs1, vs6, vs1
+; PWR10BE-NEXT:    xvadddp vs2, vs5, vs2
+; PWR10BE-NEXT:    xvadddp vs3, vs4, vs3
+; PWR10BE-NEXT:    xvadddp vs2, vs3, vs2
+; PWR10BE-NEXT:    xvadddp vs0, vs1, vs0
+; PWR10BE-NEXT:    xvadddp vs0, vs0, vs2
+; PWR10BE-NEXT:    xxswapd vs1, vs0
+; PWR10BE-NEXT:    xvadddp vs1, vs0, vs1
+; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast double @llvm.vector.reduce.fadd.v32f64(double -0.000000e+00, <32 x double> %a)
+  ret double %0
+}
+
+define dso_local double @v64f64(<64 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v64f64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v18, v2
+; PWR9LE-NEXT:    lxv v17, 224(r1)
+; PWR9LE-NEXT:    lxv v16, 240(r1)
+; PWR9LE-NEXT:    lxv v15, 256(r1)
+; PWR9LE-NEXT:    lxv v14, 272(r1)
+; PWR9LE-NEXT:    xsadddp v2, v18, v2
+; PWR9LE-NEXT:    xxswapd v18, v3
+; PWR9LE-NEXT:    lxv v1, 288(r1)
+; PWR9LE-NEXT:    lxv v0, 304(r1)
+; PWR9LE-NEXT:    lxv vs13, 320(r1)
+; PWR9LE-NEXT:    lxv vs12, 336(r1)
+; PWR9LE-NEXT:    lxv vs11, 352(r1)
+; PWR9LE-NEXT:    lxv vs10, 368(r1)
+; PWR9LE-NEXT:    lxv vs9, 384(r1)
+; PWR9LE-NEXT:    lxv vs8, 400(r1)
+; PWR9LE-NEXT:    lxv vs7, 416(r1)
+; PWR9LE-NEXT:    lxv vs6, 432(r1)
+; PWR9LE-NEXT:    lxv vs5, 448(r1)
+; PWR9LE-NEXT:    lxv vs4, 464(r1)
+; PWR9LE-NEXT:    xsadddp v2, v2, v18
+; PWR9LE-NEXT:    lxv vs3, 480(r1)
+; PWR9LE-NEXT:    lxv vs2, 496(r1)
+; PWR9LE-NEXT:    lxv vs1, 512(r1)
+; PWR9LE-NEXT:    lxv vs0, 528(r1)
+; PWR9LE-NEXT:    xsadddp v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v4
+; PWR9LE-NEXT:    xsadddp v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v5
+; PWR9LE-NEXT:    xsadddp v2, v2, v4
+; PWR9LE-NEXT:    xsadddp v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v6
+; PWR9LE-NEXT:    xsadddp v2, v2, v5
+; PWR9LE-NEXT:    xsadddp v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v7
+; PWR9LE-NEXT:    xsadddp v2, v2, v6
+; PWR9LE-NEXT:    xsadddp v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v8
+; PWR9LE-NEXT:    xsadddp v2, v2, v7
+; PWR9LE-NEXT:    xsadddp v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v9
+; PWR9LE-NEXT:    xsadddp v2, v2, v8
+; PWR9LE-NEXT:    xsadddp v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v10
+; PWR9LE-NEXT:    xsadddp v2, v2, v9
+; PWR9LE-NEXT:    xsadddp v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v11
+; PWR9LE-NEXT:    xsadddp v2, v2, v10
+; PWR9LE-NEXT:    xsadddp v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v12
+; PWR9LE-NEXT:    xsadddp v2, v2, v11
+; PWR9LE-NEXT:    xsadddp v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v13
+; PWR9LE-NEXT:    xsadddp v2, v2, v12
+; PWR9LE-NEXT:    xsadddp v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v17
+; PWR9LE-NEXT:    xsadddp v2, v2, v13
+; PWR9LE-NEXT:    xsadddp v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v16
+; PWR9LE-NEXT:    xsadddp v2, v2, v17
+; PWR9LE-NEXT:    xsadddp v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v15
+; PWR9LE-NEXT:    xsadddp v2, v2, v16
+; PWR9LE-NEXT:    xsadddp v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v14
+; PWR9LE-NEXT:    xsadddp v2, v2, v15
+; PWR9LE-NEXT:    xsadddp v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v1
+; PWR9LE-NEXT:    xsadddp v2, v2, v14
+; PWR9LE-NEXT:    xsadddp v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v0
+; PWR9LE-NEXT:    xsadddp v2, v2, v1
+; PWR9LE-NEXT:    xsadddp v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, vs13
+; PWR9LE-NEXT:    xsadddp v2, v2, v0
+; PWR9LE-NEXT:    xsadddp v2, v2, v3
+; PWR9LE-NEXT:    xsadddp f13, v2, f13
+; PWR9LE-NEXT:    xxswapd v2, vs12
+; PWR9LE-NEXT:    xsadddp f13, f13, v2
+; PWR9LE-NEXT:    xsadddp f12, f13, f12
+; PWR9LE-NEXT:    xxswapd vs13, vs11
+; PWR9LE-NEXT:    xsadddp f12, f12, f13
+; PWR9LE-NEXT:    xsadddp f11, f12, f11
+; PWR9LE-NEXT:    xxswapd vs12, vs10
+; PWR9LE-NEXT:    xsadddp f11, f11, f12
+; PWR9LE-NEXT:    xsadddp f10, f11, f10
+; PWR9LE-NEXT:    xxswapd vs11, vs9
+; PWR9LE-NEXT:    xsadddp f10, f10, f11
+; PWR9LE-NEXT:    xsadddp f9, f10, f9
+; PWR9LE-NEXT:    xxswapd vs10, vs8
+; PWR9LE-NEXT:    xsadddp f9, f9, f10
+; PWR9LE-NEXT:    xsadddp f8, f9, f8
+; PWR9LE-NEXT:    xxswapd vs9, vs7
+; PWR9LE-NEXT:    xsadddp f8, f8, f9
+; PWR9LE-NEXT:    xsadddp f7, f8, f7
+; PWR9LE-NEXT:    xxswapd vs8, vs6
+; PWR9LE-NEXT:    xsadddp f7, f7, f8
+; PWR9LE-NEXT:    xsadddp f6, f7, f6
+; PWR9LE-NEXT:    xxswapd vs7, vs5
+; PWR9LE-NEXT:    xsadddp f6, f6, f7
+; PWR9LE-NEXT:    xsadddp f5, f6, f5
+; PWR9LE-NEXT:    xxswapd vs6, vs4
+; PWR9LE-NEXT:    xsadddp f5, f5, f6
+; PWR9LE-NEXT:    xsadddp f4, f5, f4
+; PWR9LE-NEXT:    xxswapd vs5, vs3
+; PWR9LE-NEXT:    xsadddp f4, f4, f5
+; PWR9LE-NEXT:    xsadddp f3, f4, f3
+; PWR9LE-NEXT:    xxswapd vs4, vs2
+; PWR9LE-NEXT:    xsadddp f3, f3, f4
+; PWR9LE-NEXT:    xsadddp f2, f3, f2
+; PWR9LE-NEXT:    xxswapd vs3, vs1
+; PWR9LE-NEXT:    xsadddp f2, f2, f3
+; PWR9LE-NEXT:    xsadddp f1, f2, f1
+; PWR9LE-NEXT:    xxswapd vs2, vs0
+; PWR9LE-NEXT:    xsadddp f1, f1, f2
+; PWR9LE-NEXT:    xsadddp f1, f1, f0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v64f64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v18, v2
+; PWR9BE-NEXT:    lxv v17, 240(r1)
+; PWR9BE-NEXT:    lxv v16, 256(r1)
+; PWR9BE-NEXT:    lxv v15, 272(r1)
+; PWR9BE-NEXT:    lxv v14, 288(r1)
+; PWR9BE-NEXT:    xsadddp v2, v2, v18
+; PWR9BE-NEXT:    lxv v1, 304(r1)
+; PWR9BE-NEXT:    lxv v0, 320(r1)
+; PWR9BE-NEXT:    lxv vs13, 336(r1)
+; PWR9BE-NEXT:    lxv vs12, 352(r1)
+; PWR9BE-NEXT:    lxv vs11, 368(r1)
+; PWR9BE-NEXT:    lxv vs10, 384(r1)
+; PWR9BE-NEXT:    lxv vs9, 400(r1)
+; PWR9BE-NEXT:    lxv vs8, 416(r1)
+; PWR9BE-NEXT:    lxv vs7, 432(r1)
+; PWR9BE-NEXT:    lxv vs6, 448(r1)
+; PWR9BE-NEXT:    lxv vs5, 464(r1)
+; PWR9BE-NEXT:    lxv vs4, 480(r1)
+; PWR9BE-NEXT:    xsadddp v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v3
+; PWR9BE-NEXT:    lxv vs3, 496(r1)
+; PWR9BE-NEXT:    lxv vs2, 512(r1)
+; PWR9BE-NEXT:    lxv vs1, 528(r1)
+; PWR9BE-NEXT:    lxv vs0, 544(r1)
+; PWR9BE-NEXT:    xsadddp v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v4
+; PWR9BE-NEXT:    xsadddp v2, v2, v4
+; PWR9BE-NEXT:    xsadddp v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v5
+; PWR9BE-NEXT:    xsadddp v2, v2, v5
+; PWR9BE-NEXT:    xsadddp v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v6
+; PWR9BE-NEXT:    xsadddp v2, v2, v6
+; PWR9BE-NEXT:    xsadddp v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v7
+; PWR9BE-NEXT:    xsadddp v2, v2, v7
+; PWR9BE-NEXT:    xsadddp v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v8
+; PWR9BE-NEXT:    xsadddp v2, v2, v8
+; PWR9BE-NEXT:    xsadddp v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v9
+; PWR9BE-NEXT:    xsadddp v2, v2, v9
+; PWR9BE-NEXT:    xsadddp v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v10
+; PWR9BE-NEXT:    xsadddp v2, v2, v10
+; PWR9BE-NEXT:    xsadddp v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v11
+; PWR9BE-NEXT:    xsadddp v2, v2, v11
+; PWR9BE-NEXT:    xsadddp v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v12
+; PWR9BE-NEXT:    xsadddp v2, v2, v12
+; PWR9BE-NEXT:    xsadddp v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v13
+; PWR9BE-NEXT:    xsadddp v2, v2, v13
+; PWR9BE-NEXT:    xsadddp v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v17
+; PWR9BE-NEXT:    xsadddp v2, v2, v17
+; PWR9BE-NEXT:    xsadddp v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v16
+; PWR9BE-NEXT:    xsadddp v2, v2, v16
+; PWR9BE-NEXT:    xsadddp v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v15
+; PWR9BE-NEXT:    xsadddp v2, v2, v15
+; PWR9BE-NEXT:    xsadddp v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v14
+; PWR9BE-NEXT:    xsadddp v2, v2, v14
+; PWR9BE-NEXT:    xsadddp v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v1
+; PWR9BE-NEXT:    xsadddp v2, v2, v1
+; PWR9BE-NEXT:    xsadddp v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v0
+; PWR9BE-NEXT:    xsadddp v2, v2, v0
+; PWR9BE-NEXT:    xsadddp v2, v2, v3
+; PWR9BE-NEXT:    xsadddp v2, v2, f13
+; PWR9BE-NEXT:    xxswapd vs13, vs13
+; PWR9BE-NEXT:    xsadddp f13, v2, f13
+; PWR9BE-NEXT:    xsadddp f13, f13, f12
+; PWR9BE-NEXT:    xxswapd vs12, vs12
+; PWR9BE-NEXT:    xsadddp f12, f13, f12
+; PWR9BE-NEXT:    xsadddp f12, f12, f11
+; PWR9BE-NEXT:    xxswapd vs11, vs11
+; PWR9BE-NEXT:    xsadddp f11, f12, f11
+; PWR9BE-NEXT:    xsadddp f11, f11, f10
+; PWR9BE-NEXT:    xxswapd vs10, vs10
+; PWR9BE-NEXT:    xsadddp f10, f11, f10
+; PWR9BE-NEXT:    xsadddp f10, f10, f9
+; PWR9BE-NEXT:    xxswapd vs9, vs9
+; PWR9BE-NEXT:    xsadddp f9, f10, f9
+; PWR9BE-NEXT:    xsadddp f9, f9, f8
+; PWR9BE-NEXT:    xxswapd vs8, vs8
+; PWR9BE-NEXT:    xsadddp f8, f9, f8
+; PWR9BE-NEXT:    xsadddp f8, f8, f7
+; PWR9BE-NEXT:    xxswapd vs7, vs7
+; PWR9BE-NEXT:    xsadddp f7, f8, f7
+; PWR9BE-NEXT:    xsadddp f7, f7, f6
+; PWR9BE-NEXT:    xxswapd vs6, vs6
+; PWR9BE-NEXT:    xsadddp f6, f7, f6
+; PWR9BE-NEXT:    xsadddp f6, f6, f5
+; PWR9BE-NEXT:    xxswapd vs5, vs5
+; PWR9BE-NEXT:    xsadddp f5, f6, f5
+; PWR9BE-NEXT:    xsadddp f5, f5, f4
+; PWR9BE-NEXT:    xxswapd vs4, vs4
+; PWR9BE-NEXT:    xsadddp f4, f5, f4
+; PWR9BE-NEXT:    xsadddp f4, f4, f3
+; PWR9BE-NEXT:    xxswapd vs3, vs3
+; PWR9BE-NEXT:    xsadddp f3, f4, f3
+; PWR9BE-NEXT:    xsadddp f3, f3, f2
+; PWR9BE-NEXT:    xxswapd vs2, vs2
+; PWR9BE-NEXT:    xsadddp f2, f3, f2
+; PWR9BE-NEXT:    xsadddp f2, f2, f1
+; PWR9BE-NEXT:    xxswapd vs1, vs1
+; PWR9BE-NEXT:    xsadddp f1, f2, f1
+; PWR9BE-NEXT:    xsadddp f1, f1, f0
+; PWR9BE-NEXT:    xxswapd vs0, vs0
+; PWR9BE-NEXT:    xsadddp f1, f1, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v64f64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v18, v2
+; PWR10LE-NEXT:    lxv v17, 224(r1)
+; PWR10LE-NEXT:    lxv v16, 240(r1)
+; PWR10LE-NEXT:    xsadddp v2, v18, v2
+; PWR10LE-NEXT:    xxswapd v18, v3
+; PWR10LE-NEXT:    lxv v15, 256(r1)
+; PWR10LE-NEXT:    lxv v14, 272(r1)
+; PWR10LE-NEXT:    lxv v1, 288(r1)
+; PWR10LE-NEXT:    lxv v0, 304(r1)
+; PWR10LE-NEXT:    lxv vs13, 320(r1)
+; PWR10LE-NEXT:    lxv vs12, 336(r1)
+; PWR10LE-NEXT:    lxv vs11, 352(r1)
+; PWR10LE-NEXT:    lxv vs10, 368(r1)
+; PWR10LE-NEXT:    xsadddp v2, v2, v18
+; PWR10LE-NEXT:    lxv vs9, 384(r1)
+; PWR10LE-NEXT:    lxv vs8, 400(r1)
+; PWR10LE-NEXT:    lxv vs7, 416(r1)
+; PWR10LE-NEXT:    lxv vs6, 432(r1)
+; PWR10LE-NEXT:    lxv vs5, 448(r1)
+; PWR10LE-NEXT:    lxv vs4, 464(r1)
+; PWR10LE-NEXT:    lxv vs3, 480(r1)
+; PWR10LE-NEXT:    lxv vs2, 496(r1)
+; PWR10LE-NEXT:    lxv vs1, 512(r1)
+; PWR10LE-NEXT:    lxv vs0, 528(r1)
+; PWR10LE-NEXT:    xsadddp v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v4
+; PWR10LE-NEXT:    xsadddp v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v5
+; PWR10LE-NEXT:    xsadddp v2, v2, v4
+; PWR10LE-NEXT:    xsadddp v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v6
+; PWR10LE-NEXT:    xsadddp v2, v2, v5
+; PWR10LE-NEXT:    xsadddp v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v7
+; PWR10LE-NEXT:    xsadddp v2, v2, v6
+; PWR10LE-NEXT:    xsadddp v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v8
+; PWR10LE-NEXT:    xsadddp v2, v2, v7
+; PWR10LE-NEXT:    xsadddp v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v9
+; PWR10LE-NEXT:    xsadddp v2, v2, v8
+; PWR10LE-NEXT:    xsadddp v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v10
+; PWR10LE-NEXT:    xsadddp v2, v2, v9
+; PWR10LE-NEXT:    xsadddp v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v11
+; PWR10LE-NEXT:    xsadddp v2, v2, v10
+; PWR10LE-NEXT:    xsadddp v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v12
+; PWR10LE-NEXT:    xsadddp v2, v2, v11
+; PWR10LE-NEXT:    xsadddp v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v13
+; PWR10LE-NEXT:    xsadddp v2, v2, v12
+; PWR10LE-NEXT:    xsadddp v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v17
+; PWR10LE-NEXT:    xsadddp v2, v2, v13
+; PWR10LE-NEXT:    xsadddp v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v16
+; PWR10LE-NEXT:    xsadddp v2, v2, v17
+; PWR10LE-NEXT:    xsadddp v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v15
+; PWR10LE-NEXT:    xsadddp v2, v2, v16
+; PWR10LE-NEXT:    xsadddp v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v14
+; PWR10LE-NEXT:    xsadddp v2, v2, v15
+; PWR10LE-NEXT:    xsadddp v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v1
+; PWR10LE-NEXT:    xsadddp v2, v2, v14
+; PWR10LE-NEXT:    xsadddp v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v0
+; PWR10LE-NEXT:    xsadddp v2, v2, v1
+; PWR10LE-NEXT:    xsadddp v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, vs13
+; PWR10LE-NEXT:    xsadddp v2, v2, v0
+; PWR10LE-NEXT:    xsadddp v2, v2, v3
+; PWR10LE-NEXT:    xsadddp f13, v2, f13
+; PWR10LE-NEXT:    xxswapd v2, vs12
+; PWR10LE-NEXT:    xsadddp f13, f13, v2
+; PWR10LE-NEXT:    xsadddp f12, f13, f12
+; PWR10LE-NEXT:    xxswapd vs13, vs11
+; PWR10LE-NEXT:    xsadddp f12, f12, f13
+; PWR10LE-NEXT:    xsadddp f11, f12, f11
+; PWR10LE-NEXT:    xxswapd vs12, vs10
+; PWR10LE-NEXT:    xsadddp f11, f11, f12
+; PWR10LE-NEXT:    xsadddp f10, f11, f10
+; PWR10LE-NEXT:    xxswapd vs11, vs9
+; PWR10LE-NEXT:    xsadddp f10, f10, f11
+; PWR10LE-NEXT:    xsadddp f9, f10, f9
+; PWR10LE-NEXT:    xxswapd vs10, vs8
+; PWR10LE-NEXT:    xsadddp f9, f9, f10
+; PWR10LE-NEXT:    xsadddp f8, f9, f8
+; PWR10LE-NEXT:    xxswapd vs9, vs7
+; PWR10LE-NEXT:    xsadddp f8, f8, f9
+; PWR10LE-NEXT:    xsadddp f7, f8, f7
+; PWR10LE-NEXT:    xxswapd vs8, vs6
+; PWR10LE-NEXT:    xsadddp f7, f7, f8
+; PWR10LE-NEXT:    xsadddp f6, f7, f6
+; PWR10LE-NEXT:    xxswapd vs7, vs5
+; PWR10LE-NEXT:    xsadddp f6, f6, f7
+; PWR10LE-NEXT:    xsadddp f5, f6, f5
+; PWR10LE-NEXT:    xxswapd vs6, vs4
+; PWR10LE-NEXT:    xsadddp f5, f5, f6
+; PWR10LE-NEXT:    xsadddp f4, f5, f4
+; PWR10LE-NEXT:    xxswapd vs5, vs3
+; PWR10LE-NEXT:    xsadddp f4, f4, f5
+; PWR10LE-NEXT:    xsadddp f3, f4, f3
+; PWR10LE-NEXT:    xxswapd vs4, vs2
+; PWR10LE-NEXT:    xsadddp f3, f3, f4
+; PWR10LE-NEXT:    xsadddp f2, f3, f2
+; PWR10LE-NEXT:    xxswapd vs3, vs1
+; PWR10LE-NEXT:    xsadddp f2, f2, f3
+; PWR10LE-NEXT:    xsadddp f1, f2, f1
+; PWR10LE-NEXT:    xxswapd vs2, vs0
+; PWR10LE-NEXT:    xsadddp f1, f1, f2
+; PWR10LE-NEXT:    xsadddp f1, f1, f0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v64f64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v18, v2
+; PWR10BE-NEXT:    lxv v17, 240(r1)
+; PWR10BE-NEXT:    lxv v16, 256(r1)
+; PWR10BE-NEXT:    xsadddp v2, v2, v18
+; PWR10BE-NEXT:    lxv v15, 272(r1)
+; PWR10BE-NEXT:    lxv v14, 288(r1)
+; PWR10BE-NEXT:    lxv v1, 304(r1)
+; PWR10BE-NEXT:    lxv v0, 320(r1)
+; PWR10BE-NEXT:    lxv vs13, 336(r1)
+; PWR10BE-NEXT:    lxv vs12, 352(r1)
+; PWR10BE-NEXT:    lxv vs11, 368(r1)
+; PWR10BE-NEXT:    lxv vs10, 384(r1)
+; PWR10BE-NEXT:    xsadddp v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v3
+; PWR10BE-NEXT:    lxv vs9, 400(r1)
+; PWR10BE-NEXT:    lxv vs8, 416(r1)
+; PWR10BE-NEXT:    lxv vs7, 432(r1)
+; PWR10BE-NEXT:    lxv vs6, 448(r1)
+; PWR10BE-NEXT:    lxv vs5, 464(r1)
+; PWR10BE-NEXT:    lxv vs4, 480(r1)
+; PWR10BE-NEXT:    lxv vs3, 496(r1)
+; PWR10BE-NEXT:    lxv vs2, 512(r1)
+; PWR10BE-NEXT:    lxv vs1, 528(r1)
+; PWR10BE-NEXT:    lxv vs0, 544(r1)
+; PWR10BE-NEXT:    xsadddp v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v4
+; PWR10BE-NEXT:    xsadddp v2, v2, v4
+; PWR10BE-NEXT:    xsadddp v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v5
+; PWR10BE-NEXT:    xsadddp v2, v2, v5
+; PWR10BE-NEXT:    xsadddp v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v6
+; PWR10BE-NEXT:    xsadddp v2, v2, v6
+; PWR10BE-NEXT:    xsadddp v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v7
+; PWR10BE-NEXT:    xsadddp v2, v2, v7
+; PWR10BE-NEXT:    xsadddp v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v8
+; PWR10BE-NEXT:    xsadddp v2, v2, v8
+; PWR10BE-NEXT:    xsadddp v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v9
+; PWR10BE-NEXT:    xsadddp v2, v2, v9
+; PWR10BE-NEXT:    xsadddp v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v10
+; PWR10BE-NEXT:    xsadddp v2, v2, v10
+; PWR10BE-NEXT:    xsadddp v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v11
+; PWR10BE-NEXT:    xsadddp v2, v2, v11
+; PWR10BE-NEXT:    xsadddp v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v12
+; PWR10BE-NEXT:    xsadddp v2, v2, v12
+; PWR10BE-NEXT:    xsadddp v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v13
+; PWR10BE-NEXT:    xsadddp v2, v2, v13
+; PWR10BE-NEXT:    xsadddp v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v17
+; PWR10BE-NEXT:    xsadddp v2, v2, v17
+; PWR10BE-NEXT:    xsadddp v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v16
+; PWR10BE-NEXT:    xsadddp v2, v2, v16
+; PWR10BE-NEXT:    xsadddp v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v15
+; PWR10BE-NEXT:    xsadddp v2, v2, v15
+; PWR10BE-NEXT:    xsadddp v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v14
+; PWR10BE-NEXT:    xsadddp v2, v2, v14
+; PWR10BE-NEXT:    xsadddp v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v1
+; PWR10BE-NEXT:    xsadddp v2, v2, v1
+; PWR10BE-NEXT:    xsadddp v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v0
+; PWR10BE-NEXT:    xsadddp v2, v2, v0
+; PWR10BE-NEXT:    xsadddp v2, v2, v3
+; PWR10BE-NEXT:    xsadddp v2, v2, f13
+; PWR10BE-NEXT:    xxswapd vs13, vs13
+; PWR10BE-NEXT:    xsadddp f13, v2, f13
+; PWR10BE-NEXT:    xsadddp f13, f13, f12
+; PWR10BE-NEXT:    xxswapd vs12, vs12
+; PWR10BE-NEXT:    xsadddp f12, f13, f12
+; PWR10BE-NEXT:    xsadddp f12, f12, f11
+; PWR10BE-NEXT:    xxswapd vs11, vs11
+; PWR10BE-NEXT:    xsadddp f11, f12, f11
+; PWR10BE-NEXT:    xsadddp f11, f11, f10
+; PWR10BE-NEXT:    xxswapd vs10, vs10
+; PWR10BE-NEXT:    xsadddp f10, f11, f10
+; PWR10BE-NEXT:    xsadddp f10, f10, f9
+; PWR10BE-NEXT:    xxswapd vs9, vs9
+; PWR10BE-NEXT:    xsadddp f9, f10, f9
+; PWR10BE-NEXT:    xsadddp f9, f9, f8
+; PWR10BE-NEXT:    xxswapd vs8, vs8
+; PWR10BE-NEXT:    xsadddp f8, f9, f8
+; PWR10BE-NEXT:    xsadddp f8, f8, f7
+; PWR10BE-NEXT:    xxswapd vs7, vs7
+; PWR10BE-NEXT:    xsadddp f7, f8, f7
+; PWR10BE-NEXT:    xsadddp f7, f7, f6
+; PWR10BE-NEXT:    xxswapd vs6, vs6
+; PWR10BE-NEXT:    xsadddp f6, f7, f6
+; PWR10BE-NEXT:    xsadddp f6, f6, f5
+; PWR10BE-NEXT:    xxswapd vs5, vs5
+; PWR10BE-NEXT:    xsadddp f5, f6, f5
+; PWR10BE-NEXT:    xsadddp f5, f5, f4
+; PWR10BE-NEXT:    xxswapd vs4, vs4
+; PWR10BE-NEXT:    xsadddp f4, f5, f4
+; PWR10BE-NEXT:    xsadddp f4, f4, f3
+; PWR10BE-NEXT:    xxswapd vs3, vs3
+; PWR10BE-NEXT:    xsadddp f3, f4, f3
+; PWR10BE-NEXT:    xsadddp f3, f3, f2
+; PWR10BE-NEXT:    xxswapd vs2, vs2
+; PWR10BE-NEXT:    xsadddp f2, f3, f2
+; PWR10BE-NEXT:    xsadddp f2, f2, f1
+; PWR10BE-NEXT:    xxswapd vs1, vs1
+; PWR10BE-NEXT:    xsadddp f1, f2, f1
+; PWR10BE-NEXT:    xsadddp f1, f1, f0
+; PWR10BE-NEXT:    xxswapd vs0, vs0
+; PWR10BE-NEXT:    xsadddp f1, f1, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fadd.v64f64(double -0.000000e+00, <64 x double> %a)
+  ret double %0
+}
+
+define dso_local double @v64f64_b(<64 x double> %a, double %b) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v64f64_b:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v19, v2
+; PWR9LE-NEXT:    lxv v18, 224(r1)
+; PWR9LE-NEXT:    lxv v17, 240(r1)
+; PWR9LE-NEXT:    lxv v16, 256(r1)
+; PWR9LE-NEXT:    lxv v15, 272(r1)
+; PWR9LE-NEXT:    xsadddp f1, f1, v19
+; PWR9LE-NEXT:    lxv v14, 288(r1)
+; PWR9LE-NEXT:    lxv v1, 304(r1)
+; PWR9LE-NEXT:    lxv v0, 320(r1)
+; PWR9LE-NEXT:    lxv vs13, 336(r1)
+; PWR9LE-NEXT:    lxv vs12, 352(r1)
+; PWR9LE-NEXT:    lxv vs11, 368(r1)
+; PWR9LE-NEXT:    lxv vs10, 384(r1)
+; PWR9LE-NEXT:    lxv vs9, 400(r1)
+; PWR9LE-NEXT:    lxv vs8, 416(r1)
+; PWR9LE-NEXT:    lxv vs7, 432(r1)
+; PWR9LE-NEXT:    lxv vs6, 448(r1)
+; PWR9LE-NEXT:    lxv vs5, 464(r1)
+; PWR9LE-NEXT:    xsadddp f1, f1, v2
+; PWR9LE-NEXT:    xxswapd v2, v3
+; PWR9LE-NEXT:    lxv vs4, 480(r1)
+; PWR9LE-NEXT:    lxv vs3, 496(r1)
+; PWR9LE-NEXT:    lxv vs2, 512(r1)
+; PWR9LE-NEXT:    lxv vs0, 528(r1)
+; PWR9LE-NEXT:    xsadddp f1, f1, v2
+; PWR9LE-NEXT:    xxswapd v2, v4
+; PWR9LE-NEXT:    xsadddp f1, f1, v3
+; PWR9LE-NEXT:    xsadddp f1, f1, v2
+; PWR9LE-NEXT:    xxswapd v2, v5
+; PWR9LE-NEXT:    xsadddp f1, f1, v4
+; PWR9LE-NEXT:    xsadddp f1, f1, v2
+; PWR9LE-NEXT:    xxswapd v2, v6
+; PWR9LE-NEXT:    xsadddp f1, f1, v5
+; PWR9LE-NEXT:    xsadddp f1, f1, v2
+; PWR9LE-NEXT:    xxswapd v2, v7
+; PWR9LE-NEXT:    xsadddp f1, f1, v6
+; PWR9LE-NEXT:    xsadddp f1, f1, v2
+; PWR9LE-NEXT:    xxswapd v2, v8
+; PWR9LE-NEXT:    xsadddp f1, f1, v7
+; PWR9LE-NEXT:    xsadddp f1, f1, v2
+; PWR9LE-NEXT:    xxswapd v2, v9
+; PWR9LE-NEXT:    xsadddp f1, f1, v8
+; PWR9LE-NEXT:    xsadddp f1, f1, v2
+; PWR9LE-NEXT:    xxswapd v2, v10
+; PWR9LE-NEXT:    xsadddp f1, f1, v9
+; PWR9LE-NEXT:    xsadddp f1, f1, v2
+; PWR9LE-NEXT:    xxswapd v2, v11
+; PWR9LE-NEXT:    xsadddp f1, f1, v10
+; PWR9LE-NEXT:    xsadddp f1, f1, v2
+; PWR9LE-NEXT:    xxswapd v2, v12
+; PWR9LE-NEXT:    xsadddp f1, f1, v11
+; PWR9LE-NEXT:    xsadddp f1, f1, v2
+; PWR9LE-NEXT:    xxswapd v2, v13
+; PWR9LE-NEXT:    xsadddp f1, f1, v12
+; PWR9LE-NEXT:    xsadddp f1, f1, v2
+; PWR9LE-NEXT:    xxswapd v2, v18
+; PWR9LE-NEXT:    xsadddp f1, f1, v13
+; PWR9LE-NEXT:    xsadddp f1, f1, v2
+; PWR9LE-NEXT:    xxswapd v2, v17
+; PWR9LE-NEXT:    xsadddp f1, f1, v18
+; PWR9LE-NEXT:    xsadddp f1, f1, v2
+; PWR9LE-NEXT:    xxswapd v2, v16
+; PWR9LE-NEXT:    xsadddp f1, f1, v17
+; PWR9LE-NEXT:    xsadddp f1, f1, v2
+; PWR9LE-NEXT:    xxswapd v2, v15
+; PWR9LE-NEXT:    xsadddp f1, f1, v16
+; PWR9LE-NEXT:    xsadddp f1, f1, v2
+; PWR9LE-NEXT:    xxswapd v2, v14
+; PWR9LE-NEXT:    xsadddp f1, f1, v15
+; PWR9LE-NEXT:    xsadddp f1, f1, v2
+; PWR9LE-NEXT:    xxswapd v2, v1
+; PWR9LE-NEXT:    xsadddp f1, f1, v14
+; PWR9LE-NEXT:    xsadddp f1, f1, v2
+; PWR9LE-NEXT:    xxswapd v2, v0
+; PWR9LE-NEXT:    xsadddp f1, f1, v1
+; PWR9LE-NEXT:    xsadddp f1, f1, v2
+; PWR9LE-NEXT:    xxswapd v2, vs13
+; PWR9LE-NEXT:    xsadddp f1, f1, v0
+; PWR9LE-NEXT:    xsadddp f1, f1, v2
+; PWR9LE-NEXT:    xsadddp f1, f1, f13
+; PWR9LE-NEXT:    xxswapd vs13, vs12
+; PWR9LE-NEXT:    xsadddp f1, f1, f13
+; PWR9LE-NEXT:    xsadddp f1, f1, f12
+; PWR9LE-NEXT:    xxswapd vs12, vs11
+; PWR9LE-NEXT:    xsadddp f1, f1, f12
+; PWR9LE-NEXT:    xsadddp f1, f1, f11
+; PWR9LE-NEXT:    xxswapd vs11, vs10
+; PWR9LE-NEXT:    xsadddp f1, f1, f11
+; PWR9LE-NEXT:    xsadddp f1, f1, f10
+; PWR9LE-NEXT:    xxswapd vs10, vs9
+; PWR9LE-NEXT:    xsadddp f1, f1, f10
+; PWR9LE-NEXT:    xsadddp f1, f1, f9
+; PWR9LE-NEXT:    xxswapd vs9, vs8
+; PWR9LE-NEXT:    xsadddp f1, f1, f9
+; PWR9LE-NEXT:    xsadddp f1, f1, f8
+; PWR9LE-NEXT:    xxswapd vs8, vs7
+; PWR9LE-NEXT:    xsadddp f1, f1, f8
+; PWR9LE-NEXT:    xsadddp f1, f1, f7
+; PWR9LE-NEXT:    xxswapd vs7, vs6
+; PWR9LE-NEXT:    xsadddp f1, f1, f7
+; PWR9LE-NEXT:    xsadddp f1, f1, f6
+; PWR9LE-NEXT:    xxswapd vs6, vs5
+; PWR9LE-NEXT:    xsadddp f1, f1, f6
+; PWR9LE-NEXT:    xsadddp f1, f1, f5
+; PWR9LE-NEXT:    xxswapd vs5, vs4
+; PWR9LE-NEXT:    xsadddp f1, f1, f5
+; PWR9LE-NEXT:    xsadddp f1, f1, f4
+; PWR9LE-NEXT:    xxswapd vs4, vs3
+; PWR9LE-NEXT:    xsadddp f1, f1, f4
+; PWR9LE-NEXT:    xsadddp f1, f1, f3
+; PWR9LE-NEXT:    xxswapd vs3, vs2
+; PWR9LE-NEXT:    xsadddp f1, f1, f3
+; PWR9LE-NEXT:    xsadddp f1, f1, f2
+; PWR9LE-NEXT:    xxswapd vs2, vs0
+; PWR9LE-NEXT:    xsadddp f1, f1, f2
+; PWR9LE-NEXT:    xsadddp f1, f1, f0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v64f64_b:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xsadddp f1, f1, v2
+; PWR9BE-NEXT:    xxswapd v2, v2
+; PWR9BE-NEXT:    lxv v18, 240(r1)
+; PWR9BE-NEXT:    lxv v17, 256(r1)
+; PWR9BE-NEXT:    lxv v16, 272(r1)
+; PWR9BE-NEXT:    lxv v15, 288(r1)
+; PWR9BE-NEXT:    lxv v14, 304(r1)
+; PWR9BE-NEXT:    xsadddp f1, f1, v2
+; PWR9BE-NEXT:    xxswapd v2, v3
+; PWR9BE-NEXT:    lxv v1, 320(r1)
+; PWR9BE-NEXT:    lxv v0, 336(r1)
+; PWR9BE-NEXT:    lxv vs13, 352(r1)
+; PWR9BE-NEXT:    lxv vs12, 368(r1)
+; PWR9BE-NEXT:    lxv vs11, 384(r1)
+; PWR9BE-NEXT:    lxv vs10, 400(r1)
+; PWR9BE-NEXT:    lxv vs9, 416(r1)
+; PWR9BE-NEXT:    lxv vs8, 432(r1)
+; PWR9BE-NEXT:    lxv vs7, 448(r1)
+; PWR9BE-NEXT:    lxv vs6, 464(r1)
+; PWR9BE-NEXT:    lxv vs5, 480(r1)
+; PWR9BE-NEXT:    lxv vs4, 496(r1)
+; PWR9BE-NEXT:    lxv vs3, 512(r1)
+; PWR9BE-NEXT:    lxv vs2, 528(r1)
+; PWR9BE-NEXT:    lxv vs0, 544(r1)
+; PWR9BE-NEXT:    xsadddp f1, f1, v3
+; PWR9BE-NEXT:    xsadddp f1, f1, v2
+; PWR9BE-NEXT:    xxswapd v2, v4
+; PWR9BE-NEXT:    xsadddp f1, f1, v4
+; PWR9BE-NEXT:    xsadddp f1, f1, v2
+; PWR9BE-NEXT:    xxswapd v2, v5
+; PWR9BE-NEXT:    xsadddp f1, f1, v5
+; PWR9BE-NEXT:    xsadddp f1, f1, v2
+; PWR9BE-NEXT:    xxswapd v2, v6
+; PWR9BE-NEXT:    xsadddp f1, f1, v6
+; PWR9BE-NEXT:    xsadddp f1, f1, v2
+; PWR9BE-NEXT:    xxswapd v2, v7
+; PWR9BE-NEXT:    xsadddp f1, f1, v7
+; PWR9BE-NEXT:    xsadddp f1, f1, v2
+; PWR9BE-NEXT:    xxswapd v2, v8
+; PWR9BE-NEXT:    xsadddp f1, f1, v8
+; PWR9BE-NEXT:    xsadddp f1, f1, v2
+; PWR9BE-NEXT:    xxswapd v2, v9
+; PWR9BE-NEXT:    xsadddp f1, f1, v9
+; PWR9BE-NEXT:    xsadddp f1, f1, v2
+; PWR9BE-NEXT:    xxswapd v2, v10
+; PWR9BE-NEXT:    xsadddp f1, f1, v10
+; PWR9BE-NEXT:    xsadddp f1, f1, v2
+; PWR9BE-NEXT:    xxswapd v2, v11
+; PWR9BE-NEXT:    xsadddp f1, f1, v11
+; PWR9BE-NEXT:    xsadddp f1, f1, v2
+; PWR9BE-NEXT:    xxswapd v2, v12
+; PWR9BE-NEXT:    xsadddp f1, f1, v12
+; PWR9BE-NEXT:    xsadddp f1, f1, v2
+; PWR9BE-NEXT:    xxswapd v2, v13
+; PWR9BE-NEXT:    xsadddp f1, f1, v13
+; PWR9BE-NEXT:    xsadddp f1, f1, v2
+; PWR9BE-NEXT:    xxswapd v2, v18
+; PWR9BE-NEXT:    xsadddp f1, f1, v18
+; PWR9BE-NEXT:    xsadddp f1, f1, v2
+; PWR9BE-NEXT:    xxswapd v2, v17
+; PWR9BE-NEXT:    xsadddp f1, f1, v17
+; PWR9BE-NEXT:    xsadddp f1, f1, v2
+; PWR9BE-NEXT:    xxswapd v2, v16
+; PWR9BE-NEXT:    xsadddp f1, f1, v16
+; PWR9BE-NEXT:    xsadddp f1, f1, v2
+; PWR9BE-NEXT:    xxswapd v2, v15
+; PWR9BE-NEXT:    xsadddp f1, f1, v15
+; PWR9BE-NEXT:    xsadddp f1, f1, v2
+; PWR9BE-NEXT:    xxswapd v2, v14
+; PWR9BE-NEXT:    xsadddp f1, f1, v14
+; PWR9BE-NEXT:    xsadddp f1, f1, v2
+; PWR9BE-NEXT:    xxswapd v2, v1
+; PWR9BE-NEXT:    xsadddp f1, f1, v1
+; PWR9BE-NEXT:    xsadddp f1, f1, v2
+; PWR9BE-NEXT:    xxswapd v2, v0
+; PWR9BE-NEXT:    xsadddp f1, f1, v0
+; PWR9BE-NEXT:    xsadddp f1, f1, v2
+; PWR9BE-NEXT:    xsadddp f1, f1, f13
+; PWR9BE-NEXT:    xxswapd vs13, vs13
+; PWR9BE-NEXT:    xsadddp f1, f1, f13
+; PWR9BE-NEXT:    xsadddp f1, f1, f12
+; PWR9BE-NEXT:    xxswapd vs12, vs12
+; PWR9BE-NEXT:    xsadddp f1, f1, f12
+; PWR9BE-NEXT:    xsadddp f1, f1, f11
+; PWR9BE-NEXT:    xxswapd vs11, vs11
+; PWR9BE-NEXT:    xsadddp f1, f1, f11
+; PWR9BE-NEXT:    xsadddp f1, f1, f10
+; PWR9BE-NEXT:    xxswapd vs10, vs10
+; PWR9BE-NEXT:    xsadddp f1, f1, f10
+; PWR9BE-NEXT:    xsadddp f1, f1, f9
+; PWR9BE-NEXT:    xxswapd vs9, vs9
+; PWR9BE-NEXT:    xsadddp f1, f1, f9
+; PWR9BE-NEXT:    xsadddp f1, f1, f8
+; PWR9BE-NEXT:    xxswapd vs8, vs8
+; PWR9BE-NEXT:    xsadddp f1, f1, f8
+; PWR9BE-NEXT:    xsadddp f1, f1, f7
+; PWR9BE-NEXT:    xxswapd vs7, vs7
+; PWR9BE-NEXT:    xsadddp f1, f1, f7
+; PWR9BE-NEXT:    xsadddp f1, f1, f6
+; PWR9BE-NEXT:    xxswapd vs6, vs6
+; PWR9BE-NEXT:    xsadddp f1, f1, f6
+; PWR9BE-NEXT:    xsadddp f1, f1, f5
+; PWR9BE-NEXT:    xxswapd vs5, vs5
+; PWR9BE-NEXT:    xsadddp f1, f1, f5
+; PWR9BE-NEXT:    xsadddp f1, f1, f4
+; PWR9BE-NEXT:    xxswapd vs4, vs4
+; PWR9BE-NEXT:    xsadddp f1, f1, f4
+; PWR9BE-NEXT:    xsadddp f1, f1, f3
+; PWR9BE-NEXT:    xxswapd vs3, vs3
+; PWR9BE-NEXT:    xsadddp f1, f1, f3
+; PWR9BE-NEXT:    xsadddp f1, f1, f2
+; PWR9BE-NEXT:    xxswapd vs2, vs2
+; PWR9BE-NEXT:    xsadddp f1, f1, f2
+; PWR9BE-NEXT:    xsadddp f1, f1, f0
+; PWR9BE-NEXT:    xxswapd vs0, vs0
+; PWR9BE-NEXT:    xsadddp f1, f1, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v64f64_b:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v19, v2
+; PWR10LE-NEXT:    lxv v18, 224(r1)
+; PWR10LE-NEXT:    lxv v17, 240(r1)
+; PWR10LE-NEXT:    xsadddp f1, f1, v19
+; PWR10LE-NEXT:    lxv v16, 256(r1)
+; PWR10LE-NEXT:    lxv v15, 272(r1)
+; PWR10LE-NEXT:    lxv v14, 288(r1)
+; PWR10LE-NEXT:    lxv v1, 304(r1)
+; PWR10LE-NEXT:    lxv v0, 320(r1)
+; PWR10LE-NEXT:    lxv vs13, 336(r1)
+; PWR10LE-NEXT:    lxv vs12, 352(r1)
+; PWR10LE-NEXT:    lxv vs11, 368(r1)
+; PWR10LE-NEXT:    xsadddp f1, f1, v2
+; PWR10LE-NEXT:    xxswapd v2, v3
+; PWR10LE-NEXT:    lxv vs10, 384(r1)
+; PWR10LE-NEXT:    lxv vs9, 400(r1)
+; PWR10LE-NEXT:    lxv vs8, 416(r1)
+; PWR10LE-NEXT:    lxv vs7, 432(r1)
+; PWR10LE-NEXT:    lxv vs6, 448(r1)
+; PWR10LE-NEXT:    lxv vs5, 464(r1)
+; PWR10LE-NEXT:    lxv vs4, 480(r1)
+; PWR10LE-NEXT:    lxv vs3, 496(r1)
+; PWR10LE-NEXT:    lxv vs2, 512(r1)
+; PWR10LE-NEXT:    lxv vs0, 528(r1)
+; PWR10LE-NEXT:    xsadddp f1, f1, v2
+; PWR10LE-NEXT:    xxswapd v2, v4
+; PWR10LE-NEXT:    xsadddp f1, f1, v3
+; PWR10LE-NEXT:    xsadddp f1, f1, v2
+; PWR10LE-NEXT:    xxswapd v2, v5
+; PWR10LE-NEXT:    xsadddp f1, f1, v4
+; PWR10LE-NEXT:    xsadddp f1, f1, v2
+; PWR10LE-NEXT:    xxswapd v2, v6
+; PWR10LE-NEXT:    xsadddp f1, f1, v5
+; PWR10LE-NEXT:    xsadddp f1, f1, v2
+; PWR10LE-NEXT:    xxswapd v2, v7
+; PWR10LE-NEXT:    xsadddp f1, f1, v6
+; PWR10LE-NEXT:    xsadddp f1, f1, v2
+; PWR10LE-NEXT:    xxswapd v2, v8
+; PWR10LE-NEXT:    xsadddp f1, f1, v7
+; PWR10LE-NEXT:    xsadddp f1, f1, v2
+; PWR10LE-NEXT:    xxswapd v2, v9
+; PWR10LE-NEXT:    xsadddp f1, f1, v8
+; PWR10LE-NEXT:    xsadddp f1, f1, v2
+; PWR10LE-NEXT:    xxswapd v2, v10
+; PWR10LE-NEXT:    xsadddp f1, f1, v9
+; PWR10LE-NEXT:    xsadddp f1, f1, v2
+; PWR10LE-NEXT:    xxswapd v2, v11
+; PWR10LE-NEXT:    xsadddp f1, f1, v10
+; PWR10LE-NEXT:    xsadddp f1, f1, v2
+; PWR10LE-NEXT:    xxswapd v2, v12
+; PWR10LE-NEXT:    xsadddp f1, f1, v11
+; PWR10LE-NEXT:    xsadddp f1, f1, v2
+; PWR10LE-NEXT:    xxswapd v2, v13
+; PWR10LE-NEXT:    xsadddp f1, f1, v12
+; PWR10LE-NEXT:    xsadddp f1, f1, v2
+; PWR10LE-NEXT:    xxswapd v2, v18
+; PWR10LE-NEXT:    xsadddp f1, f1, v13
+; PWR10LE-NEXT:    xsadddp f1, f1, v2
+; PWR10LE-NEXT:    xxswapd v2, v17
+; PWR10LE-NEXT:    xsadddp f1, f1, v18
+; PWR10LE-NEXT:    xsadddp f1, f1, v2
+; PWR10LE-NEXT:    xxswapd v2, v16
+; PWR10LE-NEXT:    xsadddp f1, f1, v17
+; PWR10LE-NEXT:    xsadddp f1, f1, v2
+; PWR10LE-NEXT:    xxswapd v2, v15
+; PWR10LE-NEXT:    xsadddp f1, f1, v16
+; PWR10LE-NEXT:    xsadddp f1, f1, v2
+; PWR10LE-NEXT:    xxswapd v2, v14
+; PWR10LE-NEXT:    xsadddp f1, f1, v15
+; PWR10LE-NEXT:    xsadddp f1, f1, v2
+; PWR10LE-NEXT:    xxswapd v2, v1
+; PWR10LE-NEXT:    xsadddp f1, f1, v14
+; PWR10LE-NEXT:    xsadddp f1, f1, v2
+; PWR10LE-NEXT:    xxswapd v2, v0
+; PWR10LE-NEXT:    xsadddp f1, f1, v1
+; PWR10LE-NEXT:    xsadddp f1, f1, v2
+; PWR10LE-NEXT:    xxswapd v2, vs13
+; PWR10LE-NEXT:    xsadddp f1, f1, v0
+; PWR10LE-NEXT:    xsadddp f1, f1, v2
+; PWR10LE-NEXT:    xsadddp f1, f1, f13
+; PWR10LE-NEXT:    xxswapd vs13, vs12
+; PWR10LE-NEXT:    xsadddp f1, f1, f13
+; PWR10LE-NEXT:    xsadddp f1, f1, f12
+; PWR10LE-NEXT:    xxswapd vs12, vs11
+; PWR10LE-NEXT:    xsadddp f1, f1, f12
+; PWR10LE-NEXT:    xsadddp f1, f1, f11
+; PWR10LE-NEXT:    xxswapd vs11, vs10
+; PWR10LE-NEXT:    xsadddp f1, f1, f11
+; PWR10LE-NEXT:    xsadddp f1, f1, f10
+; PWR10LE-NEXT:    xxswapd vs10, vs9
+; PWR10LE-NEXT:    xsadddp f1, f1, f10
+; PWR10LE-NEXT:    xsadddp f1, f1, f9
+; PWR10LE-NEXT:    xxswapd vs9, vs8
+; PWR10LE-NEXT:    xsadddp f1, f1, f9
+; PWR10LE-NEXT:    xsadddp f1, f1, f8
+; PWR10LE-NEXT:    xxswapd vs8, vs7
+; PWR10LE-NEXT:    xsadddp f1, f1, f8
+; PWR10LE-NEXT:    xsadddp f1, f1, f7
+; PWR10LE-NEXT:    xxswapd vs7, vs6
+; PWR10LE-NEXT:    xsadddp f1, f1, f7
+; PWR10LE-NEXT:    xsadddp f1, f1, f6
+; PWR10LE-NEXT:    xxswapd vs6, vs5
+; PWR10LE-NEXT:    xsadddp f1, f1, f6
+; PWR10LE-NEXT:    xsadddp f1, f1, f5
+; PWR10LE-NEXT:    xxswapd vs5, vs4
+; PWR10LE-NEXT:    xsadddp f1, f1, f5
+; PWR10LE-NEXT:    xsadddp f1, f1, f4
+; PWR10LE-NEXT:    xxswapd vs4, vs3
+; PWR10LE-NEXT:    xsadddp f1, f1, f4
+; PWR10LE-NEXT:    xsadddp f1, f1, f3
+; PWR10LE-NEXT:    xxswapd vs3, vs2
+; PWR10LE-NEXT:    xsadddp f1, f1, f3
+; PWR10LE-NEXT:    xsadddp f1, f1, f2
+; PWR10LE-NEXT:    xxswapd vs2, vs0
+; PWR10LE-NEXT:    xsadddp f1, f1, f2
+; PWR10LE-NEXT:    xsadddp f1, f1, f0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v64f64_b:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xsadddp f1, f1, v2
+; PWR10BE-NEXT:    xxswapd v2, v2
+; PWR10BE-NEXT:    lxv v18, 240(r1)
+; PWR10BE-NEXT:    lxv v17, 256(r1)
+; PWR10BE-NEXT:    xsadddp f1, f1, v2
+; PWR10BE-NEXT:    xxswapd v2, v3
+; PWR10BE-NEXT:    lxv v16, 272(r1)
+; PWR10BE-NEXT:    lxv v15, 288(r1)
+; PWR10BE-NEXT:    lxv v14, 304(r1)
+; PWR10BE-NEXT:    lxv v1, 320(r1)
+; PWR10BE-NEXT:    lxv v0, 336(r1)
+; PWR10BE-NEXT:    lxv vs13, 352(r1)
+; PWR10BE-NEXT:    lxv vs12, 368(r1)
+; PWR10BE-NEXT:    lxv vs11, 384(r1)
+; PWR10BE-NEXT:    lxv vs10, 400(r1)
+; PWR10BE-NEXT:    lxv vs9, 416(r1)
+; PWR10BE-NEXT:    xsadddp f1, f1, v3
+; PWR10BE-NEXT:    lxv vs8, 432(r1)
+; PWR10BE-NEXT:    lxv vs7, 448(r1)
+; PWR10BE-NEXT:    lxv vs6, 464(r1)
+; PWR10BE-NEXT:    lxv vs5, 480(r1)
+; PWR10BE-NEXT:    lxv vs4, 496(r1)
+; PWR10BE-NEXT:    lxv vs3, 512(r1)
+; PWR10BE-NEXT:    lxv vs2, 528(r1)
+; PWR10BE-NEXT:    lxv vs0, 544(r1)
+; PWR10BE-NEXT:    xsadddp f1, f1, v2
+; PWR10BE-NEXT:    xxswapd v2, v4
+; PWR10BE-NEXT:    xsadddp f1, f1, v4
+; PWR10BE-NEXT:    xsadddp f1, f1, v2
+; PWR10BE-NEXT:    xxswapd v2, v5
+; PWR10BE-NEXT:    xsadddp f1, f1, v5
+; PWR10BE-NEXT:    xsadddp f1, f1, v2
+; PWR10BE-NEXT:    xxswapd v2, v6
+; PWR10BE-NEXT:    xsadddp f1, f1, v6
+; PWR10BE-NEXT:    xsadddp f1, f1, v2
+; PWR10BE-NEXT:    xxswapd v2, v7
+; PWR10BE-NEXT:    xsadddp f1, f1, v7
+; PWR10BE-NEXT:    xsadddp f1, f1, v2
+; PWR10BE-NEXT:    xxswapd v2, v8
+; PWR10BE-NEXT:    xsadddp f1, f1, v8
+; PWR10BE-NEXT:    xsadddp f1, f1, v2
+; PWR10BE-NEXT:    xxswapd v2, v9
+; PWR10BE-NEXT:    xsadddp f1, f1, v9
+; PWR10BE-NEXT:    xsadddp f1, f1, v2
+; PWR10BE-NEXT:    xxswapd v2, v10
+; PWR10BE-NEXT:    xsadddp f1, f1, v10
+; PWR10BE-NEXT:    xsadddp f1, f1, v2
+; PWR10BE-NEXT:    xxswapd v2, v11
+; PWR10BE-NEXT:    xsadddp f1, f1, v11
+; PWR10BE-NEXT:    xsadddp f1, f1, v2
+; PWR10BE-NEXT:    xxswapd v2, v12
+; PWR10BE-NEXT:    xsadddp f1, f1, v12
+; PWR10BE-NEXT:    xsadddp f1, f1, v2
+; PWR10BE-NEXT:    xxswapd v2, v13
+; PWR10BE-NEXT:    xsadddp f1, f1, v13
+; PWR10BE-NEXT:    xsadddp f1, f1, v2
+; PWR10BE-NEXT:    xxswapd v2, v18
+; PWR10BE-NEXT:    xsadddp f1, f1, v18
+; PWR10BE-NEXT:    xsadddp f1, f1, v2
+; PWR10BE-NEXT:    xxswapd v2, v17
+; PWR10BE-NEXT:    xsadddp f1, f1, v17
+; PWR10BE-NEXT:    xsadddp f1, f1, v2
+; PWR10BE-NEXT:    xxswapd v2, v16
+; PWR10BE-NEXT:    xsadddp f1, f1, v16
+; PWR10BE-NEXT:    xsadddp f1, f1, v2
+; PWR10BE-NEXT:    xxswapd v2, v15
+; PWR10BE-NEXT:    xsadddp f1, f1, v15
+; PWR10BE-NEXT:    xsadddp f1, f1, v2
+; PWR10BE-NEXT:    xxswapd v2, v14
+; PWR10BE-NEXT:    xsadddp f1, f1, v14
+; PWR10BE-NEXT:    xsadddp f1, f1, v2
+; PWR10BE-NEXT:    xxswapd v2, v1
+; PWR10BE-NEXT:    xsadddp f1, f1, v1
+; PWR10BE-NEXT:    xsadddp f1, f1, v2
+; PWR10BE-NEXT:    xxswapd v2, v0
+; PWR10BE-NEXT:    xsadddp f1, f1, v0
+; PWR10BE-NEXT:    xsadddp f1, f1, v2
+; PWR10BE-NEXT:    xsadddp f1, f1, f13
+; PWR10BE-NEXT:    xxswapd vs13, vs13
+; PWR10BE-NEXT:    xsadddp f1, f1, f13
+; PWR10BE-NEXT:    xsadddp f1, f1, f12
+; PWR10BE-NEXT:    xxswapd vs12, vs12
+; PWR10BE-NEXT:    xsadddp f1, f1, f12
+; PWR10BE-NEXT:    xsadddp f1, f1, f11
+; PWR10BE-NEXT:    xxswapd vs11, vs11
+; PWR10BE-NEXT:    xsadddp f1, f1, f11
+; PWR10BE-NEXT:    xsadddp f1, f1, f10
+; PWR10BE-NEXT:    xxswapd vs10, vs10
+; PWR10BE-NEXT:    xsadddp f1, f1, f10
+; PWR10BE-NEXT:    xsadddp f1, f1, f9
+; PWR10BE-NEXT:    xxswapd vs9, vs9
+; PWR10BE-NEXT:    xsadddp f1, f1, f9
+; PWR10BE-NEXT:    xsadddp f1, f1, f8
+; PWR10BE-NEXT:    xxswapd vs8, vs8
+; PWR10BE-NEXT:    xsadddp f1, f1, f8
+; PWR10BE-NEXT:    xsadddp f1, f1, f7
+; PWR10BE-NEXT:    xxswapd vs7, vs7
+; PWR10BE-NEXT:    xsadddp f1, f1, f7
+; PWR10BE-NEXT:    xsadddp f1, f1, f6
+; PWR10BE-NEXT:    xxswapd vs6, vs6
+; PWR10BE-NEXT:    xsadddp f1, f1, f6
+; PWR10BE-NEXT:    xsadddp f1, f1, f5
+; PWR10BE-NEXT:    xxswapd vs5, vs5
+; PWR10BE-NEXT:    xsadddp f1, f1, f5
+; PWR10BE-NEXT:    xsadddp f1, f1, f4
+; PWR10BE-NEXT:    xxswapd vs4, vs4
+; PWR10BE-NEXT:    xsadddp f1, f1, f4
+; PWR10BE-NEXT:    xsadddp f1, f1, f3
+; PWR10BE-NEXT:    xxswapd vs3, vs3
+; PWR10BE-NEXT:    xsadddp f1, f1, f3
+; PWR10BE-NEXT:    xsadddp f1, f1, f2
+; PWR10BE-NEXT:    xxswapd vs2, vs2
+; PWR10BE-NEXT:    xsadddp f1, f1, f2
+; PWR10BE-NEXT:    xsadddp f1, f1, f0
+; PWR10BE-NEXT:    xxswapd vs0, vs0
+; PWR10BE-NEXT:    xsadddp f1, f1, f0
+; PWR10BE-NEXT:    blr
+entry: ; like v64f64 above, but with a caller-supplied start value %b instead of -0.0
+  %0 = call double @llvm.vector.reduce.fadd.v64f64(double %b, <64 x double> %a) ; no fast-math flags, so the checks above show one strictly sequential scalar-add (xsadddp) chain accumulating every lane onto %b (held in f1)
+  ret double %0
+}
+
+;; Reassociative (fast-math) fadd reduction of <64 x double>.  Unlike the
+;; strictly-ordered variant, the backend is free to pair the 32 incoming
+;; vectors with xvadddp in a balanced tree (stack-passed operands loaded via
+;; lxv) and finish with a single xxswapd + xvadddp horizontal step.
+define dso_local double @v64f64_fast(<64 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v64f64_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    lxv vs0, 368(r1)
+; PWR9LE-NEXT:    lxv vs1, 496(r1)
+; PWR9LE-NEXT:    lxv vs2, 240(r1)
+; PWR9LE-NEXT:    lxv vs3, 304(r1)
+; PWR9LE-NEXT:    xvadddp vs3, v3, vs3
+; PWR9LE-NEXT:    lxv vs4, 432(r1)
+; PWR9LE-NEXT:    lxv vs5, 400(r1)
+; PWR9LE-NEXT:    lxv vs6, 528(r1)
+; PWR9LE-NEXT:    lxv vs7, 272(r1)
+; PWR9LE-NEXT:    lxv vs8, 336(r1)
+; PWR9LE-NEXT:    lxv vs9, 464(r1)
+; PWR9LE-NEXT:    lxv vs10, 352(r1)
+; PWR9LE-NEXT:    lxv vs11, 480(r1)
+; PWR9LE-NEXT:    lxv vs12, 224(r1)
+; PWR9LE-NEXT:    lxv vs13, 288(r1)
+; PWR9LE-NEXT:    lxv v0, 416(r1)
+; PWR9LE-NEXT:    lxv v1, 384(r1)
+; PWR9LE-NEXT:    lxv v14, 512(r1)
+; PWR9LE-NEXT:    lxv v15, 256(r1)
+; PWR9LE-NEXT:    lxv v16, 320(r1)
+; PWR9LE-NEXT:    lxv v17, 448(r1)
+; PWR9LE-NEXT:    xvadddp v12, v12, v17
+; PWR9LE-NEXT:    xvadddp v4, v4, v16
+; PWR9LE-NEXT:    xvadddp v14, v15, v14
+; PWR9LE-NEXT:    xvadddp v1, v8, v1
+; PWR9LE-NEXT:    xvadddp v0, v10, v0
+; PWR9LE-NEXT:    xvadddp vs13, v2, vs13
+; PWR9LE-NEXT:    xvadddp vs11, vs12, vs11
+; PWR9LE-NEXT:    xvadddp vs10, v6, vs10
+; PWR9LE-NEXT:    xvadddp vs9, v13, vs9
+; PWR9LE-NEXT:    xvadddp vs8, v5, vs8
+; PWR9LE-NEXT:    xvadddp vs6, vs7, vs6
+; PWR9LE-NEXT:    xvadddp vs5, v9, vs5
+; PWR9LE-NEXT:    xvadddp vs4, v11, vs4
+; PWR9LE-NEXT:    xvadddp vs1, vs2, vs1
+; PWR9LE-NEXT:    xvadddp vs0, v7, vs0
+; PWR9LE-NEXT:    xvadddp vs0, vs0, vs1
+; PWR9LE-NEXT:    xvadddp vs1, vs3, vs4
+; PWR9LE-NEXT:    xvadddp vs2, vs5, vs6
+; PWR9LE-NEXT:    xvadddp vs3, vs8, vs9
+; PWR9LE-NEXT:    xvadddp vs4, vs10, vs11
+; PWR9LE-NEXT:    xvadddp vs5, vs13, v0
+; PWR9LE-NEXT:    xvadddp vs6, v1, v14
+; PWR9LE-NEXT:    xvadddp vs7, v4, v12
+; PWR9LE-NEXT:    xvadddp vs6, vs7, vs6
+; PWR9LE-NEXT:    xvadddp vs4, vs5, vs4
+; PWR9LE-NEXT:    xvadddp vs2, vs3, vs2
+; PWR9LE-NEXT:    xvadddp vs0, vs1, vs0
+; PWR9LE-NEXT:    xvadddp vs0, vs0, vs2
+; PWR9LE-NEXT:    xvadddp vs1, vs4, vs6
+; PWR9LE-NEXT:    xvadddp vs0, vs1, vs0
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xvadddp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v64f64_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    lxv vs0, 384(r1)
+; PWR9BE-NEXT:    lxv vs1, 512(r1)
+; PWR9BE-NEXT:    lxv vs2, 256(r1)
+; PWR9BE-NEXT:    lxv vs3, 320(r1)
+; PWR9BE-NEXT:    xvadddp vs3, v3, vs3
+; PWR9BE-NEXT:    lxv vs4, 448(r1)
+; PWR9BE-NEXT:    lxv vs5, 416(r1)
+; PWR9BE-NEXT:    lxv vs6, 544(r1)
+; PWR9BE-NEXT:    lxv vs7, 288(r1)
+; PWR9BE-NEXT:    lxv vs8, 352(r1)
+; PWR9BE-NEXT:    lxv vs9, 480(r1)
+; PWR9BE-NEXT:    lxv vs10, 368(r1)
+; PWR9BE-NEXT:    lxv vs11, 496(r1)
+; PWR9BE-NEXT:    lxv vs12, 240(r1)
+; PWR9BE-NEXT:    lxv vs13, 304(r1)
+; PWR9BE-NEXT:    lxv v0, 432(r1)
+; PWR9BE-NEXT:    lxv v1, 400(r1)
+; PWR9BE-NEXT:    lxv v14, 528(r1)
+; PWR9BE-NEXT:    lxv v15, 272(r1)
+; PWR9BE-NEXT:    lxv v16, 336(r1)
+; PWR9BE-NEXT:    lxv v17, 464(r1)
+; PWR9BE-NEXT:    xvadddp v12, v12, v17
+; PWR9BE-NEXT:    xvadddp v4, v4, v16
+; PWR9BE-NEXT:    xvadddp v14, v15, v14
+; PWR9BE-NEXT:    xvadddp v1, v8, v1
+; PWR9BE-NEXT:    xvadddp v0, v10, v0
+; PWR9BE-NEXT:    xvadddp vs13, v2, vs13
+; PWR9BE-NEXT:    xvadddp vs11, vs12, vs11
+; PWR9BE-NEXT:    xvadddp vs10, v6, vs10
+; PWR9BE-NEXT:    xvadddp vs9, v13, vs9
+; PWR9BE-NEXT:    xvadddp vs8, v5, vs8
+; PWR9BE-NEXT:    xvadddp vs6, vs7, vs6
+; PWR9BE-NEXT:    xvadddp vs5, v9, vs5
+; PWR9BE-NEXT:    xvadddp vs4, v11, vs4
+; PWR9BE-NEXT:    xvadddp vs1, vs2, vs1
+; PWR9BE-NEXT:    xvadddp vs0, v7, vs0
+; PWR9BE-NEXT:    xvadddp vs0, vs0, vs1
+; PWR9BE-NEXT:    xvadddp vs1, vs3, vs4
+; PWR9BE-NEXT:    xvadddp vs2, vs5, vs6
+; PWR9BE-NEXT:    xvadddp vs3, vs8, vs9
+; PWR9BE-NEXT:    xvadddp vs4, vs10, vs11
+; PWR9BE-NEXT:    xvadddp vs5, vs13, v0
+; PWR9BE-NEXT:    xvadddp vs6, v1, v14
+; PWR9BE-NEXT:    xvadddp vs7, v4, v12
+; PWR9BE-NEXT:    xvadddp vs6, vs7, vs6
+; PWR9BE-NEXT:    xvadddp vs4, vs5, vs4
+; PWR9BE-NEXT:    xvadddp vs2, vs3, vs2
+; PWR9BE-NEXT:    xvadddp vs0, vs1, vs0
+; PWR9BE-NEXT:    xvadddp vs0, vs0, vs2
+; PWR9BE-NEXT:    xvadddp vs1, vs4, vs6
+; PWR9BE-NEXT:    xvadddp vs0, vs1, vs0
+; PWR9BE-NEXT:    xxswapd vs1, vs0
+; PWR9BE-NEXT:    xvadddp vs1, vs0, vs1
+; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v64f64_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    lxv vs0, 368(r1)
+; PWR10LE-NEXT:    lxv vs1, 496(r1)
+; PWR10LE-NEXT:    xvadddp vs0, v7, vs0
+; PWR10LE-NEXT:    lxv vs2, 240(r1)
+; PWR10LE-NEXT:    lxv vs3, 304(r1)
+; PWR10LE-NEXT:    lxv vs4, 432(r1)
+; PWR10LE-NEXT:    lxv vs5, 400(r1)
+; PWR10LE-NEXT:    lxv vs6, 528(r1)
+; PWR10LE-NEXT:    lxv vs7, 272(r1)
+; PWR10LE-NEXT:    lxv vs8, 336(r1)
+; PWR10LE-NEXT:    lxv vs9, 464(r1)
+; PWR10LE-NEXT:    lxv vs10, 352(r1)
+; PWR10LE-NEXT:    lxv vs11, 480(r1)
+; PWR10LE-NEXT:    lxv vs12, 224(r1)
+; PWR10LE-NEXT:    lxv vs13, 288(r1)
+; PWR10LE-NEXT:    xvadddp vs13, v2, vs13
+; PWR10LE-NEXT:    xvadddp vs11, vs12, vs11
+; PWR10LE-NEXT:    xvadddp vs10, v6, vs10
+; PWR10LE-NEXT:    xvadddp vs9, v13, vs9
+; PWR10LE-NEXT:    xvadddp vs8, v5, vs8
+; PWR10LE-NEXT:    xvadddp vs6, vs7, vs6
+; PWR10LE-NEXT:    xvadddp vs5, v9, vs5
+; PWR10LE-NEXT:    xvadddp vs4, v11, vs4
+; PWR10LE-NEXT:    xvadddp vs3, v3, vs3
+; PWR10LE-NEXT:    xvadddp vs1, vs2, vs1
+; PWR10LE-NEXT:    xvadddp vs0, vs0, vs1
+; PWR10LE-NEXT:    lxv v0, 416(r1)
+; PWR10LE-NEXT:    lxv v1, 384(r1)
+; PWR10LE-NEXT:    lxv v14, 512(r1)
+; PWR10LE-NEXT:    lxv v15, 256(r1)
+; PWR10LE-NEXT:    lxv v16, 320(r1)
+; PWR10LE-NEXT:    lxv v17, 448(r1)
+; PWR10LE-NEXT:    xvadddp v12, v12, v17
+; PWR10LE-NEXT:    xvadddp v4, v4, v16
+; PWR10LE-NEXT:    xvadddp v14, v15, v14
+; PWR10LE-NEXT:    xvadddp v1, v8, v1
+; PWR10LE-NEXT:    xvadddp v0, v10, v0
+; PWR10LE-NEXT:    xvadddp vs1, vs3, vs4
+; PWR10LE-NEXT:    xvadddp vs2, vs5, vs6
+; PWR10LE-NEXT:    xvadddp vs3, vs8, vs9
+; PWR10LE-NEXT:    xvadddp vs4, vs10, vs11
+; PWR10LE-NEXT:    xvadddp vs5, vs13, v0
+; PWR10LE-NEXT:    xvadddp vs6, v1, v14
+; PWR10LE-NEXT:    xvadddp vs7, v4, v12
+; PWR10LE-NEXT:    xvadddp vs6, vs7, vs6
+; PWR10LE-NEXT:    xvadddp vs4, vs5, vs4
+; PWR10LE-NEXT:    xvadddp vs2, vs3, vs2
+; PWR10LE-NEXT:    xvadddp vs0, vs1, vs0
+; PWR10LE-NEXT:    xvadddp vs0, vs0, vs2
+; PWR10LE-NEXT:    xvadddp vs1, vs4, vs6
+; PWR10LE-NEXT:    xvadddp vs0, vs1, vs0
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xvadddp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v64f64_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    lxv vs0, 384(r1)
+; PWR10BE-NEXT:    lxv vs1, 512(r1)
+; PWR10BE-NEXT:    xvadddp vs0, v7, vs0
+; PWR10BE-NEXT:    lxv vs2, 256(r1)
+; PWR10BE-NEXT:    lxv vs3, 320(r1)
+; PWR10BE-NEXT:    lxv vs4, 448(r1)
+; PWR10BE-NEXT:    lxv vs5, 416(r1)
+; PWR10BE-NEXT:    lxv vs6, 544(r1)
+; PWR10BE-NEXT:    lxv vs7, 288(r1)
+; PWR10BE-NEXT:    lxv vs8, 352(r1)
+; PWR10BE-NEXT:    lxv vs9, 480(r1)
+; PWR10BE-NEXT:    lxv vs10, 368(r1)
+; PWR10BE-NEXT:    lxv vs11, 496(r1)
+; PWR10BE-NEXT:    lxv vs12, 240(r1)
+; PWR10BE-NEXT:    lxv vs13, 304(r1)
+; PWR10BE-NEXT:    xvadddp vs13, v2, vs13
+; PWR10BE-NEXT:    xvadddp vs11, vs12, vs11
+; PWR10BE-NEXT:    xvadddp vs10, v6, vs10
+; PWR10BE-NEXT:    xvadddp vs9, v13, vs9
+; PWR10BE-NEXT:    xvadddp vs8, v5, vs8
+; PWR10BE-NEXT:    xvadddp vs6, vs7, vs6
+; PWR10BE-NEXT:    xvadddp vs5, v9, vs5
+; PWR10BE-NEXT:    xvadddp vs4, v11, vs4
+; PWR10BE-NEXT:    xvadddp vs3, v3, vs3
+; PWR10BE-NEXT:    xvadddp vs1, vs2, vs1
+; PWR10BE-NEXT:    xvadddp vs0, vs0, vs1
+; PWR10BE-NEXT:    lxv v0, 432(r1)
+; PWR10BE-NEXT:    lxv v1, 400(r1)
+; PWR10BE-NEXT:    lxv v14, 528(r1)
+; PWR10BE-NEXT:    lxv v15, 272(r1)
+; PWR10BE-NEXT:    lxv v16, 336(r1)
+; PWR10BE-NEXT:    lxv v17, 464(r1)
+; PWR10BE-NEXT:    xvadddp v12, v12, v17
+; PWR10BE-NEXT:    xvadddp v4, v4, v16
+; PWR10BE-NEXT:    xvadddp v14, v15, v14
+; PWR10BE-NEXT:    xvadddp v1, v8, v1
+; PWR10BE-NEXT:    xvadddp v0, v10, v0
+; PWR10BE-NEXT:    xvadddp vs1, vs3, vs4
+; PWR10BE-NEXT:    xvadddp vs2, vs5, vs6
+; PWR10BE-NEXT:    xvadddp vs3, vs8, vs9
+; PWR10BE-NEXT:    xvadddp vs4, vs10, vs11
+; PWR10BE-NEXT:    xvadddp vs5, vs13, v0
+; PWR10BE-NEXT:    xvadddp vs6, v1, v14
+; PWR10BE-NEXT:    xvadddp vs7, v4, v12
+; PWR10BE-NEXT:    xvadddp vs6, vs7, vs6
+; PWR10BE-NEXT:    xvadddp vs4, vs5, vs4
+; PWR10BE-NEXT:    xvadddp vs2, vs3, vs2
+; PWR10BE-NEXT:    xvadddp vs0, vs1, vs0
+; PWR10BE-NEXT:    xvadddp vs0, vs0, vs2
+; PWR10BE-NEXT:    xvadddp vs1, vs4, vs6
+; PWR10BE-NEXT:    xvadddp vs0, vs1, vs0
+; PWR10BE-NEXT:    xxswapd vs1, vs0
+; PWR10BE-NEXT:    xvadddp vs1, vs0, vs1
+; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10BE-NEXT:    blr
entry:
  %0 = call fast double @llvm.vector.reduce.fadd.v64f64(double -0.000000e+00, <64 x double> %a)
  ret double %0
}
+
+declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>) #0
+declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>) #0
+declare double @llvm.vector.reduce.fadd.v8f64(double, <8 x double>) #0
+declare double @llvm.vector.reduce.fadd.v16f64(double, <16 x double>) #0
+declare double @llvm.vector.reduce.fadd.v32f64(double, <32 x double>) #0
+declare double @llvm.vector.reduce.fadd.v64f64(double, <64 x double>) #0
+
+;;
+;; Vectors of ppc_fp128
+;;
+;; Ordered fadd reduction of <2 x ppc_fp128> with a -0.0 start value.
+;; ppc_fp128 addition has no hardware instruction, so each add is a call to
+;; the __gcc_qadd runtime routine; only one call is emitted here (the -0.0
+;; identity add is folded).  PWR10LE calls via @notoc, so no TOC-restore nop
+;; follows the bl.
+define dso_local ppc_fp128 @v2ppcf128(<2 x ppc_fp128> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2ppcf128:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    mflr r0
+; PWR9LE-NEXT:    std r0, 16(r1)
+; PWR9LE-NEXT:    stdu r1, -32(r1)
+; PWR9LE-NEXT:    bl __gcc_qadd
+; PWR9LE-NEXT:    nop
+; PWR9LE-NEXT:    addi r1, r1, 32
+; PWR9LE-NEXT:    ld r0, 16(r1)
+; PWR9LE-NEXT:    mtlr r0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2ppcf128:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    mflr r0
+; PWR9BE-NEXT:    std r0, 16(r1)
+; PWR9BE-NEXT:    stdu r1, -112(r1)
+; PWR9BE-NEXT:    bl __gcc_qadd
+; PWR9BE-NEXT:    nop
+; PWR9BE-NEXT:    addi r1, r1, 112
+; PWR9BE-NEXT:    ld r0, 16(r1)
+; PWR9BE-NEXT:    mtlr r0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2ppcf128:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    mflr r0
+; PWR10LE-NEXT:    std r0, 16(r1)
+; PWR10LE-NEXT:    stdu r1, -32(r1)
+; PWR10LE-NEXT:    bl __gcc_qadd@notoc
+; PWR10LE-NEXT:    addi r1, r1, 32
+; PWR10LE-NEXT:    ld r0, 16(r1)
+; PWR10LE-NEXT:    mtlr r0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2ppcf128:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    mflr r0
+; PWR10BE-NEXT:    std r0, 16(r1)
+; PWR10BE-NEXT:    stdu r1, -112(r1)
+; PWR10BE-NEXT:    bl __gcc_qadd
+; PWR10BE-NEXT:    nop
+; PWR10BE-NEXT:    addi r1, r1, 112
+; PWR10BE-NEXT:    ld r0, 16(r1)
+; PWR10BE-NEXT:    mtlr r0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call ppc_fp128 @llvm.vector.reduce.fadd.v2ppcf128(ppc_fp128 0xM80000000000000000000000000000000, <2 x ppc_fp128> %a)
+  ret ppc_fp128 %0
+}
+
+;; Ordered fadd reduction of <2 x ppc_fp128> with a variable start value %b,
+;; forcing two sequential __gcc_qadd calls.  The second vector element (f3/f4)
+;; is kept alive across the first call in callee-saved f30/f31.
+define dso_local ppc_fp128 @v2ppcf128_b(<2 x ppc_fp128> %a, ppc_fp128 %b) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2ppcf128_b:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    mflr r0
+; PWR9LE-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; PWR9LE-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; PWR9LE-NEXT:    std r0, 16(r1)
+; PWR9LE-NEXT:    stdu r1, -48(r1)
+; PWR9LE-NEXT:    fmr f31, f4
+; PWR9LE-NEXT:    fmr f30, f3
+; PWR9LE-NEXT:    fmr f4, f2
+; PWR9LE-NEXT:    fmr f3, f1
+; PWR9LE-NEXT:    fmr f1, f5
+; PWR9LE-NEXT:    fmr f2, f6
+; PWR9LE-NEXT:    bl __gcc_qadd
+; PWR9LE-NEXT:    nop
+; PWR9LE-NEXT:    fmr f3, f30
+; PWR9LE-NEXT:    fmr f4, f31
+; PWR9LE-NEXT:    bl __gcc_qadd
+; PWR9LE-NEXT:    nop
+; PWR9LE-NEXT:    addi r1, r1, 48
+; PWR9LE-NEXT:    ld r0, 16(r1)
+; PWR9LE-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; PWR9LE-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; PWR9LE-NEXT:    mtlr r0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2ppcf128_b:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    mflr r0
+; PWR9BE-NEXT:    std r0, 16(r1)
+; PWR9BE-NEXT:    stdu r1, -128(r1)
+; PWR9BE-NEXT:    stfd f30, 112(r1) # 8-byte Folded Spill
+; PWR9BE-NEXT:    stfd f31, 120(r1) # 8-byte Folded Spill
+; PWR9BE-NEXT:    fmr f31, f4
+; PWR9BE-NEXT:    fmr f30, f3
+; PWR9BE-NEXT:    fmr f4, f2
+; PWR9BE-NEXT:    fmr f3, f1
+; PWR9BE-NEXT:    fmr f1, f5
+; PWR9BE-NEXT:    fmr f2, f6
+; PWR9BE-NEXT:    bl __gcc_qadd
+; PWR9BE-NEXT:    nop
+; PWR9BE-NEXT:    fmr f3, f30
+; PWR9BE-NEXT:    fmr f4, f31
+; PWR9BE-NEXT:    bl __gcc_qadd
+; PWR9BE-NEXT:    nop
+; PWR9BE-NEXT:    lfd f31, 120(r1) # 8-byte Folded Reload
+; PWR9BE-NEXT:    lfd f30, 112(r1) # 8-byte Folded Reload
+; PWR9BE-NEXT:    addi r1, r1, 128
+; PWR9BE-NEXT:    ld r0, 16(r1)
+; PWR9BE-NEXT:    mtlr r0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2ppcf128_b:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    mflr r0
+; PWR10LE-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; PWR10LE-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; PWR10LE-NEXT:    std r0, 16(r1)
+; PWR10LE-NEXT:    stdu r1, -48(r1)
+; PWR10LE-NEXT:    fmr f31, f4
+; PWR10LE-NEXT:    fmr f30, f3
+; PWR10LE-NEXT:    fmr f4, f2
+; PWR10LE-NEXT:    fmr f3, f1
+; PWR10LE-NEXT:    fmr f1, f5
+; PWR10LE-NEXT:    fmr f2, f6
+; PWR10LE-NEXT:    bl __gcc_qadd@notoc
+; PWR10LE-NEXT:    fmr f3, f30
+; PWR10LE-NEXT:    fmr f4, f31
+; PWR10LE-NEXT:    bl __gcc_qadd@notoc
+; PWR10LE-NEXT:    addi r1, r1, 48
+; PWR10LE-NEXT:    ld r0, 16(r1)
+; PWR10LE-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; PWR10LE-NEXT:    mtlr r0
+; PWR10LE-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2ppcf128_b:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    mflr r0
+; PWR10BE-NEXT:    std r0, 16(r1)
+; PWR10BE-NEXT:    stdu r1, -128(r1)
+; PWR10BE-NEXT:    stfd f30, 112(r1) # 8-byte Folded Spill
+; PWR10BE-NEXT:    stfd f31, 120(r1) # 8-byte Folded Spill
+; PWR10BE-NEXT:    fmr f31, f4
+; PWR10BE-NEXT:    fmr f30, f3
+; PWR10BE-NEXT:    fmr f4, f2
+; PWR10BE-NEXT:    fmr f3, f1
+; PWR10BE-NEXT:    fmr f1, f5
+; PWR10BE-NEXT:    fmr f2, f6
+; PWR10BE-NEXT:    bl __gcc_qadd
+; PWR10BE-NEXT:    nop
+; PWR10BE-NEXT:    fmr f3, f30
+; PWR10BE-NEXT:    fmr f4, f31
+; PWR10BE-NEXT:    bl __gcc_qadd
+; PWR10BE-NEXT:    nop
+; PWR10BE-NEXT:    lfd f31, 120(r1) # 8-byte Folded Reload
+; PWR10BE-NEXT:    lfd f30, 112(r1) # 8-byte Folded Reload
+; PWR10BE-NEXT:    addi r1, r1, 128
+; PWR10BE-NEXT:    ld r0, 16(r1)
+; PWR10BE-NEXT:    mtlr r0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call ppc_fp128 @llvm.vector.reduce.fadd.v2ppcf128(ppc_fp128 %b, <2 x ppc_fp128> %a)
+  ret ppc_fp128 %0
+}
+
+;; Fast-math fadd reduction of <2 x ppc_fp128>: still a single __gcc_qadd
+;; call (the runtime routine is required regardless of fast-math).  The f1/f2
+;; result halves are round-tripped through the stack (stfd + lxv) and split
+;; back out with xxswapd before returning.
+define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2ppcf128_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    mflr r0
+; PWR9LE-NEXT:    std r0, 16(r1)
+; PWR9LE-NEXT:    stdu r1, -64(r1)
+; PWR9LE-NEXT:    bl __gcc_qadd
+; PWR9LE-NEXT:    nop
+; PWR9LE-NEXT:    stfd f2, 40(r1)
+; PWR9LE-NEXT:    stfd f1, 32(r1)
+; PWR9LE-NEXT:    lxv vs1, 32(r1)
+; PWR9LE-NEXT:    xxswapd vs2, vs1
+; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9LE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
+; PWR9LE-NEXT:    addi r1, r1, 64
+; PWR9LE-NEXT:    ld r0, 16(r1)
+; PWR9LE-NEXT:    mtlr r0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2ppcf128_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    mflr r0
+; PWR9BE-NEXT:    std r0, 16(r1)
+; PWR9BE-NEXT:    stdu r1, -144(r1)
+; PWR9BE-NEXT:    bl __gcc_qadd
+; PWR9BE-NEXT:    nop
+; PWR9BE-NEXT:    stfd f2, 120(r1)
+; PWR9BE-NEXT:    stfd f1, 112(r1)
+; PWR9BE-NEXT:    lxv vs1, 112(r1)
+; PWR9BE-NEXT:    xxswapd vs2, vs1
+; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9BE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
+; PWR9BE-NEXT:    addi r1, r1, 144
+; PWR9BE-NEXT:    ld r0, 16(r1)
+; PWR9BE-NEXT:    mtlr r0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2ppcf128_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    mflr r0
+; PWR10LE-NEXT:    std r0, 16(r1)
+; PWR10LE-NEXT:    stdu r1, -64(r1)
+; PWR10LE-NEXT:    bl __gcc_qadd@notoc
+; PWR10LE-NEXT:    stfd f2, 40(r1)
+; PWR10LE-NEXT:    stfd f1, 32(r1)
+; PWR10LE-NEXT:    lxv vs1, 32(r1)
+; PWR10LE-NEXT:    xxswapd vs2, vs1
+; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10LE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
+; PWR10LE-NEXT:    addi r1, r1, 64
+; PWR10LE-NEXT:    ld r0, 16(r1)
+; PWR10LE-NEXT:    mtlr r0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2ppcf128_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    mflr r0
+; PWR10BE-NEXT:    std r0, 16(r1)
+; PWR10BE-NEXT:    stdu r1, -144(r1)
+; PWR10BE-NEXT:    bl __gcc_qadd
+; PWR10BE-NEXT:    nop
+; PWR10BE-NEXT:    stfd f2, 120(r1)
+; PWR10BE-NEXT:    stfd f1, 112(r1)
+; PWR10BE-NEXT:    lxv vs1, 112(r1)
+; PWR10BE-NEXT:    xxswapd vs2, vs1
+; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10BE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
+; PWR10BE-NEXT:    addi r1, r1, 144
+; PWR10BE-NEXT:    ld r0, 16(r1)
+; PWR10BE-NEXT:    mtlr r0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast ppc_fp128 @llvm.vector.reduce.fadd.v2ppcf128(ppc_fp128 0xM80000000000000000000000000000000, <2 x ppc_fp128> %a)
+  ret ppc_fp128 %0
+}
+
+;; Ordered fadd reduction of <4 x ppc_fp128> (-0.0 start folded): a strict
+;; left-to-right chain of three __gcc_qadd calls.  Elements 2 and 3 (f5-f8)
+;; are preserved across the calls in callee-saved f28-f31.
+define dso_local ppc_fp128 @v4ppcf128(<4 x ppc_fp128> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4ppcf128:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    mflr r0
+; PWR9LE-NEXT:    stfd f28, -32(r1) # 8-byte Folded Spill
+; PWR9LE-NEXT:    stfd f29, -24(r1) # 8-byte Folded Spill
+; PWR9LE-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; PWR9LE-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; PWR9LE-NEXT:    std r0, 16(r1)
+; PWR9LE-NEXT:    stdu r1, -64(r1)
+; PWR9LE-NEXT:    fmr f31, f8
+; PWR9LE-NEXT:    fmr f30, f7
+; PWR9LE-NEXT:    fmr f29, f6
+; PWR9LE-NEXT:    fmr f28, f5
+; PWR9LE-NEXT:    bl __gcc_qadd
+; PWR9LE-NEXT:    nop
+; PWR9LE-NEXT:    fmr f3, f28
+; PWR9LE-NEXT:    fmr f4, f29
+; PWR9LE-NEXT:    bl __gcc_qadd
+; PWR9LE-NEXT:    nop
+; PWR9LE-NEXT:    fmr f3, f30
+; PWR9LE-NEXT:    fmr f4, f31
+; PWR9LE-NEXT:    bl __gcc_qadd
+; PWR9LE-NEXT:    nop
+; PWR9LE-NEXT:    addi r1, r1, 64
+; PWR9LE-NEXT:    ld r0, 16(r1)
+; PWR9LE-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; PWR9LE-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; PWR9LE-NEXT:    mtlr r0
+; PWR9LE-NEXT:    lfd f29, -24(r1) # 8-byte Folded Reload
+; PWR9LE-NEXT:    lfd f28, -32(r1) # 8-byte Folded Reload
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4ppcf128:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    mflr r0
+; PWR9BE-NEXT:    std r0, 16(r1)
+; PWR9BE-NEXT:    stdu r1, -144(r1)
+; PWR9BE-NEXT:    stfd f28, 112(r1) # 8-byte Folded Spill
+; PWR9BE-NEXT:    stfd f29, 120(r1) # 8-byte Folded Spill
+; PWR9BE-NEXT:    stfd f30, 128(r1) # 8-byte Folded Spill
+; PWR9BE-NEXT:    stfd f31, 136(r1) # 8-byte Folded Spill
+; PWR9BE-NEXT:    fmr f31, f8
+; PWR9BE-NEXT:    fmr f30, f7
+; PWR9BE-NEXT:    fmr f29, f6
+; PWR9BE-NEXT:    fmr f28, f5
+; PWR9BE-NEXT:    bl __gcc_qadd
+; PWR9BE-NEXT:    nop
+; PWR9BE-NEXT:    fmr f3, f28
+; PWR9BE-NEXT:    fmr f4, f29
+; PWR9BE-NEXT:    bl __gcc_qadd
+; PWR9BE-NEXT:    nop
+; PWR9BE-NEXT:    fmr f3, f30
+; PWR9BE-NEXT:    fmr f4, f31
+; PWR9BE-NEXT:    bl __gcc_qadd
+; PWR9BE-NEXT:    nop
+; PWR9BE-NEXT:    lfd f31, 136(r1) # 8-byte Folded Reload
+; PWR9BE-NEXT:    lfd f30, 128(r1) # 8-byte Folded Reload
+; PWR9BE-NEXT:    lfd f29, 120(r1) # 8-byte Folded Reload
+; PWR9BE-NEXT:    lfd f28, 112(r1) # 8-byte Folded Reload
+; PWR9BE-NEXT:    addi r1, r1, 144
+; PWR9BE-NEXT:    ld r0, 16(r1)
+; PWR9BE-NEXT:    mtlr r0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4ppcf128:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    mflr r0
+; PWR10LE-NEXT:    stfd f28, -32(r1) # 8-byte Folded Spill
+; PWR10LE-NEXT:    stfd f29, -24(r1) # 8-byte Folded Spill
+; PWR10LE-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; PWR10LE-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; PWR10LE-NEXT:    std r0, 16(r1)
+; PWR10LE-NEXT:    stdu r1, -64(r1)
+; PWR10LE-NEXT:    fmr f31, f8
+; PWR10LE-NEXT:    fmr f30, f7
+; PWR10LE-NEXT:    fmr f29, f6
+; PWR10LE-NEXT:    fmr f28, f5
+; PWR10LE-NEXT:    bl __gcc_qadd@notoc
+; PWR10LE-NEXT:    fmr f3, f28
+; PWR10LE-NEXT:    fmr f4, f29
+; PWR10LE-NEXT:    bl __gcc_qadd@notoc
+; PWR10LE-NEXT:    fmr f3, f30
+; PWR10LE-NEXT:    fmr f4, f31
+; PWR10LE-NEXT:    bl __gcc_qadd@notoc
+; PWR10LE-NEXT:    addi r1, r1, 64
+; PWR10LE-NEXT:    ld r0, 16(r1)
+; PWR10LE-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; PWR10LE-NEXT:    mtlr r0
+; PWR10LE-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; PWR10LE-NEXT:    lfd f29, -24(r1) # 8-byte Folded Reload
+; PWR10LE-NEXT:    lfd f28, -32(r1) # 8-byte Folded Reload
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4ppcf128:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    mflr r0
+; PWR10BE-NEXT:    std r0, 16(r1)
+; PWR10BE-NEXT:    stdu r1, -144(r1)
+; PWR10BE-NEXT:    stfd f28, 112(r1) # 8-byte Folded Spill
+; PWR10BE-NEXT:    stfd f29, 120(r1) # 8-byte Folded Spill
+; PWR10BE-NEXT:    fmr f29, f6
+; PWR10BE-NEXT:    fmr f28, f5
+; PWR10BE-NEXT:    stfd f30, 128(r1) # 8-byte Folded Spill
+; PWR10BE-NEXT:    stfd f31, 136(r1) # 8-byte Folded Spill
+; PWR10BE-NEXT:    fmr f31, f8
+; PWR10BE-NEXT:    fmr f30, f7
+; PWR10BE-NEXT:    bl __gcc_qadd
+; PWR10BE-NEXT:    nop
+; PWR10BE-NEXT:    fmr f3, f28
+; PWR10BE-NEXT:    fmr f4, f29
+; PWR10BE-NEXT:    bl __gcc_qadd
+; PWR10BE-NEXT:    nop
+; PWR10BE-NEXT:    fmr f3, f30
+; PWR10BE-NEXT:    fmr f4, f31
+; PWR10BE-NEXT:    bl __gcc_qadd
+; PWR10BE-NEXT:    nop
+; PWR10BE-NEXT:    lfd f31, 136(r1) # 8-byte Folded Reload
+; PWR10BE-NEXT:    lfd f30, 128(r1) # 8-byte Folded Reload
+; PWR10BE-NEXT:    lfd f29, 120(r1) # 8-byte Folded Reload
+; PWR10BE-NEXT:    lfd f28, 112(r1) # 8-byte Folded Reload
+; PWR10BE-NEXT:    addi r1, r1, 144
+; PWR10BE-NEXT:    ld r0, 16(r1)
+; PWR10BE-NEXT:    mtlr r0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call ppc_fp128 @llvm.vector.reduce.fadd.v4ppcf128(ppc_fp128 0xM80000000000000000000000000000000, <4 x ppc_fp128> %a)
+  ret ppc_fp128 %0
+}
+
+;; Ordered fadd reduction of <4 x ppc_fp128> with a variable start value %b:
+;; four sequential __gcc_qadd calls.  All four vector elements (f1-f8) are
+;; staged into callee-saved f26-f31 so they survive across the calls.
+define dso_local ppc_fp128 @v4ppcf128_b(<4 x ppc_fp128> %a, ppc_fp128 %b) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4ppcf128_b:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    mflr r0
+; PWR9LE-NEXT:    stfd f26, -48(r1) # 8-byte Folded Spill
+; PWR9LE-NEXT:    stfd f27, -40(r1) # 8-byte Folded Spill
+; PWR9LE-NEXT:    stfd f28, -32(r1) # 8-byte Folded Spill
+; PWR9LE-NEXT:    stfd f29, -24(r1) # 8-byte Folded Spill
+; PWR9LE-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; PWR9LE-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; PWR9LE-NEXT:    std r0, 16(r1)
+; PWR9LE-NEXT:    stdu r1, -80(r1)
+; PWR9LE-NEXT:    fmr f27, f4
+; PWR9LE-NEXT:    fmr f26, f3
+; PWR9LE-NEXT:    fmr f4, f2
+; PWR9LE-NEXT:    fmr f3, f1
+; PWR9LE-NEXT:    fmr f1, f9
+; PWR9LE-NEXT:    fmr f2, f10
+; PWR9LE-NEXT:    fmr f31, f8
+; PWR9LE-NEXT:    fmr f30, f7
+; PWR9LE-NEXT:    fmr f29, f6
+; PWR9LE-NEXT:    fmr f28, f5
+; PWR9LE-NEXT:    bl __gcc_qadd
+; PWR9LE-NEXT:    nop
+; PWR9LE-NEXT:    fmr f3, f26
+; PWR9LE-NEXT:    fmr f4, f27
+; PWR9LE-NEXT:    bl __gcc_qadd
+; PWR9LE-NEXT:    nop
+; PWR9LE-NEXT:    fmr f3, f28
+; PWR9LE-NEXT:    fmr f4, f29
+; PWR9LE-NEXT:    bl __gcc_qadd
+; PWR9LE-NEXT:    nop
+; PWR9LE-NEXT:    fmr f3, f30
+; PWR9LE-NEXT:    fmr f4, f31
+; PWR9LE-NEXT:    bl __gcc_qadd
+; PWR9LE-NEXT:    nop
+; PWR9LE-NEXT:    addi r1, r1, 80
+; PWR9LE-NEXT:    ld r0, 16(r1)
+; PWR9LE-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; PWR9LE-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; PWR9LE-NEXT:    mtlr r0
+; PWR9LE-NEXT:    lfd f29, -24(r1) # 8-byte Folded Reload
+; PWR9LE-NEXT:    lfd f28, -32(r1) # 8-byte Folded Reload
+; PWR9LE-NEXT:    lfd f27, -40(r1) # 8-byte Folded Reload
+; PWR9LE-NEXT:    lfd f26, -48(r1) # 8-byte Folded Reload
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4ppcf128_b:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    mflr r0
+; PWR9BE-NEXT:    std r0, 16(r1)
+; PWR9BE-NEXT:    stdu r1, -160(r1)
+; PWR9BE-NEXT:    stfd f26, 112(r1) # 8-byte Folded Spill
+; PWR9BE-NEXT:    stfd f27, 120(r1) # 8-byte Folded Spill
+; PWR9BE-NEXT:    fmr f27, f4
+; PWR9BE-NEXT:    fmr f26, f3
+; PWR9BE-NEXT:    fmr f4, f2
+; PWR9BE-NEXT:    fmr f3, f1
+; PWR9BE-NEXT:    fmr f1, f9
+; PWR9BE-NEXT:    fmr f2, f10
+; PWR9BE-NEXT:    stfd f28, 128(r1) # 8-byte Folded Spill
+; PWR9BE-NEXT:    stfd f29, 136(r1) # 8-byte Folded Spill
+; PWR9BE-NEXT:    stfd f30, 144(r1) # 8-byte Folded Spill
+; PWR9BE-NEXT:    stfd f31, 152(r1) # 8-byte Folded Spill
+; PWR9BE-NEXT:    fmr f31, f8
+; PWR9BE-NEXT:    fmr f30, f7
+; PWR9BE-NEXT:    fmr f29, f6
+; PWR9BE-NEXT:    fmr f28, f5
+; PWR9BE-NEXT:    bl __gcc_qadd
+; PWR9BE-NEXT:    nop
+; PWR9BE-NEXT:    fmr f3, f26
+; PWR9BE-NEXT:    fmr f4, f27
+; PWR9BE-NEXT:    bl __gcc_qadd
+; PWR9BE-NEXT:    nop
+; PWR9BE-NEXT:    fmr f3, f28
+; PWR9BE-NEXT:    fmr f4, f29
+; PWR9BE-NEXT:    bl __gcc_qadd
+; PWR9BE-NEXT:    nop
+; PWR9BE-NEXT:    fmr f3, f30
+; PWR9BE-NEXT:    fmr f4, f31
+; PWR9BE-NEXT:    bl __gcc_qadd
+; PWR9BE-NEXT:    nop
+; PWR9BE-NEXT:    lfd f31, 152(r1) # 8-byte Folded Reload
+; PWR9BE-NEXT:    lfd f30, 144(r1) # 8-byte Folded Reload
+; PWR9BE-NEXT:    lfd f29, 136(r1) # 8-byte Folded Reload
+; PWR9BE-NEXT:    lfd f28, 128(r1) # 8-byte Folded Reload
+; PWR9BE-NEXT:    lfd f27, 120(r1) # 8-byte Folded Reload
+; PWR9BE-NEXT:    lfd f26, 112(r1) # 8-byte Folded Reload
+; PWR9BE-NEXT:    addi r1, r1, 160
+; PWR9BE-NEXT:    ld r0, 16(r1)
+; PWR9BE-NEXT:    mtlr r0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4ppcf128_b:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    mflr r0
+; PWR10LE-NEXT:    stfd f26, -48(r1) # 8-byte Folded Spill
+; PWR10LE-NEXT:    stfd f27, -40(r1) # 8-byte Folded Spill
+; PWR10LE-NEXT:    stfd f28, -32(r1) # 8-byte Folded Spill
+; PWR10LE-NEXT:    stfd f29, -24(r1) # 8-byte Folded Spill
+; PWR10LE-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; PWR10LE-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; PWR10LE-NEXT:    std r0, 16(r1)
+; PWR10LE-NEXT:    stdu r1, -80(r1)
+; PWR10LE-NEXT:    fmr f27, f4
+; PWR10LE-NEXT:    fmr f26, f3
+; PWR10LE-NEXT:    fmr f4, f2
+; PWR10LE-NEXT:    fmr f3, f1
+; PWR10LE-NEXT:    fmr f1, f9
+; PWR10LE-NEXT:    fmr f2, f10
+; PWR10LE-NEXT:    fmr f31, f8
+; PWR10LE-NEXT:    fmr f30, f7
+; PWR10LE-NEXT:    fmr f29, f6
+; PWR10LE-NEXT:    fmr f28, f5
+; PWR10LE-NEXT:    bl __gcc_qadd@notoc
+; PWR10LE-NEXT:    fmr f3, f26
+; PWR10LE-NEXT:    fmr f4, f27
+; PWR10LE-NEXT:    bl __gcc_qadd@notoc
+; PWR10LE-NEXT:    fmr f3, f28
+; PWR10LE-NEXT:    fmr f4, f29
+; PWR10LE-NEXT:    bl __gcc_qadd@notoc
+; PWR10LE-NEXT:    fmr f3, f30
+; PWR10LE-NEXT:    fmr f4, f31
+; PWR10LE-NEXT:    bl __gcc_qadd@notoc
+; PWR10LE-NEXT:    addi r1, r1, 80
+; PWR10LE-NEXT:    ld r0, 16(r1)
+; PWR10LE-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; PWR10LE-NEXT:    mtlr r0
+; PWR10LE-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; PWR10LE-NEXT:    lfd f29, -24(r1) # 8-byte Folded Reload
+; PWR10LE-NEXT:    lfd f28, -32(r1) # 8-byte Folded Reload
+; PWR10LE-NEXT:    lfd f27, -40(r1) # 8-byte Folded Reload
+; PWR10LE-NEXT:    lfd f26, -48(r1) # 8-byte Folded Reload
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4ppcf128_b:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    mflr r0
+; PWR10BE-NEXT:    std r0, 16(r1)
+; PWR10BE-NEXT:    stdu r1, -160(r1)
+; PWR10BE-NEXT:    stfd f26, 112(r1) # 8-byte Folded Spill
+; PWR10BE-NEXT:    stfd f27, 120(r1) # 8-byte Folded Spill
+; PWR10BE-NEXT:    fmr f27, f4
+; PWR10BE-NEXT:    fmr f26, f3
+; PWR10BE-NEXT:    fmr f4, f2
+; PWR10BE-NEXT:    fmr f3, f1
+; PWR10BE-NEXT:    fmr f1, f9
+; PWR10BE-NEXT:    stfd f28, 128(r1) # 8-byte Folded Spill
+; PWR10BE-NEXT:    stfd f29, 136(r1) # 8-byte Folded Spill
+; PWR10BE-NEXT:    fmr f2, f10
+; PWR10BE-NEXT:    fmr f29, f6
+; PWR10BE-NEXT:    fmr f28, f5
+; PWR10BE-NEXT:    stfd f30, 144(r1) # 8-byte Folded Spill
+; PWR10BE-NEXT:    stfd f31, 152(r1) # 8-byte Folded Spill
+; PWR10BE-NEXT:    fmr f31, f8
+; PWR10BE-NEXT:    fmr f30, f7
+; PWR10BE-NEXT:    bl __gcc_qadd
+; PWR10BE-NEXT:    nop
+; PWR10BE-NEXT:    fmr f3, f26
+; PWR10BE-NEXT:    fmr f4, f27
+; PWR10BE-NEXT:    bl __gcc_qadd
+; PWR10BE-NEXT:    nop
+; PWR10BE-NEXT:    fmr f3, f28
+; PWR10BE-NEXT:    fmr f4, f29
+; PWR10BE-NEXT:    bl __gcc_qadd
+; PWR10BE-NEXT:    nop
+; PWR10BE-NEXT:    fmr f3, f30
+; PWR10BE-NEXT:    fmr f4, f31
+; PWR10BE-NEXT:    bl __gcc_qadd
+; PWR10BE-NEXT:    nop
+; PWR10BE-NEXT:    lfd f31, 152(r1) # 8-byte Folded Reload
+; PWR10BE-NEXT:    lfd f30, 144(r1) # 8-byte Folded Reload
+; PWR10BE-NEXT:    lfd f29, 136(r1) # 8-byte Folded Reload
+; PWR10BE-NEXT:    lfd f28, 128(r1) # 8-byte Folded Reload
+; PWR10BE-NEXT:    lfd f27, 120(r1) # 8-byte Folded Reload
+; PWR10BE-NEXT:    lfd f26, 112(r1) # 8-byte Folded Reload
+; PWR10BE-NEXT:    addi r1, r1, 160
+; PWR10BE-NEXT:    ld r0, 16(r1)
+; PWR10BE-NEXT:    mtlr r0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call ppc_fp128 @llvm.vector.reduce.fadd.v4ppcf128(ppc_fp128 %b, <4 x ppc_fp128> %a)
+  ret ppc_fp128 %0
+}
+
+define dso_local ppc_fp128 @v4ppcf128_fast(<4 x ppc_fp128> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4ppcf128_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    mflr r0
+; PWR9LE-NEXT:    stfd f26, -48(r1) # 8-byte Folded Spill
+; PWR9LE-NEXT:    stfd f27, -40(r1) # 8-byte Folded Spill
+; PWR9LE-NEXT:    stfd f28, -32(r1) # 8-byte Folded Spill
+; PWR9LE-NEXT:    stfd f29, -24(r1) # 8-byte Folded Spill
+; PWR9LE-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; PWR9LE-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; PWR9LE-NEXT:    std r0, 16(r1)
+; PWR9LE-NEXT:    stdu r1, -96(r1)
+; PWR9LE-NEXT:    fmr f29, f4
+; PWR9LE-NEXT:    fmr f28, f3
+; PWR9LE-NEXT:    fmr f3, f5
+; PWR9LE-NEXT:    fmr f4, f6
+; PWR9LE-NEXT:    fmr f31, f8
+; PWR9LE-NEXT:    fmr f30, f7
+; PWR9LE-NEXT:    bl __gcc_qadd
+; PWR9LE-NEXT:    nop
+; PWR9LE-NEXT:    fmr f27, f1
+; PWR9LE-NEXT:    fmr f26, f2
+; PWR9LE-NEXT:    fmr f1, f28
+; PWR9LE-NEXT:    fmr f2, f29
+; PWR9LE-NEXT:    fmr f3, f30
+; PWR9LE-NEXT:    fmr f4, f31
+; PWR9LE-NEXT:    bl __gcc_qadd
+; PWR9LE-NEXT:    nop
+; PWR9LE-NEXT:    fmr f3, f1
+; PWR9LE-NEXT:    fmr f4, f2
+; PWR9LE-NEXT:    fmr f1, f27
+; PWR9LE-NEXT:    fmr f2, f26
+; PWR9LE-NEXT:    bl __gcc_qadd
+; PWR9LE-NEXT:    nop
+; PWR9LE-NEXT:    stfd f2, 40(r1)
+; PWR9LE-NEXT:    stfd f1, 32(r1)
+; PWR9LE-NEXT:    lxv vs1, 32(r1)
+; PWR9LE-NEXT:    xxswapd vs2, vs1
+; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9LE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
+; PWR9LE-NEXT:    addi r1, r1, 96
+; PWR9LE-NEXT:    ld r0, 16(r1)
+; PWR9LE-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; PWR9LE-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; PWR9LE-NEXT:    mtlr r0
+; PWR9LE-NEXT:    lfd f29, -24(r1) # 8-byte Folded Reload
+; PWR9LE-NEXT:    lfd f28, -32(r1) # 8-byte Folded Reload
+; PWR9LE-NEXT:    lfd f27, -40(r1) # 8-byte Folded Reload
+; PWR9LE-NEXT:    lfd f26, -48(r1) # 8-byte Folded Reload
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4ppcf128_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    mflr r0
+; PWR9BE-NEXT:    std r0, 16(r1)
+; PWR9BE-NEXT:    stdu r1, -176(r1)
+; PWR9BE-NEXT:    stfd f28, 144(r1) # 8-byte Folded Spill
+; PWR9BE-NEXT:    stfd f29, 152(r1) # 8-byte Folded Spill
+; PWR9BE-NEXT:    fmr f29, f4
+; PWR9BE-NEXT:    fmr f28, f3
+; PWR9BE-NEXT:    fmr f3, f5
+; PWR9BE-NEXT:    fmr f4, f6
+; PWR9BE-NEXT:    stfd f26, 128(r1) # 8-byte Folded Spill
+; PWR9BE-NEXT:    stfd f27, 136(r1) # 8-byte Folded Spill
+; PWR9BE-NEXT:    stfd f30, 160(r1) # 8-byte Folded Spill
+; PWR9BE-NEXT:    stfd f31, 168(r1) # 8-byte Folded Spill
+; PWR9BE-NEXT:    fmr f31, f8
+; PWR9BE-NEXT:    fmr f30, f7
+; PWR9BE-NEXT:    bl __gcc_qadd
+; PWR9BE-NEXT:    nop
+; PWR9BE-NEXT:    fmr f27, f1
+; PWR9BE-NEXT:    fmr f26, f2
+; PWR9BE-NEXT:    fmr f1, f28
+; PWR9BE-NEXT:    fmr f2, f29
+; PWR9BE-NEXT:    fmr f3, f30
+; PWR9BE-NEXT:    fmr f4, f31
+; PWR9BE-NEXT:    bl __gcc_qadd
+; PWR9BE-NEXT:    nop
+; PWR9BE-NEXT:    fmr f3, f1
+; PWR9BE-NEXT:    fmr f4, f2
+; PWR9BE-NEXT:    fmr f1, f27
+; PWR9BE-NEXT:    fmr f2, f26
+; PWR9BE-NEXT:    bl __gcc_qadd
+; PWR9BE-NEXT:    nop
+; PWR9BE-NEXT:    stfd f2, 120(r1)
+; PWR9BE-NEXT:    stfd f1, 112(r1)
+; PWR9BE-NEXT:    lxv vs1, 112(r1)
+; PWR9BE-NEXT:    lfd f31, 168(r1) # 8-byte Folded Reload
+; PWR9BE-NEXT:    lfd f30, 160(r1) # 8-byte Folded Reload
+; PWR9BE-NEXT:    xxswapd vs2, vs1
+; PWR9BE-NEXT:    lfd f29, 152(r1) # 8-byte Folded Reload
+; PWR9BE-NEXT:    lfd f28, 144(r1) # 8-byte Folded Reload
+; PWR9BE-NEXT:    lfd f27, 136(r1) # 8-byte Folded Reload
+; PWR9BE-NEXT:    lfd f26, 128(r1) # 8-byte Folded Reload
+; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9BE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
+; PWR9BE-NEXT:    addi r1, r1, 176
+; PWR9BE-NEXT:    ld r0, 16(r1)
+; PWR9BE-NEXT:    mtlr r0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4ppcf128_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    mflr r0
+; PWR10LE-NEXT:    stfd f26, -48(r1) # 8-byte Folded Spill
+; PWR10LE-NEXT:    stfd f27, -40(r1) # 8-byte Folded Spill
+; PWR10LE-NEXT:    stfd f28, -32(r1) # 8-byte Folded Spill
+; PWR10LE-NEXT:    stfd f29, -24(r1) # 8-byte Folded Spill
+; PWR10LE-NEXT:    stfd f30, -16(r1) # 8-byte Folded Spill
+; PWR10LE-NEXT:    stfd f31, -8(r1) # 8-byte Folded Spill
+; PWR10LE-NEXT:    std r0, 16(r1)
+; PWR10LE-NEXT:    stdu r1, -96(r1)
+; PWR10LE-NEXT:    fmr f29, f4
+; PWR10LE-NEXT:    fmr f28, f3
+; PWR10LE-NEXT:    fmr f3, f5
+; PWR10LE-NEXT:    fmr f4, f6
+; PWR10LE-NEXT:    fmr f31, f8
+; PWR10LE-NEXT:    fmr f30, f7
+; PWR10LE-NEXT:    bl __gcc_qadd@notoc
+; PWR10LE-NEXT:    fmr f27, f1
+; PWR10LE-NEXT:    fmr f26, f2
+; PWR10LE-NEXT:    fmr f1, f28
+; PWR10LE-NEXT:    fmr f2, f29
+; PWR10LE-NEXT:    fmr f3, f30
+; PWR10LE-NEXT:    fmr f4, f31
+; PWR10LE-NEXT:    bl __gcc_qadd@notoc
+; PWR10LE-NEXT:    fmr f3, f1
+; PWR10LE-NEXT:    fmr f4, f2
+; PWR10LE-NEXT:    fmr f1, f27
+; PWR10LE-NEXT:    fmr f2, f26
+; PWR10LE-NEXT:    bl __gcc_qadd@notoc
+; PWR10LE-NEXT:    stfd f2, 40(r1)
+; PWR10LE-NEXT:    stfd f1, 32(r1)
+; PWR10LE-NEXT:    lxv vs1, 32(r1)
+; PWR10LE-NEXT:    xxswapd vs2, vs1
+; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10LE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
+; PWR10LE-NEXT:    addi r1, r1, 96
+; PWR10LE-NEXT:    ld r0, 16(r1)
+; PWR10LE-NEXT:    lfd f31, -8(r1) # 8-byte Folded Reload
+; PWR10LE-NEXT:    mtlr r0
+; PWR10LE-NEXT:    lfd f30, -16(r1) # 8-byte Folded Reload
+; PWR10LE-NEXT:    lfd f29, -24(r1) # 8-byte Folded Reload
+; PWR10LE-NEXT:    lfd f28, -32(r1) # 8-byte Folded Reload
+; PWR10LE-NEXT:    lfd f27, -40(r1) # 8-byte Folded Reload
+; PWR10LE-NEXT:    lfd f26, -48(r1) # 8-byte Folded Reload
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4ppcf128_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    mflr r0
+; PWR10BE-NEXT:    std r0, 16(r1)
+; PWR10BE-NEXT:    stdu r1, -176(r1)
+; PWR10BE-NEXT:    stfd f28, 144(r1) # 8-byte Folded Spill
+; PWR10BE-NEXT:    stfd f29, 152(r1) # 8-byte Folded Spill
+; PWR10BE-NEXT:    fmr f29, f4
+; PWR10BE-NEXT:    fmr f28, f3
+; PWR10BE-NEXT:    fmr f3, f5
+; PWR10BE-NEXT:    fmr f4, f6
+; PWR10BE-NEXT:    stfd f26, 128(r1) # 8-byte Folded Spill
+; PWR10BE-NEXT:    stfd f27, 136(r1) # 8-byte Folded Spill
+; PWR10BE-NEXT:    stfd f30, 160(r1) # 8-byte Folded Spill
+; PWR10BE-NEXT:    stfd f31, 168(r1) # 8-byte Folded Spill
+; PWR10BE-NEXT:    fmr f31, f8
+; PWR10BE-NEXT:    fmr f30, f7
+; PWR10BE-NEXT:    bl __gcc_qadd
+; PWR10BE-NEXT:    nop
+; PWR10BE-NEXT:    fmr f27, f1
+; PWR10BE-NEXT:    fmr f26, f2
+; PWR10BE-NEXT:    fmr f1, f28
+; PWR10BE-NEXT:    fmr f2, f29
+; PWR10BE-NEXT:    fmr f3, f30
+; PWR10BE-NEXT:    fmr f4, f31
+; PWR10BE-NEXT:    bl __gcc_qadd
+; PWR10BE-NEXT:    nop
+; PWR10BE-NEXT:    fmr f3, f1
+; PWR10BE-NEXT:    fmr f4, f2
+; PWR10BE-NEXT:    fmr f1, f27
+; PWR10BE-NEXT:    fmr f2, f26
+; PWR10BE-NEXT:    bl __gcc_qadd
+; PWR10BE-NEXT:    nop
+; PWR10BE-NEXT:    stfd f2, 120(r1)
+; PWR10BE-NEXT:    stfd f1, 112(r1)
+; PWR10BE-NEXT:    lfd f31, 168(r1) # 8-byte Folded Reload
+; PWR10BE-NEXT:    lfd f30, 160(r1) # 8-byte Folded Reload
+; PWR10BE-NEXT:    lfd f29, 152(r1) # 8-byte Folded Reload
+; PWR10BE-NEXT:    lfd f28, 144(r1) # 8-byte Folded Reload
+; PWR10BE-NEXT:    lfd f27, 136(r1) # 8-byte Folded Reload
+; PWR10BE-NEXT:    lfd f26, 128(r1) # 8-byte Folded Reload
+; PWR10BE-NEXT:    lxv vs1, 112(r1)
+; PWR10BE-NEXT:    xxswapd vs2, vs1
+; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10BE-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
+; PWR10BE-NEXT:    addi r1, r1, 176
+; PWR10BE-NEXT:    ld r0, 16(r1)
+; PWR10BE-NEXT:    mtlr r0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast ppc_fp128 @llvm.vector.reduce.fadd.v4ppcf128(ppc_fp128 0xM80000000000000000000000000000000, <4 x ppc_fp128> %a)
+  ret ppc_fp128 %0
+}
+
+declare ppc_fp128 @llvm.vector.reduce.fadd.v2ppcf128(ppc_fp128, <2 x ppc_fp128>) #0
+declare ppc_fp128 @llvm.vector.reduce.fadd.v4ppcf128(ppc_fp128, <4 x ppc_fp128>) #0
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-fmax.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-fmax.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-fmax.ll
@@ -0,0 +1,1169 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr10 -mattr=-paired-vector-memops -mtriple=powerpc64le < %s | \
+; RUN:   FileCheck %s --check-prefix=PWR10LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr10 -mattr=-paired-vector-memops -mtriple=powerpc64 < %s | \
+; RUN:   FileCheck %s --check-prefix=PWR10BE
+
+;;
+;; Vectors of f32
+;;
+define dso_local float @v2f32(<2 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2f32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs0, v2
+; PWR9LE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR9LE-NEXT:    xscvspdpn f0, vs0
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmaxdp f1, f1, f0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2f32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9BE-NEXT:    xscvspdpn f0, v2
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmaxdp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2f32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs0, v2
+; PWR10LE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR10LE-NEXT:    xscvspdpn f0, vs0
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmaxdp f1, f1, f0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2f32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10BE-NEXT:    xscvspdpn f0, v2
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmaxdp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a)
+  ret float %0
+}
+
+define dso_local float @v2f32_fast(<2 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2f32_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxspltw vs0, v2, 2
+; PWR9LE-NEXT:    xvmaxsp vs0, v2, vs0
+; PWR9LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2f32_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxspltw vs0, v2, 1
+; PWR9BE-NEXT:    xvmaxsp vs0, v2, vs0
+; PWR9BE-NEXT:    xscvspdpn f1, vs0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2f32_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxspltw vs0, v2, 2
+; PWR10LE-NEXT:    xvmaxsp vs0, v2, vs0
+; PWR10LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2f32_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxspltw vs0, v2, 1
+; PWR10BE-NEXT:    xvmaxsp vs0, v2, vs0
+; PWR10BE-NEXT:    xscvspdpn f1, vs0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a)
+  ret float %0
+}
+
+define dso_local float @v4f32(<4 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4f32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxsldwi vs2, v2, v2, 3
+; PWR9LE-NEXT:    xxswapd vs3, v2
+; PWR9LE-NEXT:    xscvspdpn f0, v2
+; PWR9LE-NEXT:    xscvspdpn f2, vs2
+; PWR9LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9LE-NEXT:    xscvspdpn f3, vs3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmaxdp f2, f2, f3
+; PWR9LE-NEXT:    xsmaxdp f1, f2, f1
+; PWR9LE-NEXT:    xsmaxdp f1, f1, f0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4f32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxsldwi vs2, v2, v2, 1
+; PWR9BE-NEXT:    xxswapd vs1, v2
+; PWR9BE-NEXT:    xscvspdpn f3, v2
+; PWR9BE-NEXT:    xscvspdpn f2, vs2
+; PWR9BE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xscvspdpn f0, vs0
+; PWR9BE-NEXT:    xsmaxdp f2, f3, f2
+; PWR9BE-NEXT:    xsmaxdp f1, f2, f1
+; PWR9BE-NEXT:    xsmaxdp f1, f1, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4f32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxsldwi vs2, v2, v2, 3
+; PWR10LE-NEXT:    xxswapd vs3, v2
+; PWR10LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10LE-NEXT:    xscvspdpn f0, v2
+; PWR10LE-NEXT:    xscvspdpn f2, vs2
+; PWR10LE-NEXT:    xscvspdpn f3, vs3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmaxdp f2, f2, f3
+; PWR10LE-NEXT:    xsmaxdp f1, f2, f1
+; PWR10LE-NEXT:    xsmaxdp f1, f1, f0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4f32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxsldwi vs2, v2, v2, 1
+; PWR10BE-NEXT:    xxswapd vs1, v2
+; PWR10BE-NEXT:    xscvspdpn f3, v2
+; PWR10BE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR10BE-NEXT:    xscvspdpn f2, vs2
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xscvspdpn f0, vs0
+; PWR10BE-NEXT:    xsmaxdp f2, f3, f2
+; PWR10BE-NEXT:    xsmaxdp f1, f2, f1
+; PWR10BE-NEXT:    xsmaxdp f1, f1, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
+  ret float %0
+}
+
+define dso_local float @v4f32_fast(<4 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4f32_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    xvmaxsp vs0, v2, v3
+; PWR9LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR9LE-NEXT:    xvmaxsp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4f32_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    xvmaxsp vs0, v2, v3
+; PWR9BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR9BE-NEXT:    xvmaxsp vs0, vs0, vs1
+; PWR9BE-NEXT:    xscvspdpn f1, vs0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4f32_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    xvmaxsp vs0, v2, v3
+; PWR10LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR10LE-NEXT:    xvmaxsp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4f32_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    xvmaxsp vs0, v2, v3
+; PWR10BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR10BE-NEXT:    xvmaxsp vs0, vs0, vs1
+; PWR10BE-NEXT:    xscvspdpn f1, vs0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
+  ret float %0
+}
+
+define dso_local float @v8f32(<8 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8f32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvmaxsp vs0, v2, v3
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xxsldwi vs2, vs0, vs0, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xscvspdpn f2, vs2
+; PWR9LE-NEXT:    xsmaxdp f1, f2, f1
+; PWR9LE-NEXT:    xxsldwi vs2, vs0, vs0, 1
+; PWR9LE-NEXT:    xscvspdpn f0, vs0
+; PWR9LE-NEXT:    xscvspdpn f2, vs2
+; PWR9LE-NEXT:    xsmaxdp f1, f1, f2
+; PWR9LE-NEXT:    xsmaxdp f1, f1, f0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8f32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvmaxsp vs0, v2, v3
+; PWR9BE-NEXT:    xxsldwi vs2, vs0, vs0, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs0
+; PWR9BE-NEXT:    xscvspdpn f2, vs2
+; PWR9BE-NEXT:    xsmaxdp f1, f1, f2
+; PWR9BE-NEXT:    xxswapd vs2, vs0
+; PWR9BE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR9BE-NEXT:    xscvspdpn f2, vs2
+; PWR9BE-NEXT:    xscvspdpn f0, vs0
+; PWR9BE-NEXT:    xsmaxdp f1, f1, f2
+; PWR9BE-NEXT:    xsmaxdp f1, f1, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8f32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvmaxsp vs0, v2, v3
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xxsldwi vs2, vs0, vs0, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xscvspdpn f2, vs2
+; PWR10LE-NEXT:    xsmaxdp f1, f2, f1
+; PWR10LE-NEXT:    xxsldwi vs2, vs0, vs0, 1
+; PWR10LE-NEXT:    xscvspdpn f0, vs0
+; PWR10LE-NEXT:    xscvspdpn f2, vs2
+; PWR10LE-NEXT:    xsmaxdp f1, f1, f2
+; PWR10LE-NEXT:    xsmaxdp f1, f1, f0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8f32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvmaxsp vs0, v2, v3
+; PWR10BE-NEXT:    xxsldwi vs2, vs0, vs0, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs0
+; PWR10BE-NEXT:    xscvspdpn f2, vs2
+; PWR10BE-NEXT:    xsmaxdp f1, f1, f2
+; PWR10BE-NEXT:    xxswapd vs2, vs0
+; PWR10BE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR10BE-NEXT:    xscvspdpn f2, vs2
+; PWR10BE-NEXT:    xscvspdpn f0, vs0
+; PWR10BE-NEXT:    xsmaxdp f1, f1, f2
+; PWR10BE-NEXT:    xsmaxdp f1, f1, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a)
+  ret float %0
+}
+
+define dso_local float @v8f32_fast(<8 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8f32_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvmaxsp vs0, v2, v3
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xvmaxsp vs0, vs0, v2
+; PWR9LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR9LE-NEXT:    xvmaxsp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8f32_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvmaxsp vs0, v2, v3
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xvmaxsp vs0, vs0, v2
+; PWR9BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR9BE-NEXT:    xvmaxsp vs0, vs0, vs1
+; PWR9BE-NEXT:    xscvspdpn f1, vs0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8f32_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvmaxsp vs0, v2, v3
+; PWR10LE-NEXT:    xxswapd v2, vs0
+; PWR10LE-NEXT:    xvmaxsp vs0, vs0, v2
+; PWR10LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR10LE-NEXT:    xvmaxsp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8f32_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvmaxsp vs0, v2, v3
+; PWR10BE-NEXT:    xxswapd v2, vs0
+; PWR10BE-NEXT:    xvmaxsp vs0, vs0, v2
+; PWR10BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR10BE-NEXT:    xvmaxsp vs0, vs0, vs1
+; PWR10BE-NEXT:    xscvspdpn f1, vs0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a)
+  ret float %0
+}
+
+define dso_local float @v16f32(<16 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16f32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvmaxsp vs0, v3, v5
+; PWR9LE-NEXT:    xvmaxsp vs1, v2, v4
+; PWR9LE-NEXT:    xvmaxsp vs0, vs1, vs0
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xxsldwi vs2, vs0, vs0, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xscvspdpn f2, vs2
+; PWR9LE-NEXT:    xsmaxdp f1, f2, f1
+; PWR9LE-NEXT:    xxsldwi vs2, vs0, vs0, 1
+; PWR9LE-NEXT:    xscvspdpn f0, vs0
+; PWR9LE-NEXT:    xscvspdpn f2, vs2
+; PWR9LE-NEXT:    xsmaxdp f1, f1, f2
+; PWR9LE-NEXT:    xsmaxdp f1, f1, f0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16f32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvmaxsp vs0, v3, v5
+; PWR9BE-NEXT:    xvmaxsp vs1, v2, v4
+; PWR9BE-NEXT:    xvmaxsp vs0, vs1, vs0
+; PWR9BE-NEXT:    xxsldwi vs2, vs0, vs0, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs0
+; PWR9BE-NEXT:    xscvspdpn f2, vs2
+; PWR9BE-NEXT:    xsmaxdp f1, f1, f2
+; PWR9BE-NEXT:    xxswapd vs2, vs0
+; PWR9BE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR9BE-NEXT:    xscvspdpn f2, vs2
+; PWR9BE-NEXT:    xscvspdpn f0, vs0
+; PWR9BE-NEXT:    xsmaxdp f1, f1, f2
+; PWR9BE-NEXT:    xsmaxdp f1, f1, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16f32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvmaxsp vs0, v3, v5
+; PWR10LE-NEXT:    xvmaxsp vs1, v2, v4
+; PWR10LE-NEXT:    xvmaxsp vs0, vs1, vs0
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xxsldwi vs2, vs0, vs0, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xscvspdpn f2, vs2
+; PWR10LE-NEXT:    xsmaxdp f1, f2, f1
+; PWR10LE-NEXT:    xxsldwi vs2, vs0, vs0, 1
+; PWR10LE-NEXT:    xscvspdpn f0, vs0
+; PWR10LE-NEXT:    xscvspdpn f2, vs2
+; PWR10LE-NEXT:    xsmaxdp f1, f1, f2
+; PWR10LE-NEXT:    xsmaxdp f1, f1, f0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16f32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvmaxsp vs0, v3, v5
+; PWR10BE-NEXT:    xvmaxsp vs1, v2, v4
+; PWR10BE-NEXT:    xvmaxsp vs0, vs1, vs0
+; PWR10BE-NEXT:    xxsldwi vs2, vs0, vs0, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs0
+; PWR10BE-NEXT:    xscvspdpn f2, vs2
+; PWR10BE-NEXT:    xsmaxdp f1, f1, f2
+; PWR10BE-NEXT:    xxswapd vs2, vs0
+; PWR10BE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR10BE-NEXT:    xscvspdpn f2, vs2
+; PWR10BE-NEXT:    xscvspdpn f0, vs0
+; PWR10BE-NEXT:    xsmaxdp f1, f1, f2
+; PWR10BE-NEXT:    xsmaxdp f1, f1, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a)
+  ret float %0
+}
+
+define dso_local float @v16f32_fast(<16 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16f32_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvmaxsp vs0, v3, v5
+; PWR9LE-NEXT:    xvmaxsp vs1, v2, v4
+; PWR9LE-NEXT:    xvmaxsp vs0, vs1, vs0
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xvmaxsp vs0, vs0, v2
+; PWR9LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR9LE-NEXT:    xvmaxsp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16f32_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvmaxsp vs0, v3, v5
+; PWR9BE-NEXT:    xvmaxsp vs1, v2, v4
+; PWR9BE-NEXT:    xvmaxsp vs0, vs1, vs0
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xvmaxsp vs0, vs0, v2
+; PWR9BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR9BE-NEXT:    xvmaxsp vs0, vs0, vs1
+; PWR9BE-NEXT:    xscvspdpn f1, vs0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16f32_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvmaxsp vs0, v3, v5
+; PWR10LE-NEXT:    xvmaxsp vs1, v2, v4
+; PWR10LE-NEXT:    xvmaxsp vs0, vs1, vs0
+; PWR10LE-NEXT:    xxswapd v2, vs0
+; PWR10LE-NEXT:    xvmaxsp vs0, vs0, v2
+; PWR10LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR10LE-NEXT:    xvmaxsp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16f32_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvmaxsp vs0, v3, v5
+; PWR10BE-NEXT:    xvmaxsp vs1, v2, v4
+; PWR10BE-NEXT:    xvmaxsp vs0, vs1, vs0
+; PWR10BE-NEXT:    xxswapd v2, vs0
+; PWR10BE-NEXT:    xvmaxsp vs0, vs0, v2
+; PWR10BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR10BE-NEXT:    xvmaxsp vs0, vs0, vs1
+; PWR10BE-NEXT:    xscvspdpn f1, vs0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a)
+  ret float %0
+}
+
+define dso_local float @v32f32(<32 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v32f32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvmaxsp vs0, v5, v9
+; PWR9LE-NEXT:    xvmaxsp vs1, v3, v7
+; PWR9LE-NEXT:    xvmaxsp vs2, v2, v6
+; PWR9LE-NEXT:    xvmaxsp vs0, vs1, vs0
+; PWR9LE-NEXT:    xvmaxsp vs1, v4, v8
+; PWR9LE-NEXT:    xvmaxsp vs1, vs2, vs1
+; PWR9LE-NEXT:    xvmaxsp vs0, vs1, vs0
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xxsldwi vs2, vs0, vs0, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xscvspdpn f2, vs2
+; PWR9LE-NEXT:    xsmaxdp f1, f2, f1
+; PWR9LE-NEXT:    xxsldwi vs2, vs0, vs0, 1
+; PWR9LE-NEXT:    xscvspdpn f0, vs0
+; PWR9LE-NEXT:    xscvspdpn f2, vs2
+; PWR9LE-NEXT:    xsmaxdp f1, f1, f2
+; PWR9LE-NEXT:    xsmaxdp f1, f1, f0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v32f32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvmaxsp vs0, v5, v9
+; PWR9BE-NEXT:    xvmaxsp vs1, v3, v7
+; PWR9BE-NEXT:    xvmaxsp vs2, v2, v6
+; PWR9BE-NEXT:    xvmaxsp vs0, vs1, vs0
+; PWR9BE-NEXT:    xvmaxsp vs1, v4, v8
+; PWR9BE-NEXT:    xvmaxsp vs1, vs2, vs1
+; PWR9BE-NEXT:    xvmaxsp vs0, vs1, vs0
+; PWR9BE-NEXT:    xxsldwi vs2, vs0, vs0, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs0
+; PWR9BE-NEXT:    xscvspdpn f2, vs2
+; PWR9BE-NEXT:    xsmaxdp f1, f1, f2
+; PWR9BE-NEXT:    xxswapd vs2, vs0
+; PWR9BE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR9BE-NEXT:    xscvspdpn f2, vs2
+; PWR9BE-NEXT:    xscvspdpn f0, vs0
+; PWR9BE-NEXT:    xsmaxdp f1, f1, f2
+; PWR9BE-NEXT:    xsmaxdp f1, f1, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v32f32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvmaxsp vs0, v5, v9
+; PWR10LE-NEXT:    xvmaxsp vs1, v3, v7
+; PWR10LE-NEXT:    xvmaxsp vs2, v2, v6
+; PWR10LE-NEXT:    xvmaxsp vs0, vs1, vs0
+; PWR10LE-NEXT:    xvmaxsp vs1, v4, v8
+; PWR10LE-NEXT:    xvmaxsp vs1, vs2, vs1
+; PWR10LE-NEXT:    xvmaxsp vs0, vs1, vs0
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xxsldwi vs2, vs0, vs0, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xscvspdpn f2, vs2
+; PWR10LE-NEXT:    xsmaxdp f1, f2, f1
+; PWR10LE-NEXT:    xxsldwi vs2, vs0, vs0, 1
+; PWR10LE-NEXT:    xscvspdpn f0, vs0
+; PWR10LE-NEXT:    xscvspdpn f2, vs2
+; PWR10LE-NEXT:    xsmaxdp f1, f1, f2
+; PWR10LE-NEXT:    xsmaxdp f1, f1, f0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v32f32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvmaxsp vs0, v5, v9
+; PWR10BE-NEXT:    xvmaxsp vs1, v3, v7
+; PWR10BE-NEXT:    xvmaxsp vs2, v2, v6
+; PWR10BE-NEXT:    xvmaxsp vs0, vs1, vs0
+; PWR10BE-NEXT:    xvmaxsp vs1, v4, v8
+; PWR10BE-NEXT:    xvmaxsp vs1, vs2, vs1
+; PWR10BE-NEXT:    xvmaxsp vs0, vs1, vs0
+; PWR10BE-NEXT:    xxsldwi vs2, vs0, vs0, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs0
+; PWR10BE-NEXT:    xscvspdpn f2, vs2
+; PWR10BE-NEXT:    xsmaxdp f1, f1, f2
+; PWR10BE-NEXT:    xxswapd vs2, vs0
+; PWR10BE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR10BE-NEXT:    xscvspdpn f2, vs2
+; PWR10BE-NEXT:    xscvspdpn f0, vs0
+; PWR10BE-NEXT:    xsmaxdp f1, f1, f2
+; PWR10BE-NEXT:    xsmaxdp f1, f1, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> %a)
+  ret float %0
+}
+
+define dso_local float @v32f32_fast(<32 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v32f32_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvmaxsp vs0, v4, v8
+; PWR9LE-NEXT:    xvmaxsp vs1, v2, v6
+; PWR9LE-NEXT:    xvmaxsp vs2, v5, v9
+; PWR9LE-NEXT:    xvmaxsp vs3, v3, v7
+; PWR9LE-NEXT:    xvmaxsp vs2, vs3, vs2
+; PWR9LE-NEXT:    xvmaxsp vs0, vs1, vs0
+; PWR9LE-NEXT:    xvmaxsp vs0, vs0, vs2
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xvmaxsp vs0, vs0, v2
+; PWR9LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR9LE-NEXT:    xvmaxsp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v32f32_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvmaxsp vs0, v4, v8
+; PWR9BE-NEXT:    xvmaxsp vs1, v2, v6
+; PWR9BE-NEXT:    xvmaxsp vs2, v5, v9
+; PWR9BE-NEXT:    xvmaxsp vs3, v3, v7
+; PWR9BE-NEXT:    xvmaxsp vs2, vs3, vs2
+; PWR9BE-NEXT:    xvmaxsp vs0, vs1, vs0
+; PWR9BE-NEXT:    xvmaxsp vs0, vs0, vs2
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xvmaxsp vs0, vs0, v2
+; PWR9BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR9BE-NEXT:    xvmaxsp vs0, vs0, vs1
+; PWR9BE-NEXT:    xscvspdpn f1, vs0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v32f32_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvmaxsp vs0, v4, v8
+; PWR10LE-NEXT:    xvmaxsp vs1, v2, v6
+; PWR10LE-NEXT:    xvmaxsp vs2, v5, v9
+; PWR10LE-NEXT:    xvmaxsp vs3, v3, v7
+; PWR10LE-NEXT:    xvmaxsp vs2, vs3, vs2
+; PWR10LE-NEXT:    xvmaxsp vs0, vs1, vs0
+; PWR10LE-NEXT:    xvmaxsp vs0, vs0, vs2
+; PWR10LE-NEXT:    xxswapd v2, vs0
+; PWR10LE-NEXT:    xvmaxsp vs0, vs0, v2
+; PWR10LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR10LE-NEXT:    xvmaxsp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v32f32_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvmaxsp vs0, v4, v8
+; PWR10BE-NEXT:    xvmaxsp vs1, v2, v6
+; PWR10BE-NEXT:    xvmaxsp vs2, v5, v9
+; PWR10BE-NEXT:    xvmaxsp vs3, v3, v7
+; PWR10BE-NEXT:    xvmaxsp vs2, vs3, vs2
+; PWR10BE-NEXT:    xvmaxsp vs0, vs1, vs0
+; PWR10BE-NEXT:    xvmaxsp vs0, vs0, vs2
+; PWR10BE-NEXT:    xxswapd v2, vs0
+; PWR10BE-NEXT:    xvmaxsp vs0, vs0, v2
+; PWR10BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR10BE-NEXT:    xvmaxsp vs0, vs0, vs1
+; PWR10BE-NEXT:    xscvspdpn f1, vs0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast float @llvm.vector.reduce.fmax.v32f32(<32 x float> %a)
+  ret float %0
+}
+
+declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>) #0
+declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) #0
+declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>) #0
+declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>) #0
+declare float @llvm.vector.reduce.fmax.v32f32(<32 x float>) #0
+
+;;
+;; Vectors of f64
+;;
+define dso_local double @v2f64(<2 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2f64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs0, v2
+; PWR9LE-NEXT:    xsmaxdp f1, f0, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2f64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd vs0, v2
+; PWR9BE-NEXT:    xsmaxdp f1, v2, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2f64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs0, v2
+; PWR10LE-NEXT:    xsmaxdp f1, f0, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2f64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd vs0, v2
+; PWR10BE-NEXT:    xsmaxdp f1, v2, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a)
+  ret double %0
+}
+
+define dso_local double @v2f64_fast(<2 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2f64_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs0, v2
+; PWR9LE-NEXT:    xvmaxdp vs0, v2, vs0
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2f64_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd vs0, v2
+; PWR9BE-NEXT:    xvmaxdp vs1, v2, vs0
+; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2f64_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs0, v2
+; PWR10LE-NEXT:    xvmaxdp vs0, v2, vs0
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2f64_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd vs0, v2
+; PWR10BE-NEXT:    xvmaxdp vs1, v2, vs0
+; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a)
+  ret double %0
+}
+
+define dso_local double @v4f64(<4 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4f64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvmaxdp vs0, v2, v3
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xsmaxdp f1, f1, f0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4f64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvmaxdp vs0, v2, v3
+; PWR9BE-NEXT:    xxswapd vs1, vs0
+; PWR9BE-NEXT:    xsmaxdp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4f64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvmaxdp vs0, v2, v3
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xsmaxdp f1, f1, f0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4f64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvmaxdp vs0, v2, v3
+; PWR10BE-NEXT:    xxswapd vs1, vs0
+; PWR10BE-NEXT:    xsmaxdp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %a)
+  ret double %0
+}
+
+define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 { ; reassociable fmax reduction of <4 x double>
+; PWR9LE-LABEL: v4f64_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvmaxdp vs0, v2, v3
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xvmaxdp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4f64_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvmaxdp vs0, v2, v3
+; PWR9BE-NEXT:    xxswapd vs1, vs0
+; PWR9BE-NEXT:    xvmaxdp vs1, vs0, vs1
+; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4f64_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvmaxdp vs0, v2, v3
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xvmaxdp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4f64_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvmaxdp vs0, v2, v3
+; PWR10BE-NEXT:    xxswapd vs1, vs0
+; PWR10BE-NEXT:    xvmaxdp vs1, vs0, vs1
+; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> %a) ; 'fast' flags permit reassociation
+  ret double %0
+}
+
+define dso_local double @v8f64(<8 x double> %a) local_unnamed_addr #0 { ; strict fmax reduction of <8 x double>
+; PWR9LE-LABEL: v8f64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvmaxdp vs0, v3, v5
+; PWR9LE-NEXT:    xvmaxdp vs1, v2, v4
+; PWR9LE-NEXT:    xvmaxdp vs0, vs1, vs0
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xsmaxdp f1, f1, f0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8f64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvmaxdp vs0, v3, v5
+; PWR9BE-NEXT:    xvmaxdp vs1, v2, v4
+; PWR9BE-NEXT:    xvmaxdp vs0, vs1, vs0
+; PWR9BE-NEXT:    xxswapd vs1, vs0
+; PWR9BE-NEXT:    xsmaxdp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8f64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvmaxdp vs0, v3, v5
+; PWR10LE-NEXT:    xvmaxdp vs1, v2, v4
+; PWR10LE-NEXT:    xvmaxdp vs0, vs1, vs0
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xsmaxdp f1, f1, f0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8f64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvmaxdp vs0, v3, v5
+; PWR10BE-NEXT:    xvmaxdp vs1, v2, v4
+; PWR10BE-NEXT:    xvmaxdp vs0, vs1, vs0
+; PWR10BE-NEXT:    xxswapd vs1, vs0
+; PWR10BE-NEXT:    xsmaxdp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> %a) ; no fast-math flags: default (ordered) semantics
+  ret double %0
+}
+
+define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 { ; reassociable fmax reduction of <8 x double>
+; PWR9LE-LABEL: v8f64_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvmaxdp vs0, v3, v5
+; PWR9LE-NEXT:    xvmaxdp vs1, v2, v4
+; PWR9LE-NEXT:    xvmaxdp vs0, vs1, vs0
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xvmaxdp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8f64_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvmaxdp vs0, v3, v5
+; PWR9BE-NEXT:    xvmaxdp vs1, v2, v4
+; PWR9BE-NEXT:    xvmaxdp vs0, vs1, vs0
+; PWR9BE-NEXT:    xxswapd vs1, vs0
+; PWR9BE-NEXT:    xvmaxdp vs1, vs0, vs1
+; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8f64_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvmaxdp vs0, v3, v5
+; PWR10LE-NEXT:    xvmaxdp vs1, v2, v4
+; PWR10LE-NEXT:    xvmaxdp vs0, vs1, vs0
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xvmaxdp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8f64_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvmaxdp vs0, v3, v5
+; PWR10BE-NEXT:    xvmaxdp vs1, v2, v4
+; PWR10BE-NEXT:    xvmaxdp vs0, vs1, vs0
+; PWR10BE-NEXT:    xxswapd vs1, vs0
+; PWR10BE-NEXT:    xvmaxdp vs1, vs0, vs1
+; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast double @llvm.vector.reduce.fmax.v8f64(<8 x double> %a) ; 'fast' flags permit reassociation
+  ret double %0
+}
+
+define dso_local double @v16f64(<16 x double> %a) local_unnamed_addr #0 { ; strict fmax reduction of <16 x double>
+; PWR9LE-LABEL: v16f64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvmaxdp vs0, v5, v9
+; PWR9LE-NEXT:    xvmaxdp vs1, v3, v7
+; PWR9LE-NEXT:    xvmaxdp vs2, v2, v6
+; PWR9LE-NEXT:    xvmaxdp vs0, vs1, vs0
+; PWR9LE-NEXT:    xvmaxdp vs1, v4, v8
+; PWR9LE-NEXT:    xvmaxdp vs1, vs2, vs1
+; PWR9LE-NEXT:    xvmaxdp vs0, vs1, vs0
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xsmaxdp f1, f1, f0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16f64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvmaxdp vs0, v5, v9
+; PWR9BE-NEXT:    xvmaxdp vs1, v3, v7
+; PWR9BE-NEXT:    xvmaxdp vs2, v2, v6
+; PWR9BE-NEXT:    xvmaxdp vs0, vs1, vs0
+; PWR9BE-NEXT:    xvmaxdp vs1, v4, v8
+; PWR9BE-NEXT:    xvmaxdp vs1, vs2, vs1
+; PWR9BE-NEXT:    xvmaxdp vs0, vs1, vs0
+; PWR9BE-NEXT:    xxswapd vs1, vs0
+; PWR9BE-NEXT:    xsmaxdp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16f64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvmaxdp vs0, v5, v9
+; PWR10LE-NEXT:    xvmaxdp vs1, v3, v7
+; PWR10LE-NEXT:    xvmaxdp vs2, v2, v6
+; PWR10LE-NEXT:    xvmaxdp vs0, vs1, vs0
+; PWR10LE-NEXT:    xvmaxdp vs1, v4, v8
+; PWR10LE-NEXT:    xvmaxdp vs1, vs2, vs1
+; PWR10LE-NEXT:    xvmaxdp vs0, vs1, vs0
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xsmaxdp f1, f1, f0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16f64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvmaxdp vs0, v5, v9
+; PWR10BE-NEXT:    xvmaxdp vs1, v3, v7
+; PWR10BE-NEXT:    xvmaxdp vs2, v2, v6
+; PWR10BE-NEXT:    xvmaxdp vs0, vs1, vs0
+; PWR10BE-NEXT:    xvmaxdp vs1, v4, v8
+; PWR10BE-NEXT:    xvmaxdp vs1, vs2, vs1
+; PWR10BE-NEXT:    xvmaxdp vs0, vs1, vs0
+; PWR10BE-NEXT:    xxswapd vs1, vs0
+; PWR10BE-NEXT:    xsmaxdp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> %a) ; no fast-math flags: default (ordered) semantics
+  ret double %0
+}
+
+define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 { ; reassociable fmax reduction of <16 x double>
+; PWR9LE-LABEL: v16f64_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvmaxdp vs0, v4, v8
+; PWR9LE-NEXT:    xvmaxdp vs1, v2, v6
+; PWR9LE-NEXT:    xvmaxdp vs2, v5, v9
+; PWR9LE-NEXT:    xvmaxdp vs3, v3, v7
+; PWR9LE-NEXT:    xvmaxdp vs2, vs3, vs2
+; PWR9LE-NEXT:    xvmaxdp vs0, vs1, vs0
+; PWR9LE-NEXT:    xvmaxdp vs0, vs0, vs2
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xvmaxdp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16f64_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvmaxdp vs0, v4, v8
+; PWR9BE-NEXT:    xvmaxdp vs1, v2, v6
+; PWR9BE-NEXT:    xvmaxdp vs2, v5, v9
+; PWR9BE-NEXT:    xvmaxdp vs3, v3, v7
+; PWR9BE-NEXT:    xvmaxdp vs2, vs3, vs2
+; PWR9BE-NEXT:    xvmaxdp vs0, vs1, vs0
+; PWR9BE-NEXT:    xvmaxdp vs0, vs0, vs2
+; PWR9BE-NEXT:    xxswapd vs1, vs0
+; PWR9BE-NEXT:    xvmaxdp vs1, vs0, vs1
+; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16f64_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvmaxdp vs0, v4, v8
+; PWR10LE-NEXT:    xvmaxdp vs1, v2, v6
+; PWR10LE-NEXT:    xvmaxdp vs2, v5, v9
+; PWR10LE-NEXT:    xvmaxdp vs3, v3, v7
+; PWR10LE-NEXT:    xvmaxdp vs2, vs3, vs2
+; PWR10LE-NEXT:    xvmaxdp vs0, vs1, vs0
+; PWR10LE-NEXT:    xvmaxdp vs0, vs0, vs2
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xvmaxdp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16f64_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvmaxdp vs0, v4, v8
+; PWR10BE-NEXT:    xvmaxdp vs1, v2, v6
+; PWR10BE-NEXT:    xvmaxdp vs2, v5, v9
+; PWR10BE-NEXT:    xvmaxdp vs3, v3, v7
+; PWR10BE-NEXT:    xvmaxdp vs2, vs3, vs2
+; PWR10BE-NEXT:    xvmaxdp vs0, vs1, vs0
+; PWR10BE-NEXT:    xvmaxdp vs0, vs0, vs2
+; PWR10BE-NEXT:    xxswapd vs1, vs0
+; PWR10BE-NEXT:    xvmaxdp vs1, vs0, vs1
+; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> %a) ; 'fast' flags permit reassociation
+  ret double %0
+}
+
+define dso_local double @v32f64(<32 x double> %a) local_unnamed_addr #0 { ; strict fmax reduction of <32 x double>; trailing operands arrive on the stack
+; PWR9LE-LABEL: v32f64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    lxv vs3, 272(r1)
+; PWR9LE-NEXT:    lxv vs2, 240(r1)
+; PWR9LE-NEXT:    xvmaxdp vs4, v5, v13
+; PWR9LE-NEXT:    lxv vs1, 256(r1)
+; PWR9LE-NEXT:    lxv vs0, 224(r1)
+; PWR9LE-NEXT:    xvmaxdp vs3, v9, vs3
+; PWR9LE-NEXT:    xvmaxdp vs2, v7, vs2
+; PWR9LE-NEXT:    xvmaxdp vs1, v8, vs1
+; PWR9LE-NEXT:    xvmaxdp vs0, v6, vs0
+; PWR9LE-NEXT:    xvmaxdp vs3, vs4, vs3
+; PWR9LE-NEXT:    xvmaxdp vs4, v3, v11
+; PWR9LE-NEXT:    xvmaxdp vs2, vs4, vs2
+; PWR9LE-NEXT:    xvmaxdp vs2, vs2, vs3
+; PWR9LE-NEXT:    xvmaxdp vs3, v4, v12
+; PWR9LE-NEXT:    xvmaxdp vs1, vs3, vs1
+; PWR9LE-NEXT:    xvmaxdp vs3, v2, v10
+; PWR9LE-NEXT:    xvmaxdp vs0, vs3, vs0
+; PWR9LE-NEXT:    xvmaxdp vs0, vs0, vs1
+; PWR9LE-NEXT:    xvmaxdp vs0, vs0, vs2
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xsmaxdp f1, f1, f0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v32f64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    lxv vs3, 288(r1)
+; PWR9BE-NEXT:    lxv vs2, 256(r1)
+; PWR9BE-NEXT:    xvmaxdp vs4, v5, v13
+; PWR9BE-NEXT:    lxv vs1, 272(r1)
+; PWR9BE-NEXT:    lxv vs0, 240(r1)
+; PWR9BE-NEXT:    xvmaxdp vs3, v9, vs3
+; PWR9BE-NEXT:    xvmaxdp vs2, v7, vs2
+; PWR9BE-NEXT:    xvmaxdp vs1, v8, vs1
+; PWR9BE-NEXT:    xvmaxdp vs0, v6, vs0
+; PWR9BE-NEXT:    xvmaxdp vs3, vs4, vs3
+; PWR9BE-NEXT:    xvmaxdp vs4, v3, v11
+; PWR9BE-NEXT:    xvmaxdp vs2, vs4, vs2
+; PWR9BE-NEXT:    xvmaxdp vs2, vs2, vs3
+; PWR9BE-NEXT:    xvmaxdp vs3, v4, v12
+; PWR9BE-NEXT:    xvmaxdp vs1, vs3, vs1
+; PWR9BE-NEXT:    xvmaxdp vs3, v2, v10
+; PWR9BE-NEXT:    xvmaxdp vs0, vs3, vs0
+; PWR9BE-NEXT:    xvmaxdp vs0, vs0, vs1
+; PWR9BE-NEXT:    xvmaxdp vs0, vs0, vs2
+; PWR9BE-NEXT:    xxswapd vs1, vs0
+; PWR9BE-NEXT:    xsmaxdp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v32f64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    lxv vs3, 272(r1)
+; PWR10LE-NEXT:    lxv vs2, 240(r1)
+; PWR10LE-NEXT:    xvmaxdp vs4, v5, v13
+; PWR10LE-NEXT:    xvmaxdp vs3, v9, vs3
+; PWR10LE-NEXT:    lxv vs1, 256(r1)
+; PWR10LE-NEXT:    xvmaxdp vs2, v7, vs2
+; PWR10LE-NEXT:    lxv vs0, 224(r1)
+; PWR10LE-NEXT:    xvmaxdp vs1, v8, vs1
+; PWR10LE-NEXT:    xvmaxdp vs0, v6, vs0
+; PWR10LE-NEXT:    xvmaxdp vs3, vs4, vs3
+; PWR10LE-NEXT:    xvmaxdp vs4, v3, v11
+; PWR10LE-NEXT:    xvmaxdp vs2, vs4, vs2
+; PWR10LE-NEXT:    xvmaxdp vs2, vs2, vs3
+; PWR10LE-NEXT:    xvmaxdp vs3, v4, v12
+; PWR10LE-NEXT:    xvmaxdp vs1, vs3, vs1
+; PWR10LE-NEXT:    xvmaxdp vs3, v2, v10
+; PWR10LE-NEXT:    xvmaxdp vs0, vs3, vs0
+; PWR10LE-NEXT:    xvmaxdp vs0, vs0, vs1
+; PWR10LE-NEXT:    xvmaxdp vs0, vs0, vs2
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xsmaxdp f1, f1, f0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v32f64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    lxv vs3, 288(r1)
+; PWR10BE-NEXT:    lxv vs2, 256(r1)
+; PWR10BE-NEXT:    xvmaxdp vs4, v5, v13
+; PWR10BE-NEXT:    xvmaxdp vs3, v9, vs3
+; PWR10BE-NEXT:    lxv vs1, 272(r1)
+; PWR10BE-NEXT:    xvmaxdp vs2, v7, vs2
+; PWR10BE-NEXT:    lxv vs0, 240(r1)
+; PWR10BE-NEXT:    xvmaxdp vs1, v8, vs1
+; PWR10BE-NEXT:    xvmaxdp vs0, v6, vs0
+; PWR10BE-NEXT:    xvmaxdp vs3, vs4, vs3
+; PWR10BE-NEXT:    xvmaxdp vs4, v3, v11
+; PWR10BE-NEXT:    xvmaxdp vs2, vs4, vs2
+; PWR10BE-NEXT:    xvmaxdp vs2, vs2, vs3
+; PWR10BE-NEXT:    xvmaxdp vs3, v4, v12
+; PWR10BE-NEXT:    xvmaxdp vs1, vs3, vs1
+; PWR10BE-NEXT:    xvmaxdp vs3, v2, v10
+; PWR10BE-NEXT:    xvmaxdp vs0, vs3, vs0
+; PWR10BE-NEXT:    xvmaxdp vs0, vs0, vs1
+; PWR10BE-NEXT:    xvmaxdp vs0, vs0, vs2
+; PWR10BE-NEXT:    xxswapd vs1, vs0
+; PWR10BE-NEXT:    xsmaxdp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fmax.v32f64(<32 x double> %a) ; no fast-math flags: default (ordered) semantics
+  ret double %0
+}
+
+define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 { ; reassociable fmax reduction of <32 x double>; trailing operands arrive on the stack
+; PWR9LE-LABEL: v32f64_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    lxv vs0, 256(r1)
+; PWR9LE-NEXT:    lxv vs1, 224(r1)
+; PWR9LE-NEXT:    lxv vs2, 272(r1)
+; PWR9LE-NEXT:    lxv vs3, 240(r1)
+; PWR9LE-NEXT:    xvmaxdp vs4, v3, v11
+; PWR9LE-NEXT:    xvmaxdp vs5, v5, v13
+; PWR9LE-NEXT:    xvmaxdp vs6, v2, v10
+; PWR9LE-NEXT:    xvmaxdp vs7, v4, v12
+; PWR9LE-NEXT:    xvmaxdp vs3, v7, vs3
+; PWR9LE-NEXT:    xvmaxdp vs2, v9, vs2
+; PWR9LE-NEXT:    xvmaxdp vs1, v6, vs1
+; PWR9LE-NEXT:    xvmaxdp vs0, v8, vs0
+; PWR9LE-NEXT:    xvmaxdp vs0, vs7, vs0
+; PWR9LE-NEXT:    xvmaxdp vs1, vs6, vs1
+; PWR9LE-NEXT:    xvmaxdp vs2, vs5, vs2
+; PWR9LE-NEXT:    xvmaxdp vs3, vs4, vs3
+; PWR9LE-NEXT:    xvmaxdp vs2, vs3, vs2
+; PWR9LE-NEXT:    xvmaxdp vs0, vs1, vs0
+; PWR9LE-NEXT:    xvmaxdp vs0, vs0, vs2
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xvmaxdp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v32f64_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    lxv vs0, 272(r1)
+; PWR9BE-NEXT:    lxv vs1, 240(r1)
+; PWR9BE-NEXT:    lxv vs2, 288(r1)
+; PWR9BE-NEXT:    lxv vs3, 256(r1)
+; PWR9BE-NEXT:    xvmaxdp vs4, v3, v11
+; PWR9BE-NEXT:    xvmaxdp vs5, v5, v13
+; PWR9BE-NEXT:    xvmaxdp vs6, v2, v10
+; PWR9BE-NEXT:    xvmaxdp vs7, v4, v12
+; PWR9BE-NEXT:    xvmaxdp vs3, v7, vs3
+; PWR9BE-NEXT:    xvmaxdp vs2, v9, vs2
+; PWR9BE-NEXT:    xvmaxdp vs1, v6, vs1
+; PWR9BE-NEXT:    xvmaxdp vs0, v8, vs0
+; PWR9BE-NEXT:    xvmaxdp vs0, vs7, vs0
+; PWR9BE-NEXT:    xvmaxdp vs1, vs6, vs1
+; PWR9BE-NEXT:    xvmaxdp vs2, vs5, vs2
+; PWR9BE-NEXT:    xvmaxdp vs3, vs4, vs3
+; PWR9BE-NEXT:    xvmaxdp vs2, vs3, vs2
+; PWR9BE-NEXT:    xvmaxdp vs0, vs1, vs0
+; PWR9BE-NEXT:    xvmaxdp vs0, vs0, vs2
+; PWR9BE-NEXT:    xxswapd vs1, vs0
+; PWR9BE-NEXT:    xvmaxdp vs1, vs0, vs1
+; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v32f64_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    lxv vs0, 256(r1)
+; PWR10LE-NEXT:    lxv vs1, 224(r1)
+; PWR10LE-NEXT:    xvmaxdp vs4, v3, v11
+; PWR10LE-NEXT:    xvmaxdp vs5, v5, v13
+; PWR10LE-NEXT:    xvmaxdp vs6, v2, v10
+; PWR10LE-NEXT:    xvmaxdp vs7, v4, v12
+; PWR10LE-NEXT:    xvmaxdp vs1, v6, vs1
+; PWR10LE-NEXT:    lxv vs2, 272(r1)
+; PWR10LE-NEXT:    lxv vs3, 240(r1)
+; PWR10LE-NEXT:    xvmaxdp vs3, v7, vs3
+; PWR10LE-NEXT:    xvmaxdp vs2, v9, vs2
+; PWR10LE-NEXT:    xvmaxdp vs0, v8, vs0
+; PWR10LE-NEXT:    xvmaxdp vs0, vs7, vs0
+; PWR10LE-NEXT:    xvmaxdp vs1, vs6, vs1
+; PWR10LE-NEXT:    xvmaxdp vs2, vs5, vs2
+; PWR10LE-NEXT:    xvmaxdp vs3, vs4, vs3
+; PWR10LE-NEXT:    xvmaxdp vs2, vs3, vs2
+; PWR10LE-NEXT:    xvmaxdp vs0, vs1, vs0
+; PWR10LE-NEXT:    xvmaxdp vs0, vs0, vs2
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xvmaxdp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v32f64_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    lxv vs0, 272(r1)
+; PWR10BE-NEXT:    lxv vs1, 240(r1)
+; PWR10BE-NEXT:    xvmaxdp vs4, v3, v11
+; PWR10BE-NEXT:    xvmaxdp vs5, v5, v13
+; PWR10BE-NEXT:    xvmaxdp vs6, v2, v10
+; PWR10BE-NEXT:    xvmaxdp vs7, v4, v12
+; PWR10BE-NEXT:    xvmaxdp vs1, v6, vs1
+; PWR10BE-NEXT:    lxv vs2, 288(r1)
+; PWR10BE-NEXT:    lxv vs3, 256(r1)
+; PWR10BE-NEXT:    xvmaxdp vs3, v7, vs3
+; PWR10BE-NEXT:    xvmaxdp vs2, v9, vs2
+; PWR10BE-NEXT:    xvmaxdp vs0, v8, vs0
+; PWR10BE-NEXT:    xvmaxdp vs0, vs7, vs0
+; PWR10BE-NEXT:    xvmaxdp vs1, vs6, vs1
+; PWR10BE-NEXT:    xvmaxdp vs2, vs5, vs2
+; PWR10BE-NEXT:    xvmaxdp vs3, vs4, vs3
+; PWR10BE-NEXT:    xvmaxdp vs2, vs3, vs2
+; PWR10BE-NEXT:    xvmaxdp vs0, vs1, vs0
+; PWR10BE-NEXT:    xvmaxdp vs0, vs0, vs2
+; PWR10BE-NEXT:    xxswapd vs1, vs0
+; PWR10BE-NEXT:    xvmaxdp vs1, vs0, vs1
+; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast double @llvm.vector.reduce.fmax.v32f64(<32 x double> %a) ; 'fast' flags permit reassociation
+  ret double %0
+}
+
+declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>) #0
+declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>) #0
+declare double @llvm.vector.reduce.fmax.v8f64(<8 x double>) #0
+declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>) #0
+declare double @llvm.vector.reduce.fmax.v32f64(<32 x double>) #0
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-fmin.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-fmin.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-fmin.ll
@@ -0,0 +1,1169 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr10 -mattr=-paired-vector-memops -mtriple=powerpc64le < %s | \
+; RUN:   FileCheck %s --check-prefix=PWR10LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr10 -mattr=-paired-vector-memops -mtriple=powerpc64 < %s | \
+; RUN:   FileCheck %s --check-prefix=PWR10BE
+
+;;
+;; Vectors of f32
+;;
+define dso_local float @v2f32(<2 x float> %a) local_unnamed_addr #0 { ; strict fmin reduction of <2 x float>
+; PWR9LE-LABEL: v2f32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs0, v2
+; PWR9LE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR9LE-NEXT:    xscvspdpn f0, vs0
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmindp f1, f1, f0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2f32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9BE-NEXT:    xscvspdpn f0, v2
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmindp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2f32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs0, v2
+; PWR10LE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR10LE-NEXT:    xscvspdpn f0, vs0
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmindp f1, f1, f0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2f32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10BE-NEXT:    xscvspdpn f0, v2
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmindp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a) ; no fast-math flags: default (ordered) semantics
+  ret float %0
+}
+
+define dso_local float @v2f32_fast(<2 x float> %a) local_unnamed_addr #0 { ; reassociable fmin reduction of <2 x float>
+; PWR9LE-LABEL: v2f32_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxspltw vs0, v2, 2
+; PWR9LE-NEXT:    xvminsp vs0, v2, vs0
+; PWR9LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2f32_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxspltw vs0, v2, 1
+; PWR9BE-NEXT:    xvminsp vs0, v2, vs0
+; PWR9BE-NEXT:    xscvspdpn f1, vs0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2f32_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxspltw vs0, v2, 2
+; PWR10LE-NEXT:    xvminsp vs0, v2, vs0
+; PWR10LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2f32_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxspltw vs0, v2, 1
+; PWR10BE-NEXT:    xvminsp vs0, v2, vs0
+; PWR10BE-NEXT:    xscvspdpn f1, vs0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a) ; 'fast' flags permit reassociation
+  ret float %0
+}
+
+define dso_local float @v4f32(<4 x float> %a) local_unnamed_addr #0 { ; strict fmin reduction of <4 x float>
+; PWR9LE-LABEL: v4f32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxsldwi vs2, v2, v2, 3
+; PWR9LE-NEXT:    xxswapd vs3, v2
+; PWR9LE-NEXT:    xscvspdpn f0, v2
+; PWR9LE-NEXT:    xscvspdpn f2, vs2
+; PWR9LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9LE-NEXT:    xscvspdpn f3, vs3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmindp f2, f2, f3
+; PWR9LE-NEXT:    xsmindp f1, f2, f1
+; PWR9LE-NEXT:    xsmindp f1, f1, f0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4f32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxsldwi vs2, v2, v2, 1
+; PWR9BE-NEXT:    xxswapd vs1, v2
+; PWR9BE-NEXT:    xscvspdpn f3, v2
+; PWR9BE-NEXT:    xscvspdpn f2, vs2
+; PWR9BE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xscvspdpn f0, vs0
+; PWR9BE-NEXT:    xsmindp f2, f3, f2
+; PWR9BE-NEXT:    xsmindp f1, f2, f1
+; PWR9BE-NEXT:    xsmindp f1, f1, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4f32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxsldwi vs2, v2, v2, 3
+; PWR10LE-NEXT:    xxswapd vs3, v2
+; PWR10LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10LE-NEXT:    xscvspdpn f0, v2
+; PWR10LE-NEXT:    xscvspdpn f2, vs2
+; PWR10LE-NEXT:    xscvspdpn f3, vs3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmindp f2, f2, f3
+; PWR10LE-NEXT:    xsmindp f1, f2, f1
+; PWR10LE-NEXT:    xsmindp f1, f1, f0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4f32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxsldwi vs2, v2, v2, 1
+; PWR10BE-NEXT:    xxswapd vs1, v2
+; PWR10BE-NEXT:    xscvspdpn f3, v2
+; PWR10BE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR10BE-NEXT:    xscvspdpn f2, vs2
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xscvspdpn f0, vs0
+; PWR10BE-NEXT:    xsmindp f2, f3, f2
+; PWR10BE-NEXT:    xsmindp f1, f2, f1
+; PWR10BE-NEXT:    xsmindp f1, f1, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a) ; no fast-math flags: default (ordered) semantics
+  ret float %0
+}
+
+define dso_local float @v4f32_fast(<4 x float> %a) local_unnamed_addr #0 { ; reassociable fmin reduction of <4 x float>
+; PWR9LE-LABEL: v4f32_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    xvminsp vs0, v2, v3
+; PWR9LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR9LE-NEXT:    xvminsp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4f32_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    xvminsp vs0, v2, v3
+; PWR9BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR9BE-NEXT:    xvminsp vs0, vs0, vs1
+; PWR9BE-NEXT:    xscvspdpn f1, vs0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4f32_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    xvminsp vs0, v2, v3
+; PWR10LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR10LE-NEXT:    xvminsp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4f32_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    xvminsp vs0, v2, v3
+; PWR10BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR10BE-NEXT:    xvminsp vs0, vs0, vs1
+; PWR10BE-NEXT:    xscvspdpn f1, vs0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a) ; 'fast' flags permit reassociation
+  ret float %0
+}
+
+define dso_local float @v8f32(<8 x float> %a) local_unnamed_addr #0 { ; strict fmin reduction of <8 x float>
+; PWR9LE-LABEL: v8f32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvminsp vs0, v2, v3
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xxsldwi vs2, vs0, vs0, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xscvspdpn f2, vs2
+; PWR9LE-NEXT:    xsmindp f1, f2, f1
+; PWR9LE-NEXT:    xxsldwi vs2, vs0, vs0, 1
+; PWR9LE-NEXT:    xscvspdpn f0, vs0
+; PWR9LE-NEXT:    xscvspdpn f2, vs2
+; PWR9LE-NEXT:    xsmindp f1, f1, f2
+; PWR9LE-NEXT:    xsmindp f1, f1, f0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8f32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvminsp vs0, v2, v3
+; PWR9BE-NEXT:    xxsldwi vs2, vs0, vs0, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs0
+; PWR9BE-NEXT:    xscvspdpn f2, vs2
+; PWR9BE-NEXT:    xsmindp f1, f1, f2
+; PWR9BE-NEXT:    xxswapd vs2, vs0
+; PWR9BE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR9BE-NEXT:    xscvspdpn f2, vs2
+; PWR9BE-NEXT:    xscvspdpn f0, vs0
+; PWR9BE-NEXT:    xsmindp f1, f1, f2
+; PWR9BE-NEXT:    xsmindp f1, f1, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8f32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvminsp vs0, v2, v3
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xxsldwi vs2, vs0, vs0, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xscvspdpn f2, vs2
+; PWR10LE-NEXT:    xsmindp f1, f2, f1
+; PWR10LE-NEXT:    xxsldwi vs2, vs0, vs0, 1
+; PWR10LE-NEXT:    xscvspdpn f0, vs0
+; PWR10LE-NEXT:    xscvspdpn f2, vs2
+; PWR10LE-NEXT:    xsmindp f1, f1, f2
+; PWR10LE-NEXT:    xsmindp f1, f1, f0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8f32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvminsp vs0, v2, v3
+; PWR10BE-NEXT:    xxsldwi vs2, vs0, vs0, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs0
+; PWR10BE-NEXT:    xscvspdpn f2, vs2
+; PWR10BE-NEXT:    xsmindp f1, f1, f2
+; PWR10BE-NEXT:    xxswapd vs2, vs0
+; PWR10BE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR10BE-NEXT:    xscvspdpn f2, vs2
+; PWR10BE-NEXT:    xscvspdpn f0, vs0
+; PWR10BE-NEXT:    xsmindp f1, f1, f2
+; PWR10BE-NEXT:    xsmindp f1, f1, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a) ; no fast-math flags: default (ordered) semantics
+  ret float %0
+}
+
+define dso_local float @v8f32_fast(<8 x float> %a) local_unnamed_addr #0 { ; reassociable fmin reduction of <8 x float>
+; PWR9LE-LABEL: v8f32_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvminsp vs0, v2, v3
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xvminsp vs0, vs0, v2
+; PWR9LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR9LE-NEXT:    xvminsp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8f32_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvminsp vs0, v2, v3
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xvminsp vs0, vs0, v2
+; PWR9BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR9BE-NEXT:    xvminsp vs0, vs0, vs1
+; PWR9BE-NEXT:    xscvspdpn f1, vs0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8f32_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvminsp vs0, v2, v3
+; PWR10LE-NEXT:    xxswapd v2, vs0
+; PWR10LE-NEXT:    xvminsp vs0, vs0, v2
+; PWR10LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR10LE-NEXT:    xvminsp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8f32_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvminsp vs0, v2, v3
+; PWR10BE-NEXT:    xxswapd v2, vs0
+; PWR10BE-NEXT:    xvminsp vs0, vs0, v2
+; PWR10BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR10BE-NEXT:    xvminsp vs0, vs0, vs1
+; PWR10BE-NEXT:    xscvspdpn f1, vs0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a) ; 'fast' flags permit reassociation
+  ret float %0
+}
+
+define dso_local float @v16f32(<16 x float> %a) local_unnamed_addr #0 { ; strict fmin reduction of <16 x float>
+; PWR9LE-LABEL: v16f32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvminsp vs0, v3, v5
+; PWR9LE-NEXT:    xvminsp vs1, v2, v4
+; PWR9LE-NEXT:    xvminsp vs0, vs1, vs0
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xxsldwi vs2, vs0, vs0, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xscvspdpn f2, vs2
+; PWR9LE-NEXT:    xsmindp f1, f2, f1
+; PWR9LE-NEXT:    xxsldwi vs2, vs0, vs0, 1
+; PWR9LE-NEXT:    xscvspdpn f0, vs0
+; PWR9LE-NEXT:    xscvspdpn f2, vs2
+; PWR9LE-NEXT:    xsmindp f1, f1, f2
+; PWR9LE-NEXT:    xsmindp f1, f1, f0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16f32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvminsp vs0, v3, v5
+; PWR9BE-NEXT:    xvminsp vs1, v2, v4
+; PWR9BE-NEXT:    xvminsp vs0, vs1, vs0
+; PWR9BE-NEXT:    xxsldwi vs2, vs0, vs0, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs0
+; PWR9BE-NEXT:    xscvspdpn f2, vs2
+; PWR9BE-NEXT:    xsmindp f1, f1, f2
+; PWR9BE-NEXT:    xxswapd vs2, vs0
+; PWR9BE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR9BE-NEXT:    xscvspdpn f2, vs2
+; PWR9BE-NEXT:    xscvspdpn f0, vs0
+; PWR9BE-NEXT:    xsmindp f1, f1, f2
+; PWR9BE-NEXT:    xsmindp f1, f1, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16f32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvminsp vs0, v3, v5
+; PWR10LE-NEXT:    xvminsp vs1, v2, v4
+; PWR10LE-NEXT:    xvminsp vs0, vs1, vs0
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xxsldwi vs2, vs0, vs0, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xscvspdpn f2, vs2
+; PWR10LE-NEXT:    xsmindp f1, f2, f1
+; PWR10LE-NEXT:    xxsldwi vs2, vs0, vs0, 1
+; PWR10LE-NEXT:    xscvspdpn f0, vs0
+; PWR10LE-NEXT:    xscvspdpn f2, vs2
+; PWR10LE-NEXT:    xsmindp f1, f1, f2
+; PWR10LE-NEXT:    xsmindp f1, f1, f0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16f32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvminsp vs0, v3, v5
+; PWR10BE-NEXT:    xvminsp vs1, v2, v4
+; PWR10BE-NEXT:    xvminsp vs0, vs1, vs0
+; PWR10BE-NEXT:    xxsldwi vs2, vs0, vs0, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs0
+; PWR10BE-NEXT:    xscvspdpn f2, vs2
+; PWR10BE-NEXT:    xsmindp f1, f1, f2
+; PWR10BE-NEXT:    xxswapd vs2, vs0
+; PWR10BE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR10BE-NEXT:    xscvspdpn f2, vs2
+; PWR10BE-NEXT:    xscvspdpn f0, vs0
+; PWR10BE-NEXT:    xsmindp f1, f1, f2
+; PWR10BE-NEXT:    xsmindp f1, f1, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> %a) ; no fast-math flags: default (ordered) semantics
+  ret float %0
+}
+
+define dso_local float @v16f32_fast(<16 x float> %a) local_unnamed_addr #0 { ; reassociable fmin reduction of <16 x float>
+; PWR9LE-LABEL: v16f32_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvminsp vs0, v3, v5
+; PWR9LE-NEXT:    xvminsp vs1, v2, v4
+; PWR9LE-NEXT:    xvminsp vs0, vs1, vs0
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xvminsp vs0, vs0, v2
+; PWR9LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR9LE-NEXT:    xvminsp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16f32_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvminsp vs0, v3, v5
+; PWR9BE-NEXT:    xvminsp vs1, v2, v4
+; PWR9BE-NEXT:    xvminsp vs0, vs1, vs0
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xvminsp vs0, vs0, v2
+; PWR9BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR9BE-NEXT:    xvminsp vs0, vs0, vs1
+; PWR9BE-NEXT:    xscvspdpn f1, vs0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16f32_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvminsp vs0, v3, v5
+; PWR10LE-NEXT:    xvminsp vs1, v2, v4
+; PWR10LE-NEXT:    xvminsp vs0, vs1, vs0
+; PWR10LE-NEXT:    xxswapd v2, vs0
+; PWR10LE-NEXT:    xvminsp vs0, vs0, v2
+; PWR10LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR10LE-NEXT:    xvminsp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16f32_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvminsp vs0, v3, v5
+; PWR10BE-NEXT:    xvminsp vs1, v2, v4
+; PWR10BE-NEXT:    xvminsp vs0, vs1, vs0
+; PWR10BE-NEXT:    xxswapd v2, vs0
+; PWR10BE-NEXT:    xvminsp vs0, vs0, v2
+; PWR10BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR10BE-NEXT:    xvminsp vs0, vs0, vs1
+; PWR10BE-NEXT:    xscvspdpn f1, vs0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast float @llvm.vector.reduce.fmin.v16f32(<16 x float> %a) ; 'fast' flags permit reassociation
+  ret float %0
+}
+
+define dso_local float @v32f32(<32 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v32f32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvminsp vs0, v5, v9
+; PWR9LE-NEXT:    xvminsp vs1, v3, v7
+; PWR9LE-NEXT:    xvminsp vs2, v2, v6
+; PWR9LE-NEXT:    xvminsp vs0, vs1, vs0
+; PWR9LE-NEXT:    xvminsp vs1, v4, v8
+; PWR9LE-NEXT:    xvminsp vs1, vs2, vs1
+; PWR9LE-NEXT:    xvminsp vs0, vs1, vs0
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xxsldwi vs2, vs0, vs0, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xscvspdpn f2, vs2
+; PWR9LE-NEXT:    xsmindp f1, f2, f1
+; PWR9LE-NEXT:    xxsldwi vs2, vs0, vs0, 1
+; PWR9LE-NEXT:    xscvspdpn f0, vs0
+; PWR9LE-NEXT:    xscvspdpn f2, vs2
+; PWR9LE-NEXT:    xsmindp f1, f1, f2
+; PWR9LE-NEXT:    xsmindp f1, f1, f0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v32f32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvminsp vs0, v5, v9
+; PWR9BE-NEXT:    xvminsp vs1, v3, v7
+; PWR9BE-NEXT:    xvminsp vs2, v2, v6
+; PWR9BE-NEXT:    xvminsp vs0, vs1, vs0
+; PWR9BE-NEXT:    xvminsp vs1, v4, v8
+; PWR9BE-NEXT:    xvminsp vs1, vs2, vs1
+; PWR9BE-NEXT:    xvminsp vs0, vs1, vs0
+; PWR9BE-NEXT:    xxsldwi vs2, vs0, vs0, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs0
+; PWR9BE-NEXT:    xscvspdpn f2, vs2
+; PWR9BE-NEXT:    xsmindp f1, f1, f2
+; PWR9BE-NEXT:    xxswapd vs2, vs0
+; PWR9BE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR9BE-NEXT:    xscvspdpn f2, vs2
+; PWR9BE-NEXT:    xscvspdpn f0, vs0
+; PWR9BE-NEXT:    xsmindp f1, f1, f2
+; PWR9BE-NEXT:    xsmindp f1, f1, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v32f32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvminsp vs0, v5, v9
+; PWR10LE-NEXT:    xvminsp vs1, v3, v7
+; PWR10LE-NEXT:    xvminsp vs2, v2, v6
+; PWR10LE-NEXT:    xvminsp vs0, vs1, vs0
+; PWR10LE-NEXT:    xvminsp vs1, v4, v8
+; PWR10LE-NEXT:    xvminsp vs1, vs2, vs1
+; PWR10LE-NEXT:    xvminsp vs0, vs1, vs0
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xxsldwi vs2, vs0, vs0, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xscvspdpn f2, vs2
+; PWR10LE-NEXT:    xsmindp f1, f2, f1
+; PWR10LE-NEXT:    xxsldwi vs2, vs0, vs0, 1
+; PWR10LE-NEXT:    xscvspdpn f0, vs0
+; PWR10LE-NEXT:    xscvspdpn f2, vs2
+; PWR10LE-NEXT:    xsmindp f1, f1, f2
+; PWR10LE-NEXT:    xsmindp f1, f1, f0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v32f32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvminsp vs0, v5, v9
+; PWR10BE-NEXT:    xvminsp vs1, v3, v7
+; PWR10BE-NEXT:    xvminsp vs2, v2, v6
+; PWR10BE-NEXT:    xvminsp vs0, vs1, vs0
+; PWR10BE-NEXT:    xvminsp vs1, v4, v8
+; PWR10BE-NEXT:    xvminsp vs1, vs2, vs1
+; PWR10BE-NEXT:    xvminsp vs0, vs1, vs0
+; PWR10BE-NEXT:    xxsldwi vs2, vs0, vs0, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs0
+; PWR10BE-NEXT:    xscvspdpn f2, vs2
+; PWR10BE-NEXT:    xsmindp f1, f1, f2
+; PWR10BE-NEXT:    xxswapd vs2, vs0
+; PWR10BE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR10BE-NEXT:    xscvspdpn f2, vs2
+; PWR10BE-NEXT:    xscvspdpn f0, vs0
+; PWR10BE-NEXT:    xsmindp f1, f1, f2
+; PWR10BE-NEXT:    xsmindp f1, f1, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> %a)  ; ordered (non-fast) fmin reduction: per CHECKs, vector xvminsp tree then element-wise scalar xsmindp chain
+  ret float %0
+}
+
+define dso_local float @v32f32_fast(<32 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v32f32_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvminsp vs0, v4, v8
+; PWR9LE-NEXT:    xvminsp vs1, v2, v6
+; PWR9LE-NEXT:    xvminsp vs2, v5, v9
+; PWR9LE-NEXT:    xvminsp vs3, v3, v7
+; PWR9LE-NEXT:    xvminsp vs2, vs3, vs2
+; PWR9LE-NEXT:    xvminsp vs0, vs1, vs0
+; PWR9LE-NEXT:    xvminsp vs0, vs0, vs2
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xvminsp vs0, vs0, v2
+; PWR9LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR9LE-NEXT:    xvminsp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v32f32_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvminsp vs0, v4, v8
+; PWR9BE-NEXT:    xvminsp vs1, v2, v6
+; PWR9BE-NEXT:    xvminsp vs2, v5, v9
+; PWR9BE-NEXT:    xvminsp vs3, v3, v7
+; PWR9BE-NEXT:    xvminsp vs2, vs3, vs2
+; PWR9BE-NEXT:    xvminsp vs0, vs1, vs0
+; PWR9BE-NEXT:    xvminsp vs0, vs0, vs2
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xvminsp vs0, vs0, v2
+; PWR9BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR9BE-NEXT:    xvminsp vs0, vs0, vs1
+; PWR9BE-NEXT:    xscvspdpn f1, vs0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v32f32_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvminsp vs0, v4, v8
+; PWR10LE-NEXT:    xvminsp vs1, v2, v6
+; PWR10LE-NEXT:    xvminsp vs2, v5, v9
+; PWR10LE-NEXT:    xvminsp vs3, v3, v7
+; PWR10LE-NEXT:    xvminsp vs2, vs3, vs2
+; PWR10LE-NEXT:    xvminsp vs0, vs1, vs0
+; PWR10LE-NEXT:    xvminsp vs0, vs0, vs2
+; PWR10LE-NEXT:    xxswapd v2, vs0
+; PWR10LE-NEXT:    xvminsp vs0, vs0, v2
+; PWR10LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR10LE-NEXT:    xvminsp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v32f32_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvminsp vs0, v4, v8
+; PWR10BE-NEXT:    xvminsp vs1, v2, v6
+; PWR10BE-NEXT:    xvminsp vs2, v5, v9
+; PWR10BE-NEXT:    xvminsp vs3, v3, v7
+; PWR10BE-NEXT:    xvminsp vs2, vs3, vs2
+; PWR10BE-NEXT:    xvminsp vs0, vs1, vs0
+; PWR10BE-NEXT:    xvminsp vs0, vs0, vs2
+; PWR10BE-NEXT:    xxswapd v2, vs0
+; PWR10BE-NEXT:    xvminsp vs0, vs0, v2
+; PWR10BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR10BE-NEXT:    xvminsp vs0, vs0, vs1
+; PWR10BE-NEXT:    xscvspdpn f1, vs0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast float @llvm.vector.reduce.fmin.v32f32(<32 x float> %a)  ; fast-math fmin reduction: per CHECKs, fully vectorized xvminsp tree with shuffle-based horizontal min (no scalar chain)
+  ret float %0
+}
+
+declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>) #0
+declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) #0
+declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>) #0
+declare float @llvm.vector.reduce.fmin.v16f32(<16 x float>) #0
+declare float @llvm.vector.reduce.fmin.v32f32(<32 x float>) #0
+
+;;
+;; Vectors of f64
+;;
+define dso_local double @v2f64(<2 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2f64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs0, v2
+; PWR9LE-NEXT:    xsmindp f1, f0, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2f64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd vs0, v2
+; PWR9BE-NEXT:    xsmindp f1, v2, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2f64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs0, v2
+; PWR10LE-NEXT:    xsmindp f1, f0, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2f64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd vs0, v2
+; PWR10BE-NEXT:    xsmindp f1, v2, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a)  ; ordered (non-fast) fmin reduction: per CHECKs, swap halves then a single scalar xsmindp
+  ret double %0
+}
+
+define dso_local double @v2f64_fast(<2 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2f64_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs0, v2
+; PWR9LE-NEXT:    xvmindp vs0, v2, vs0
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2f64_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd vs0, v2
+; PWR9BE-NEXT:    xvmindp vs1, v2, vs0
+; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2f64_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs0, v2
+; PWR10LE-NEXT:    xvmindp vs0, v2, vs0
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2f64_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd vs0, v2
+; PWR10BE-NEXT:    xvmindp vs1, v2, vs0
+; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a)  ; fast-math fmin reduction: per CHECKs, vector xvmindp against the swapped lane instead of a scalar min
+  ret double %0
+}
+
+define dso_local double @v4f64(<4 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4f64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvmindp vs0, v2, v3
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xsmindp f1, f1, f0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4f64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvmindp vs0, v2, v3
+; PWR9BE-NEXT:    xxswapd vs1, vs0
+; PWR9BE-NEXT:    xsmindp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4f64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvmindp vs0, v2, v3
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xsmindp f1, f1, f0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4f64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvmindp vs0, v2, v3
+; PWR10BE-NEXT:    xxswapd vs1, vs0
+; PWR10BE-NEXT:    xsmindp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %a)  ; ordered (non-fast) fmin reduction: per CHECKs, one vector xvmindp then scalar xsmindp across the two lanes
+  ret double %0
+}
+
+define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4f64_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvmindp vs0, v2, v3
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xvmindp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4f64_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvmindp vs0, v2, v3
+; PWR9BE-NEXT:    xxswapd vs1, vs0
+; PWR9BE-NEXT:    xvmindp vs1, vs0, vs1
+; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4f64_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvmindp vs0, v2, v3
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xvmindp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4f64_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvmindp vs0, v2, v3
+; PWR10BE-NEXT:    xxswapd vs1, vs0
+; PWR10BE-NEXT:    xvmindp vs1, vs0, vs1
+; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> %a)  ; fast-math fmin reduction: per CHECKs, all-vector xvmindp tree, horizontal step via xxswapd
+  ret double %0
+}
+
+define dso_local double @v8f64(<8 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8f64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvmindp vs0, v3, v5
+; PWR9LE-NEXT:    xvmindp vs1, v2, v4
+; PWR9LE-NEXT:    xvmindp vs0, vs1, vs0
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xsmindp f1, f1, f0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8f64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvmindp vs0, v3, v5
+; PWR9BE-NEXT:    xvmindp vs1, v2, v4
+; PWR9BE-NEXT:    xvmindp vs0, vs1, vs0
+; PWR9BE-NEXT:    xxswapd vs1, vs0
+; PWR9BE-NEXT:    xsmindp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8f64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvmindp vs0, v3, v5
+; PWR10LE-NEXT:    xvmindp vs1, v2, v4
+; PWR10LE-NEXT:    xvmindp vs0, vs1, vs0
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xsmindp f1, f1, f0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8f64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvmindp vs0, v3, v5
+; PWR10BE-NEXT:    xvmindp vs1, v2, v4
+; PWR10BE-NEXT:    xvmindp vs0, vs1, vs0
+; PWR10BE-NEXT:    xxswapd vs1, vs0
+; PWR10BE-NEXT:    xsmindp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> %a)  ; ordered (non-fast) fmin reduction: per CHECKs, xvmindp tree across 4 registers, final scalar xsmindp
+  ret double %0
+}
+
+define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8f64_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvmindp vs0, v3, v5
+; PWR9LE-NEXT:    xvmindp vs1, v2, v4
+; PWR9LE-NEXT:    xvmindp vs0, vs1, vs0
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xvmindp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8f64_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvmindp vs0, v3, v5
+; PWR9BE-NEXT:    xvmindp vs1, v2, v4
+; PWR9BE-NEXT:    xvmindp vs0, vs1, vs0
+; PWR9BE-NEXT:    xxswapd vs1, vs0
+; PWR9BE-NEXT:    xvmindp vs1, vs0, vs1
+; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8f64_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvmindp vs0, v3, v5
+; PWR10LE-NEXT:    xvmindp vs1, v2, v4
+; PWR10LE-NEXT:    xvmindp vs0, vs1, vs0
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xvmindp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8f64_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvmindp vs0, v3, v5
+; PWR10BE-NEXT:    xvmindp vs1, v2, v4
+; PWR10BE-NEXT:    xvmindp vs0, vs1, vs0
+; PWR10BE-NEXT:    xxswapd vs1, vs0
+; PWR10BE-NEXT:    xvmindp vs1, vs0, vs1
+; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast double @llvm.vector.reduce.fmin.v8f64(<8 x double> %a)  ; fast-math fmin reduction: per CHECKs, all-vector xvmindp tree with xxswapd horizontal step
+  ret double %0
+}
+
+define dso_local double @v16f64(<16 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16f64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvmindp vs0, v5, v9
+; PWR9LE-NEXT:    xvmindp vs1, v3, v7
+; PWR9LE-NEXT:    xvmindp vs2, v2, v6
+; PWR9LE-NEXT:    xvmindp vs0, vs1, vs0
+; PWR9LE-NEXT:    xvmindp vs1, v4, v8
+; PWR9LE-NEXT:    xvmindp vs1, vs2, vs1
+; PWR9LE-NEXT:    xvmindp vs0, vs1, vs0
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xsmindp f1, f1, f0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16f64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvmindp vs0, v5, v9
+; PWR9BE-NEXT:    xvmindp vs1, v3, v7
+; PWR9BE-NEXT:    xvmindp vs2, v2, v6
+; PWR9BE-NEXT:    xvmindp vs0, vs1, vs0
+; PWR9BE-NEXT:    xvmindp vs1, v4, v8
+; PWR9BE-NEXT:    xvmindp vs1, vs2, vs1
+; PWR9BE-NEXT:    xvmindp vs0, vs1, vs0
+; PWR9BE-NEXT:    xxswapd vs1, vs0
+; PWR9BE-NEXT:    xsmindp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16f64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvmindp vs0, v5, v9
+; PWR10LE-NEXT:    xvmindp vs1, v3, v7
+; PWR10LE-NEXT:    xvmindp vs2, v2, v6
+; PWR10LE-NEXT:    xvmindp vs0, vs1, vs0
+; PWR10LE-NEXT:    xvmindp vs1, v4, v8
+; PWR10LE-NEXT:    xvmindp vs1, vs2, vs1
+; PWR10LE-NEXT:    xvmindp vs0, vs1, vs0
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xsmindp f1, f1, f0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16f64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvmindp vs0, v5, v9
+; PWR10BE-NEXT:    xvmindp vs1, v3, v7
+; PWR10BE-NEXT:    xvmindp vs2, v2, v6
+; PWR10BE-NEXT:    xvmindp vs0, vs1, vs0
+; PWR10BE-NEXT:    xvmindp vs1, v4, v8
+; PWR10BE-NEXT:    xvmindp vs1, vs2, vs1
+; PWR10BE-NEXT:    xvmindp vs0, vs1, vs0
+; PWR10BE-NEXT:    xxswapd vs1, vs0
+; PWR10BE-NEXT:    xsmindp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> %a)  ; ordered (non-fast) fmin reduction: per CHECKs, xvmindp tree across 8 registers, final scalar xsmindp
+  ret double %0
+}
+
+define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16f64_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvmindp vs0, v4, v8
+; PWR9LE-NEXT:    xvmindp vs1, v2, v6
+; PWR9LE-NEXT:    xvmindp vs2, v5, v9
+; PWR9LE-NEXT:    xvmindp vs3, v3, v7
+; PWR9LE-NEXT:    xvmindp vs2, vs3, vs2
+; PWR9LE-NEXT:    xvmindp vs0, vs1, vs0
+; PWR9LE-NEXT:    xvmindp vs0, vs0, vs2
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xvmindp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16f64_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvmindp vs0, v4, v8
+; PWR9BE-NEXT:    xvmindp vs1, v2, v6
+; PWR9BE-NEXT:    xvmindp vs2, v5, v9
+; PWR9BE-NEXT:    xvmindp vs3, v3, v7
+; PWR9BE-NEXT:    xvmindp vs2, vs3, vs2
+; PWR9BE-NEXT:    xvmindp vs0, vs1, vs0
+; PWR9BE-NEXT:    xvmindp vs0, vs0, vs2
+; PWR9BE-NEXT:    xxswapd vs1, vs0
+; PWR9BE-NEXT:    xvmindp vs1, vs0, vs1
+; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16f64_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvmindp vs0, v4, v8
+; PWR10LE-NEXT:    xvmindp vs1, v2, v6
+; PWR10LE-NEXT:    xvmindp vs2, v5, v9
+; PWR10LE-NEXT:    xvmindp vs3, v3, v7
+; PWR10LE-NEXT:    xvmindp vs2, vs3, vs2
+; PWR10LE-NEXT:    xvmindp vs0, vs1, vs0
+; PWR10LE-NEXT:    xvmindp vs0, vs0, vs2
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xvmindp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16f64_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvmindp vs0, v4, v8
+; PWR10BE-NEXT:    xvmindp vs1, v2, v6
+; PWR10BE-NEXT:    xvmindp vs2, v5, v9
+; PWR10BE-NEXT:    xvmindp vs3, v3, v7
+; PWR10BE-NEXT:    xvmindp vs2, vs3, vs2
+; PWR10BE-NEXT:    xvmindp vs0, vs1, vs0
+; PWR10BE-NEXT:    xvmindp vs0, vs0, vs2
+; PWR10BE-NEXT:    xxswapd vs1, vs0
+; PWR10BE-NEXT:    xvmindp vs1, vs0, vs1
+; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> %a)  ; fast-math fmin reduction: per CHECKs, all-vector xvmindp tree with xxswapd horizontal step
+  ret double %0
+}
+
+define dso_local double @v32f64(<32 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v32f64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    lxv vs3, 272(r1)
+; PWR9LE-NEXT:    lxv vs2, 240(r1)
+; PWR9LE-NEXT:    xvmindp vs4, v5, v13
+; PWR9LE-NEXT:    lxv vs1, 256(r1)
+; PWR9LE-NEXT:    lxv vs0, 224(r1)
+; PWR9LE-NEXT:    xvmindp vs3, v9, vs3
+; PWR9LE-NEXT:    xvmindp vs2, v7, vs2
+; PWR9LE-NEXT:    xvmindp vs1, v8, vs1
+; PWR9LE-NEXT:    xvmindp vs0, v6, vs0
+; PWR9LE-NEXT:    xvmindp vs3, vs4, vs3
+; PWR9LE-NEXT:    xvmindp vs4, v3, v11
+; PWR9LE-NEXT:    xvmindp vs2, vs4, vs2
+; PWR9LE-NEXT:    xvmindp vs2, vs2, vs3
+; PWR9LE-NEXT:    xvmindp vs3, v4, v12
+; PWR9LE-NEXT:    xvmindp vs1, vs3, vs1
+; PWR9LE-NEXT:    xvmindp vs3, v2, v10
+; PWR9LE-NEXT:    xvmindp vs0, vs3, vs0
+; PWR9LE-NEXT:    xvmindp vs0, vs0, vs1
+; PWR9LE-NEXT:    xvmindp vs0, vs0, vs2
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xsmindp f1, f1, f0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v32f64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    lxv vs3, 288(r1)
+; PWR9BE-NEXT:    lxv vs2, 256(r1)
+; PWR9BE-NEXT:    xvmindp vs4, v5, v13
+; PWR9BE-NEXT:    lxv vs1, 272(r1)
+; PWR9BE-NEXT:    lxv vs0, 240(r1)
+; PWR9BE-NEXT:    xvmindp vs3, v9, vs3
+; PWR9BE-NEXT:    xvmindp vs2, v7, vs2
+; PWR9BE-NEXT:    xvmindp vs1, v8, vs1
+; PWR9BE-NEXT:    xvmindp vs0, v6, vs0
+; PWR9BE-NEXT:    xvmindp vs3, vs4, vs3
+; PWR9BE-NEXT:    xvmindp vs4, v3, v11
+; PWR9BE-NEXT:    xvmindp vs2, vs4, vs2
+; PWR9BE-NEXT:    xvmindp vs2, vs2, vs3
+; PWR9BE-NEXT:    xvmindp vs3, v4, v12
+; PWR9BE-NEXT:    xvmindp vs1, vs3, vs1
+; PWR9BE-NEXT:    xvmindp vs3, v2, v10
+; PWR9BE-NEXT:    xvmindp vs0, vs3, vs0
+; PWR9BE-NEXT:    xvmindp vs0, vs0, vs1
+; PWR9BE-NEXT:    xvmindp vs0, vs0, vs2
+; PWR9BE-NEXT:    xxswapd vs1, vs0
+; PWR9BE-NEXT:    xsmindp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v32f64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    lxv vs3, 272(r1)
+; PWR10LE-NEXT:    lxv vs2, 240(r1)
+; PWR10LE-NEXT:    xvmindp vs4, v5, v13
+; PWR10LE-NEXT:    xvmindp vs3, v9, vs3
+; PWR10LE-NEXT:    lxv vs1, 256(r1)
+; PWR10LE-NEXT:    xvmindp vs2, v7, vs2
+; PWR10LE-NEXT:    lxv vs0, 224(r1)
+; PWR10LE-NEXT:    xvmindp vs1, v8, vs1
+; PWR10LE-NEXT:    xvmindp vs0, v6, vs0
+; PWR10LE-NEXT:    xvmindp vs3, vs4, vs3
+; PWR10LE-NEXT:    xvmindp vs4, v3, v11
+; PWR10LE-NEXT:    xvmindp vs2, vs4, vs2
+; PWR10LE-NEXT:    xvmindp vs2, vs2, vs3
+; PWR10LE-NEXT:    xvmindp vs3, v4, v12
+; PWR10LE-NEXT:    xvmindp vs1, vs3, vs1
+; PWR10LE-NEXT:    xvmindp vs3, v2, v10
+; PWR10LE-NEXT:    xvmindp vs0, vs3, vs0
+; PWR10LE-NEXT:    xvmindp vs0, vs0, vs1
+; PWR10LE-NEXT:    xvmindp vs0, vs0, vs2
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xsmindp f1, f1, f0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v32f64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    lxv vs3, 288(r1)
+; PWR10BE-NEXT:    lxv vs2, 256(r1)
+; PWR10BE-NEXT:    xvmindp vs4, v5, v13
+; PWR10BE-NEXT:    xvmindp vs3, v9, vs3
+; PWR10BE-NEXT:    lxv vs1, 272(r1)
+; PWR10BE-NEXT:    xvmindp vs2, v7, vs2
+; PWR10BE-NEXT:    lxv vs0, 240(r1)
+; PWR10BE-NEXT:    xvmindp vs1, v8, vs1
+; PWR10BE-NEXT:    xvmindp vs0, v6, vs0
+; PWR10BE-NEXT:    xvmindp vs3, vs4, vs3
+; PWR10BE-NEXT:    xvmindp vs4, v3, v11
+; PWR10BE-NEXT:    xvmindp vs2, vs4, vs2
+; PWR10BE-NEXT:    xvmindp vs2, vs2, vs3
+; PWR10BE-NEXT:    xvmindp vs3, v4, v12
+; PWR10BE-NEXT:    xvmindp vs1, vs3, vs1
+; PWR10BE-NEXT:    xvmindp vs3, v2, v10
+; PWR10BE-NEXT:    xvmindp vs0, vs3, vs0
+; PWR10BE-NEXT:    xvmindp vs0, vs0, vs1
+; PWR10BE-NEXT:    xvmindp vs0, vs0, vs2
+; PWR10BE-NEXT:    xxswapd vs1, vs0
+; PWR10BE-NEXT:    xsmindp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fmin.v32f64(<32 x double> %a)  ; ordered (non-fast) fmin reduction; operands beyond v2-v13 come off the stack (lxv from r1), then xvmindp tree + scalar xsmindp
+  ret double %0
+}
+
+define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v32f64_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    lxv vs0, 256(r1)
+; PWR9LE-NEXT:    lxv vs1, 224(r1)
+; PWR9LE-NEXT:    lxv vs2, 272(r1)
+; PWR9LE-NEXT:    lxv vs3, 240(r1)
+; PWR9LE-NEXT:    xvmindp vs4, v3, v11
+; PWR9LE-NEXT:    xvmindp vs5, v5, v13
+; PWR9LE-NEXT:    xvmindp vs6, v2, v10
+; PWR9LE-NEXT:    xvmindp vs7, v4, v12
+; PWR9LE-NEXT:    xvmindp vs3, v7, vs3
+; PWR9LE-NEXT:    xvmindp vs2, v9, vs2
+; PWR9LE-NEXT:    xvmindp vs1, v6, vs1
+; PWR9LE-NEXT:    xvmindp vs0, v8, vs0
+; PWR9LE-NEXT:    xvmindp vs0, vs7, vs0
+; PWR9LE-NEXT:    xvmindp vs1, vs6, vs1
+; PWR9LE-NEXT:    xvmindp vs2, vs5, vs2
+; PWR9LE-NEXT:    xvmindp vs3, vs4, vs3
+; PWR9LE-NEXT:    xvmindp vs2, vs3, vs2
+; PWR9LE-NEXT:    xvmindp vs0, vs1, vs0
+; PWR9LE-NEXT:    xvmindp vs0, vs0, vs2
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xvmindp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v32f64_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    lxv vs0, 272(r1)
+; PWR9BE-NEXT:    lxv vs1, 240(r1)
+; PWR9BE-NEXT:    lxv vs2, 288(r1)
+; PWR9BE-NEXT:    lxv vs3, 256(r1)
+; PWR9BE-NEXT:    xvmindp vs4, v3, v11
+; PWR9BE-NEXT:    xvmindp vs5, v5, v13
+; PWR9BE-NEXT:    xvmindp vs6, v2, v10
+; PWR9BE-NEXT:    xvmindp vs7, v4, v12
+; PWR9BE-NEXT:    xvmindp vs3, v7, vs3
+; PWR9BE-NEXT:    xvmindp vs2, v9, vs2
+; PWR9BE-NEXT:    xvmindp vs1, v6, vs1
+; PWR9BE-NEXT:    xvmindp vs0, v8, vs0
+; PWR9BE-NEXT:    xvmindp vs0, vs7, vs0
+; PWR9BE-NEXT:    xvmindp vs1, vs6, vs1
+; PWR9BE-NEXT:    xvmindp vs2, vs5, vs2
+; PWR9BE-NEXT:    xvmindp vs3, vs4, vs3
+; PWR9BE-NEXT:    xvmindp vs2, vs3, vs2
+; PWR9BE-NEXT:    xvmindp vs0, vs1, vs0
+; PWR9BE-NEXT:    xvmindp vs0, vs0, vs2
+; PWR9BE-NEXT:    xxswapd vs1, vs0
+; PWR9BE-NEXT:    xvmindp vs1, vs0, vs1
+; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v32f64_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    lxv vs0, 256(r1)
+; PWR10LE-NEXT:    lxv vs1, 224(r1)
+; PWR10LE-NEXT:    xvmindp vs4, v3, v11
+; PWR10LE-NEXT:    xvmindp vs5, v5, v13
+; PWR10LE-NEXT:    xvmindp vs6, v2, v10
+; PWR10LE-NEXT:    xvmindp vs7, v4, v12
+; PWR10LE-NEXT:    xvmindp vs1, v6, vs1
+; PWR10LE-NEXT:    lxv vs2, 272(r1)
+; PWR10LE-NEXT:    lxv vs3, 240(r1)
+; PWR10LE-NEXT:    xvmindp vs3, v7, vs3
+; PWR10LE-NEXT:    xvmindp vs2, v9, vs2
+; PWR10LE-NEXT:    xvmindp vs0, v8, vs0
+; PWR10LE-NEXT:    xvmindp vs0, vs7, vs0
+; PWR10LE-NEXT:    xvmindp vs1, vs6, vs1
+; PWR10LE-NEXT:    xvmindp vs2, vs5, vs2
+; PWR10LE-NEXT:    xvmindp vs3, vs4, vs3
+; PWR10LE-NEXT:    xvmindp vs2, vs3, vs2
+; PWR10LE-NEXT:    xvmindp vs0, vs1, vs0
+; PWR10LE-NEXT:    xvmindp vs0, vs0, vs2
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xvmindp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v32f64_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    lxv vs0, 272(r1)
+; PWR10BE-NEXT:    lxv vs1, 240(r1)
+; PWR10BE-NEXT:    xvmindp vs4, v3, v11
+; PWR10BE-NEXT:    xvmindp vs5, v5, v13
+; PWR10BE-NEXT:    xvmindp vs6, v2, v10
+; PWR10BE-NEXT:    xvmindp vs7, v4, v12
+; PWR10BE-NEXT:    xvmindp vs1, v6, vs1
+; PWR10BE-NEXT:    lxv vs2, 288(r1)
+; PWR10BE-NEXT:    lxv vs3, 256(r1)
+; PWR10BE-NEXT:    xvmindp vs3, v7, vs3
+; PWR10BE-NEXT:    xvmindp vs2, v9, vs2
+; PWR10BE-NEXT:    xvmindp vs0, v8, vs0
+; PWR10BE-NEXT:    xvmindp vs0, vs7, vs0
+; PWR10BE-NEXT:    xvmindp vs1, vs6, vs1
+; PWR10BE-NEXT:    xvmindp vs2, vs5, vs2
+; PWR10BE-NEXT:    xvmindp vs3, vs4, vs3
+; PWR10BE-NEXT:    xvmindp vs2, vs3, vs2
+; PWR10BE-NEXT:    xvmindp vs0, vs1, vs0
+; PWR10BE-NEXT:    xvmindp vs0, vs0, vs2
+; PWR10BE-NEXT:    xxswapd vs1, vs0
+; PWR10BE-NEXT:    xvmindp vs1, vs0, vs1
+; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast double @llvm.vector.reduce.fmin.v32f64(<32 x double> %a)  ; fast-math fmin reduction; stack-passed halves loaded via lxv, all-vector xvmindp tree with xxswapd horizontal step
+  ret double %0
+}
+
+declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>) #0
+declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>) #0
+declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>) #0
+declare double @llvm.vector.reduce.fmin.v16f64(<16 x double>) #0
+declare double @llvm.vector.reduce.fmin.v32f64(<32 x double>) #0
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-fmul.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-fmul.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-fmul.ll
@@ -0,0 +1,1717 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr10 -mattr=-paired-vector-memops -mtriple=powerpc64le < %s | \
+; RUN:   FileCheck %s --check-prefix=PWR10LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr10 -mattr=-paired-vector-memops -mtriple=powerpc64 < %s | \
+; RUN:   FileCheck %s --check-prefix=PWR10BE
+
+;;
+;; Vectors of f32
+;;
+define dso_local float @v2f32(<2 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2f32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR9LE-NEXT:    xxswapd vs1, v2
+; PWR9LE-NEXT:    xscvspdpn f0, vs0
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f1, f0, f1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2f32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9BE-NEXT:    xscvspdpn f0, v2
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2f32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR10LE-NEXT:    xxswapd vs1, v2
+; PWR10LE-NEXT:    xscvspdpn f0, vs0
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f1, f0, f1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2f32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10BE-NEXT:    xscvspdpn f0, v2
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fmul.v2f32(float 1.000000e+00, <2 x float> %a) ; no fast-math flags: ordered reduction seeded with 1.0 (scalar xsmulsp chain)
+  ret float %0
+}
+
+define dso_local float @v2f32_b(<2 x float> %a, float %b) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2f32_b:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR9LE-NEXT:    xscvspdpn f0, vs0
+; PWR9LE-NEXT:    xsmulsp f0, f1, f0
+; PWR9LE-NEXT:    xxswapd vs1, v2
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f1, f0, f1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2f32_b:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xscvspdpn f0, v2
+; PWR9BE-NEXT:    xsmulsp f0, f1, f0
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2f32_b:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR10LE-NEXT:    xscvspdpn f0, vs0
+; PWR10LE-NEXT:    xsmulsp f0, f1, f0
+; PWR10LE-NEXT:    xxswapd vs1, v2
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f1, f0, f1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2f32_b:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xscvspdpn f0, v2
+; PWR10BE-NEXT:    xsmulsp f0, f1, f0
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fmul.v2f32(float %b, <2 x float> %a) ; ordered reduction seeded with runtime %b (f1 on entry) instead of constant 1.0
+  ret float %0
+}
+
+define dso_local float @v2f32_fast(<2 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2f32_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxspltw vs0, v2, 2
+; PWR9LE-NEXT:    xvmulsp vs0, v2, vs0
+; PWR9LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2f32_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxspltw vs0, v2, 1
+; PWR9BE-NEXT:    xvmulsp vs0, v2, vs0
+; PWR9BE-NEXT:    xscvspdpn f1, vs0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2f32_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxspltw vs0, v2, 2
+; PWR10LE-NEXT:    xvmulsp vs0, v2, vs0
+; PWR10LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2f32_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxspltw vs0, v2, 1
+; PWR10BE-NEXT:    xvmulsp vs0, v2, vs0
+; PWR10BE-NEXT:    xscvspdpn f1, vs0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast float @llvm.vector.reduce.fmul.v2f32(float 1.000000e+00, <2 x float> %a) ; 'fast' permits reassociation: vector xvmulsp instead of a scalar chain
+  ret float %0
+}
+
+define dso_local float @v4f32(<4 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4f32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR9LE-NEXT:    xxswapd vs1, v2
+; PWR9LE-NEXT:    xscvspdpn f0, vs0
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v2
+; PWR9LE-NEXT:    xsmulsp f1, f0, f1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4f32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9BE-NEXT:    xscvspdpn f0, v2
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v2
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4f32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR10LE-NEXT:    xxswapd vs1, v2
+; PWR10LE-NEXT:    xscvspdpn f0, vs0
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v2
+; PWR10LE-NEXT:    xsmulsp f1, f0, f1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4f32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10BE-NEXT:    xscvspdpn f0, v2
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v2
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> %a) ; no fast-math flags: in-order scalar multiply chain across all 4 lanes
+  ret float %0
+}
+
+define dso_local float @v4f32_b(<4 x float> %a, float %b) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4f32_b:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR9LE-NEXT:    xscvspdpn f0, vs0
+; PWR9LE-NEXT:    xsmulsp f0, f1, f0
+; PWR9LE-NEXT:    xxswapd vs1, v2
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v2
+; PWR9LE-NEXT:    xsmulsp f1, f0, f1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4f32_b:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xscvspdpn f0, v2
+; PWR9BE-NEXT:    xsmulsp f0, f1, f0
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v2
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4f32_b:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR10LE-NEXT:    xscvspdpn f0, vs0
+; PWR10LE-NEXT:    xsmulsp f0, f1, f0
+; PWR10LE-NEXT:    xxswapd vs1, v2
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v2
+; PWR10LE-NEXT:    xsmulsp f1, f0, f1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4f32_b:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xscvspdpn f0, v2
+; PWR10BE-NEXT:    xsmulsp f0, f1, f0
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v2
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fmul.v4f32(float %b, <4 x float> %a) ; ordered reduction seeded with runtime %b (f1 on entry)
+  ret float %0
+}
+
+define dso_local float @v4f32_fast(<4 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4f32_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    xvmulsp vs0, v2, v3
+; PWR9LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR9LE-NEXT:    xvmulsp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4f32_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    xvmulsp vs0, v2, v3
+; PWR9BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR9BE-NEXT:    xvmulsp vs0, vs0, vs1
+; PWR9BE-NEXT:    xscvspdpn f1, vs0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4f32_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    xvmulsp vs0, v2, v3
+; PWR10LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR10LE-NEXT:    xvmulsp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4f32_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    xvmulsp vs0, v2, v3
+; PWR10BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR10BE-NEXT:    xvmulsp vs0, vs0, vs1
+; PWR10BE-NEXT:    xscvspdpn f1, vs0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> %a) ; 'fast' permits a log2-depth shuffle+xvmulsp tree
+  ret float %0
+}
+
+define dso_local float @v8f32(<8 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8f32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR9LE-NEXT:    xxswapd vs1, v2
+; PWR9LE-NEXT:    xscvspdpn f0, vs0
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v2
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v3
+; PWR9LE-NEXT:    xsmulsp f1, f0, f1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8f32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9BE-NEXT:    xscvspdpn f0, v2
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v2
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xscvspdpn f1, v3
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8f32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR10LE-NEXT:    xxswapd vs1, v2
+; PWR10LE-NEXT:    xscvspdpn f0, vs0
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v2
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v3
+; PWR10LE-NEXT:    xsmulsp f1, f0, f1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8f32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10BE-NEXT:    xscvspdpn f0, v2
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v2
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xscvspdpn f1, v3
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fmul.v8f32(float 1.000000e+00, <8 x float> %a) ; no fast-math flags: in-order scalar chain over both input registers (v2, v3)
+  ret float %0
+}
+
+define dso_local float @v8f32_b(<8 x float> %a, float %b) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8f32_b:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR9LE-NEXT:    xscvspdpn f0, vs0
+; PWR9LE-NEXT:    xsmulsp f0, f1, f0
+; PWR9LE-NEXT:    xxswapd vs1, v2
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v2
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v3
+; PWR9LE-NEXT:    xsmulsp f1, f0, f1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8f32_b:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xscvspdpn f0, v2
+; PWR9BE-NEXT:    xsmulsp f0, f1, f0
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v2
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xscvspdpn f1, v3
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8f32_b:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR10LE-NEXT:    xscvspdpn f0, vs0
+; PWR10LE-NEXT:    xsmulsp f0, f1, f0
+; PWR10LE-NEXT:    xxswapd vs1, v2
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v2
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v3
+; PWR10LE-NEXT:    xsmulsp f1, f0, f1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8f32_b:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xscvspdpn f0, v2
+; PWR10BE-NEXT:    xsmulsp f0, f1, f0
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v2
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xscvspdpn f1, v3
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fmul.v8f32(float %b, <8 x float> %a) ; ordered reduction seeded with runtime %b (f1 on entry)
+  ret float %0
+}
+
+define dso_local float @v8f32_fast(<8 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8f32_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvmulsp vs0, v2, v3
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xvmulsp vs0, vs0, v2
+; PWR9LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR9LE-NEXT:    xvmulsp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8f32_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvmulsp vs0, v2, v3
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xvmulsp vs0, vs0, v2
+; PWR9BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR9BE-NEXT:    xvmulsp vs0, vs0, vs1
+; PWR9BE-NEXT:    xscvspdpn f1, vs0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8f32_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvmulsp vs0, v2, v3
+; PWR10LE-NEXT:    xxswapd v2, vs0
+; PWR10LE-NEXT:    xvmulsp vs0, vs0, v2
+; PWR10LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR10LE-NEXT:    xvmulsp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8f32_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvmulsp vs0, v2, v3
+; PWR10BE-NEXT:    xxswapd v2, vs0
+; PWR10BE-NEXT:    xvmulsp vs0, vs0, v2
+; PWR10BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR10BE-NEXT:    xvmulsp vs0, vs0, vs1
+; PWR10BE-NEXT:    xscvspdpn f1, vs0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast float @llvm.vector.reduce.fmul.v8f32(float 1.000000e+00, <8 x float> %a) ; 'fast' permits pairwise xvmulsp tree (v2*v3 first, then halving shuffles)
+  ret float %0
+}
+
+define dso_local float @v16f32(<16 x float> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16f32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR9LE-NEXT:    xxswapd vs1, v2
+; PWR9LE-NEXT:    xscvspdpn f0, vs0
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v2
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v3
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v4, v4, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v4
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v4, v4, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v4
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v5, v5, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v5
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v5, v5, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v5
+; PWR9LE-NEXT:    xsmulsp f1, f0, f1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16f32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9BE-NEXT:    xscvspdpn f0, v2
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v2
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xscvspdpn f1, v3
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xscvspdpn f1, v4
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v4, v4, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v4
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v4, v4, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xscvspdpn f1, v5
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v5, v5, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v5
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v5, v5, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16f32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR10LE-NEXT:    xxswapd vs1, v2
+; PWR10LE-NEXT:    xscvspdpn f0, vs0
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v2
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v3
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v4, v4, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v4
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v4, v4, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v4
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v5, v5, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v5
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v5, v5, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v5
+; PWR10LE-NEXT:    xsmulsp f1, f0, f1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16f32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10BE-NEXT:    xscvspdpn f0, v2
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v2
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xscvspdpn f1, v3
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xscvspdpn f1, v4
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v4, v4, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v4
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v4, v4, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xscvspdpn f1, v5
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v5, v5, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v5
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v5, v5, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %a) ; no fast-math flags: in-order scalar chain over all four input registers (v2..v5)
+  ret float %0
+}
+
+define dso_local float @v16f32_b(<16 x float> %a, float %b) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16f32_b:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR9LE-NEXT:    xscvspdpn f0, vs0
+; PWR9LE-NEXT:    xsmulsp f0, f1, f0
+; PWR9LE-NEXT:    xxswapd vs1, v2
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v2
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v3
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v4, v4, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v4
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v4, v4, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v4
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v5, v5, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v5
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xxsldwi vs1, v5, v5, 1
+; PWR9LE-NEXT:    xscvspdpn f1, vs1
+; PWR9LE-NEXT:    xsmulsp f0, f0, f1
+; PWR9LE-NEXT:    xscvspdpn f1, v5
+; PWR9LE-NEXT:    xsmulsp f1, f0, f1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16f32_b:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xscvspdpn f0, v2
+; PWR9BE-NEXT:    xsmulsp f0, f1, f0
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v2
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xscvspdpn f1, v3
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xscvspdpn f1, v4
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v4, v4, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v4
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v4, v4, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xscvspdpn f1, v5
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v5, v5, 1
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v5
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f0, f0, f1
+; PWR9BE-NEXT:    xxsldwi vs1, v5, v5, 3
+; PWR9BE-NEXT:    xscvspdpn f1, vs1
+; PWR9BE-NEXT:    xsmulsp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16f32_b:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxsldwi vs0, v2, v2, 3
+; PWR10LE-NEXT:    xscvspdpn f0, vs0
+; PWR10LE-NEXT:    xsmulsp f0, f1, f0
+; PWR10LE-NEXT:    xxswapd vs1, v2
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v2
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v3
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v4, v4, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v4
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v4, v4, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v4
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v5, v5, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v5
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xxsldwi vs1, v5, v5, 1
+; PWR10LE-NEXT:    xscvspdpn f1, vs1
+; PWR10LE-NEXT:    xsmulsp f0, f0, f1
+; PWR10LE-NEXT:    xscvspdpn f1, v5
+; PWR10LE-NEXT:    xsmulsp f1, f0, f1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16f32_b:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xscvspdpn f0, v2
+; PWR10BE-NEXT:    xsmulsp f0, f1, f0
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v2
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v2, v2, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xscvspdpn f1, v3
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v3, v3, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v3, v3, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xscvspdpn f1, v4
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v4, v4, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v4
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v4, v4, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xscvspdpn f1, v5
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v5, v5, 1
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v5
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f0, f0, f1
+; PWR10BE-NEXT:    xxsldwi vs1, v5, v5, 3
+; PWR10BE-NEXT:    xscvspdpn f1, vs1
+; PWR10BE-NEXT:    xsmulsp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call float @llvm.vector.reduce.fmul.v16f32(float %b, <16 x float> %a)
+  ret float %0
+}
+
+define dso_local float @v16f32_fast(<16 x float> %a) local_unnamed_addr #0 { ; fast-math f32 product of 16 lanes, identity start 1.0
+; PWR9LE-LABEL: v16f32_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvmulsp vs0, v3, v5
+; PWR9LE-NEXT:    xvmulsp vs1, v2, v4
+; PWR9LE-NEXT:    xvmulsp vs0, vs1, vs0
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xvmulsp vs0, vs0, v2
+; PWR9LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR9LE-NEXT:    xvmulsp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR9LE-NEXT:    xscvspdpn f1, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16f32_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvmulsp vs0, v3, v5
+; PWR9BE-NEXT:    xvmulsp vs1, v2, v4
+; PWR9BE-NEXT:    xvmulsp vs0, vs1, vs0
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xvmulsp vs0, vs0, v2
+; PWR9BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR9BE-NEXT:    xvmulsp vs0, vs0, vs1
+; PWR9BE-NEXT:    xscvspdpn f1, vs0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16f32_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvmulsp vs0, v3, v5
+; PWR10LE-NEXT:    xvmulsp vs1, v2, v4
+; PWR10LE-NEXT:    xvmulsp vs0, vs1, vs0
+; PWR10LE-NEXT:    xxswapd v2, vs0
+; PWR10LE-NEXT:    xvmulsp vs0, vs0, v2
+; PWR10LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR10LE-NEXT:    xvmulsp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxsldwi vs0, vs0, vs0, 3
+; PWR10LE-NEXT:    xscvspdpn f1, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16f32_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvmulsp vs0, v3, v5
+; PWR10BE-NEXT:    xvmulsp vs1, v2, v4
+; PWR10BE-NEXT:    xvmulsp vs0, vs1, vs0
+; PWR10BE-NEXT:    xxswapd v2, vs0
+; PWR10BE-NEXT:    xvmulsp vs0, vs0, v2
+; PWR10BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR10BE-NEXT:    xvmulsp vs0, vs0, vs1
+; PWR10BE-NEXT:    xscvspdpn f1, vs0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %a) ; 'fast' permits reassociation -> vector xvmulsp tree reduction in the checks above
+  ret float %0
+}
+
+declare float @llvm.vector.reduce.fmul.v2f32(float, <2 x float>) #0
+declare float @llvm.vector.reduce.fmul.v4f32(float, <4 x float>) #0
+declare float @llvm.vector.reduce.fmul.v8f32(float, <8 x float>) #0
+declare float @llvm.vector.reduce.fmul.v16f32(float, <16 x float>) #0
+
+;;
+;; Vectors of f64
+;;
+define dso_local double @v2f64(<2 x double> %a) local_unnamed_addr #0 { ; ordered f64 product of 2 lanes, identity start 1.0
+; PWR9LE-LABEL: v2f64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs0, v2
+; PWR9LE-NEXT:    xsmuldp f1, f0, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2f64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd vs0, v2
+; PWR9BE-NEXT:    xsmuldp f1, v2, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2f64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs0, v2
+; PWR10LE-NEXT:    xsmuldp f1, f0, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2f64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd vs0, v2
+; PWR10BE-NEXT:    xsmuldp f1, v2, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fmul.v2f64(double 1.000000e+00, <2 x double> %a) ; no fast-math flags -> sequential scalar xsmuldp chain in the checks above
+  ret double %0
+}
+
+define dso_local double @v2f64_b(<2 x double> %a, double %b) local_unnamed_addr #0 { ; ordered f64 product of 2 lanes with caller-supplied start %b
+; PWR9LE-LABEL: v2f64_b:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs0, v2
+; PWR9LE-NEXT:    xsmuldp f0, f1, f0
+; PWR9LE-NEXT:    xsmuldp f1, f0, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2f64_b:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xsmuldp f0, f1, v2
+; PWR9BE-NEXT:    xxswapd vs1, v2
+; PWR9BE-NEXT:    xsmuldp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2f64_b:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs0, v2
+; PWR10LE-NEXT:    xsmuldp f0, f1, f0
+; PWR10LE-NEXT:    xsmuldp f1, f0, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2f64_b:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xsmuldp f0, f1, v2
+; PWR10BE-NEXT:    xxswapd vs1, v2
+; PWR10BE-NEXT:    xsmuldp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fmul.v2f64(double %b, <2 x double> %a) ; start value %b (f1) is multiplied in first; no FMF so order is fixed
+  ret double %0
+}
+
+define dso_local double @v2f64_fast(<2 x double> %a) local_unnamed_addr #0 { ; fast-math f64 product of 2 lanes, identity start 1.0
+; PWR9LE-LABEL: v2f64_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs0, v2
+; PWR9LE-NEXT:    xvmuldp vs0, v2, vs0
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2f64_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd vs0, v2
+; PWR9BE-NEXT:    xvmuldp vs1, v2, vs0
+; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2f64_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs0, v2
+; PWR10LE-NEXT:    xvmuldp vs0, v2, vs0
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2f64_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd vs0, v2
+; PWR10BE-NEXT:    xvmuldp vs1, v2, vs0
+; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast double @llvm.vector.reduce.fmul.v2f64(double 1.000000e+00, <2 x double> %a) ; 'fast' permits reassociation -> single vector xvmuldp in the checks above
+  ret double %0
+}
+
+define dso_local double @v4f64(<4 x double> %a) local_unnamed_addr #0 { ; ordered f64 product of 4 lanes, identity start 1.0
+; PWR9LE-LABEL: v4f64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs0, v2
+; PWR9LE-NEXT:    xxswapd vs1, v3
+; PWR9LE-NEXT:    xsmuldp f0, f0, v2
+; PWR9LE-NEXT:    xsmuldp f0, f0, f1
+; PWR9LE-NEXT:    xsmuldp f1, f0, v3
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4f64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd vs0, v2
+; PWR9BE-NEXT:    xxswapd vs1, v3
+; PWR9BE-NEXT:    xsmuldp f0, v2, f0
+; PWR9BE-NEXT:    xsmuldp f0, f0, v3
+; PWR9BE-NEXT:    xsmuldp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4f64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs0, v2
+; PWR10LE-NEXT:    xxswapd vs1, v3
+; PWR10LE-NEXT:    xsmuldp f0, f0, v2
+; PWR10LE-NEXT:    xsmuldp f0, f0, f1
+; PWR10LE-NEXT:    xsmuldp f1, f0, v3
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4f64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd vs0, v2
+; PWR10BE-NEXT:    xxswapd vs1, v3
+; PWR10BE-NEXT:    xsmuldp f0, v2, f0
+; PWR10BE-NEXT:    xsmuldp f0, f0, v3
+; PWR10BE-NEXT:    xsmuldp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fmul.v4f64(double 1.000000e+00, <4 x double> %a) ; no fast-math flags -> sequential scalar xsmuldp chain in the checks above
+  ret double %0
+}
+
+define dso_local double @v4f64_b(<4 x double> %a, double %b) local_unnamed_addr #0 { ; ordered f64 product of 4 lanes with caller-supplied start %b
+; PWR9LE-LABEL: v4f64_b:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs0, v2
+; PWR9LE-NEXT:    xsmuldp f0, f1, f0
+; PWR9LE-NEXT:    xxswapd vs1, v3
+; PWR9LE-NEXT:    xsmuldp f0, f0, v2
+; PWR9LE-NEXT:    xsmuldp f0, f0, f1
+; PWR9LE-NEXT:    xsmuldp f1, f0, v3
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4f64_b:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xsmuldp f0, f1, v2
+; PWR9BE-NEXT:    xxswapd vs1, v2
+; PWR9BE-NEXT:    xsmuldp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v3
+; PWR9BE-NEXT:    xsmuldp f0, f0, v3
+; PWR9BE-NEXT:    xsmuldp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4f64_b:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs0, v2
+; PWR10LE-NEXT:    xsmuldp f0, f1, f0
+; PWR10LE-NEXT:    xxswapd vs1, v3
+; PWR10LE-NEXT:    xsmuldp f0, f0, v2
+; PWR10LE-NEXT:    xsmuldp f0, f0, f1
+; PWR10LE-NEXT:    xsmuldp f1, f0, v3
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4f64_b:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xsmuldp f0, f1, v2
+; PWR10BE-NEXT:    xxswapd vs1, v2
+; PWR10BE-NEXT:    xsmuldp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v3
+; PWR10BE-NEXT:    xsmuldp f0, f0, v3
+; PWR10BE-NEXT:    xsmuldp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fmul.v4f64(double %b, <4 x double> %a) ; start value %b (f1) is multiplied in first; no FMF so order is fixed
+  ret double %0
+}
+
+define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 { ; fast-math f64 product of 4 lanes, identity start 1.0
+; PWR9LE-LABEL: v4f64_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvmuldp vs0, v2, v3
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xvmuldp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4f64_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvmuldp vs0, v2, v3
+; PWR9BE-NEXT:    xxswapd vs1, vs0
+; PWR9BE-NEXT:    xvmuldp vs1, vs0, vs1
+; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4f64_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvmuldp vs0, v2, v3
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xvmuldp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4f64_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvmuldp vs0, v2, v3
+; PWR10BE-NEXT:    xxswapd vs1, vs0
+; PWR10BE-NEXT:    xvmuldp vs1, vs0, vs1
+; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast double @llvm.vector.reduce.fmul.v4f64(double 1.000000e+00, <4 x double> %a) ; 'fast' permits reassociation -> vector xvmuldp tree reduction in the checks above
+  ret double %0
+}
+
+define dso_local double @v8f64(<8 x double> %a) local_unnamed_addr #0 { ; ordered f64 product of 8 lanes, identity start 1.0
+; PWR9LE-LABEL: v8f64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs0, v2
+; PWR9LE-NEXT:    xxswapd vs1, v3
+; PWR9LE-NEXT:    xsmuldp f0, f0, v2
+; PWR9LE-NEXT:    xsmuldp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v4
+; PWR9LE-NEXT:    xsmuldp f0, f0, v3
+; PWR9LE-NEXT:    xsmuldp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v5
+; PWR9LE-NEXT:    xsmuldp f0, f0, v4
+; PWR9LE-NEXT:    xsmuldp f0, f0, f1
+; PWR9LE-NEXT:    xsmuldp f1, f0, v5
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8f64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd vs0, v2
+; PWR9BE-NEXT:    xxswapd vs1, v3
+; PWR9BE-NEXT:    xsmuldp f0, v2, f0
+; PWR9BE-NEXT:    xsmuldp f0, f0, v3
+; PWR9BE-NEXT:    xsmuldp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v4
+; PWR9BE-NEXT:    xsmuldp f0, f0, v4
+; PWR9BE-NEXT:    xsmuldp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v5
+; PWR9BE-NEXT:    xsmuldp f0, f0, v5
+; PWR9BE-NEXT:    xsmuldp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8f64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs0, v2
+; PWR10LE-NEXT:    xxswapd vs1, v3
+; PWR10LE-NEXT:    xsmuldp f0, f0, v2
+; PWR10LE-NEXT:    xsmuldp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v4
+; PWR10LE-NEXT:    xsmuldp f0, f0, v3
+; PWR10LE-NEXT:    xsmuldp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v5
+; PWR10LE-NEXT:    xsmuldp f0, f0, v4
+; PWR10LE-NEXT:    xsmuldp f0, f0, f1
+; PWR10LE-NEXT:    xsmuldp f1, f0, v5
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8f64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd vs0, v2
+; PWR10BE-NEXT:    xxswapd vs1, v3
+; PWR10BE-NEXT:    xsmuldp f0, v2, f0
+; PWR10BE-NEXT:    xsmuldp f0, f0, v3
+; PWR10BE-NEXT:    xsmuldp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v4
+; PWR10BE-NEXT:    xsmuldp f0, f0, v4
+; PWR10BE-NEXT:    xsmuldp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v5
+; PWR10BE-NEXT:    xsmuldp f0, f0, v5
+; PWR10BE-NEXT:    xsmuldp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %a) ; no fast-math flags -> sequential scalar xsmuldp chain in the checks above
+  ret double %0
+}
+
+define dso_local double @v8f64_b(<8 x double> %a, double %b) local_unnamed_addr #0 { ; ordered f64 product of 8 lanes with caller-supplied start %b
+; PWR9LE-LABEL: v8f64_b:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs0, v2
+; PWR9LE-NEXT:    xsmuldp f0, f1, f0
+; PWR9LE-NEXT:    xxswapd vs1, v3
+; PWR9LE-NEXT:    xsmuldp f0, f0, v2
+; PWR9LE-NEXT:    xsmuldp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v4
+; PWR9LE-NEXT:    xsmuldp f0, f0, v3
+; PWR9LE-NEXT:    xsmuldp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v5
+; PWR9LE-NEXT:    xsmuldp f0, f0, v4
+; PWR9LE-NEXT:    xsmuldp f0, f0, f1
+; PWR9LE-NEXT:    xsmuldp f1, f0, v5
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8f64_b:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xsmuldp f0, f1, v2
+; PWR9BE-NEXT:    xxswapd vs1, v2
+; PWR9BE-NEXT:    xsmuldp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v3
+; PWR9BE-NEXT:    xsmuldp f0, f0, v3
+; PWR9BE-NEXT:    xsmuldp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v4
+; PWR9BE-NEXT:    xsmuldp f0, f0, v4
+; PWR9BE-NEXT:    xsmuldp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v5
+; PWR9BE-NEXT:    xsmuldp f0, f0, v5
+; PWR9BE-NEXT:    xsmuldp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8f64_b:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs0, v2
+; PWR10LE-NEXT:    xsmuldp f0, f1, f0
+; PWR10LE-NEXT:    xxswapd vs1, v3
+; PWR10LE-NEXT:    xsmuldp f0, f0, v2
+; PWR10LE-NEXT:    xsmuldp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v4
+; PWR10LE-NEXT:    xsmuldp f0, f0, v3
+; PWR10LE-NEXT:    xsmuldp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v5
+; PWR10LE-NEXT:    xsmuldp f0, f0, v4
+; PWR10LE-NEXT:    xsmuldp f0, f0, f1
+; PWR10LE-NEXT:    xsmuldp f1, f0, v5
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8f64_b:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xsmuldp f0, f1, v2
+; PWR10BE-NEXT:    xxswapd vs1, v2
+; PWR10BE-NEXT:    xsmuldp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v3
+; PWR10BE-NEXT:    xsmuldp f0, f0, v3
+; PWR10BE-NEXT:    xsmuldp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v4
+; PWR10BE-NEXT:    xsmuldp f0, f0, v4
+; PWR10BE-NEXT:    xsmuldp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v5
+; PWR10BE-NEXT:    xsmuldp f0, f0, v5
+; PWR10BE-NEXT:    xsmuldp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fmul.v8f64(double %b, <8 x double> %a) ; start value %b (f1) is multiplied in first; no FMF so order is fixed
+  ret double %0
+}
+
+define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 { ; fast-math f64 product of 8 lanes, identity start 1.0
+; PWR9LE-LABEL: v8f64_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvmuldp vs0, v3, v5
+; PWR9LE-NEXT:    xvmuldp vs1, v2, v4
+; PWR9LE-NEXT:    xvmuldp vs0, vs1, vs0
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xvmuldp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8f64_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvmuldp vs0, v3, v5
+; PWR9BE-NEXT:    xvmuldp vs1, v2, v4
+; PWR9BE-NEXT:    xvmuldp vs0, vs1, vs0
+; PWR9BE-NEXT:    xxswapd vs1, vs0
+; PWR9BE-NEXT:    xvmuldp vs1, vs0, vs1
+; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8f64_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvmuldp vs0, v3, v5
+; PWR10LE-NEXT:    xvmuldp vs1, v2, v4
+; PWR10LE-NEXT:    xvmuldp vs0, vs1, vs0
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xvmuldp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8f64_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvmuldp vs0, v3, v5
+; PWR10BE-NEXT:    xvmuldp vs1, v2, v4
+; PWR10BE-NEXT:    xvmuldp vs0, vs1, vs0
+; PWR10BE-NEXT:    xxswapd vs1, vs0
+; PWR10BE-NEXT:    xvmuldp vs1, vs0, vs1
+; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %a) ; 'fast' permits reassociation -> vector xvmuldp tree reduction in the checks above
+  ret double %0
+}
+
+define dso_local double @v16f64(<16 x double> %a) local_unnamed_addr #0 { ; ordered f64 product of 16 lanes, identity start 1.0
+; PWR9LE-LABEL: v16f64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs0, v2
+; PWR9LE-NEXT:    xxswapd vs1, v3
+; PWR9LE-NEXT:    xsmuldp f0, f0, v2
+; PWR9LE-NEXT:    xsmuldp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v4
+; PWR9LE-NEXT:    xsmuldp f0, f0, v3
+; PWR9LE-NEXT:    xsmuldp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v5
+; PWR9LE-NEXT:    xsmuldp f0, f0, v4
+; PWR9LE-NEXT:    xsmuldp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v6
+; PWR9LE-NEXT:    xsmuldp f0, f0, v5
+; PWR9LE-NEXT:    xsmuldp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v7
+; PWR9LE-NEXT:    xsmuldp f0, f0, v6
+; PWR9LE-NEXT:    xsmuldp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v8
+; PWR9LE-NEXT:    xsmuldp f0, f0, v7
+; PWR9LE-NEXT:    xsmuldp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v9
+; PWR9LE-NEXT:    xsmuldp f0, f0, v8
+; PWR9LE-NEXT:    xsmuldp f0, f0, f1
+; PWR9LE-NEXT:    xsmuldp f1, f0, v9
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16f64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd vs0, v2
+; PWR9BE-NEXT:    xxswapd vs1, v3
+; PWR9BE-NEXT:    xsmuldp f0, v2, f0
+; PWR9BE-NEXT:    xsmuldp f0, f0, v3
+; PWR9BE-NEXT:    xsmuldp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v4
+; PWR9BE-NEXT:    xsmuldp f0, f0, v4
+; PWR9BE-NEXT:    xsmuldp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v5
+; PWR9BE-NEXT:    xsmuldp f0, f0, v5
+; PWR9BE-NEXT:    xsmuldp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v6
+; PWR9BE-NEXT:    xsmuldp f0, f0, v6
+; PWR9BE-NEXT:    xsmuldp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v7
+; PWR9BE-NEXT:    xsmuldp f0, f0, v7
+; PWR9BE-NEXT:    xsmuldp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v8
+; PWR9BE-NEXT:    xsmuldp f0, f0, v8
+; PWR9BE-NEXT:    xsmuldp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v9
+; PWR9BE-NEXT:    xsmuldp f0, f0, v9
+; PWR9BE-NEXT:    xsmuldp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16f64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs0, v2
+; PWR10LE-NEXT:    xxswapd vs1, v3
+; PWR10LE-NEXT:    xsmuldp f0, f0, v2
+; PWR10LE-NEXT:    xsmuldp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v4
+; PWR10LE-NEXT:    xsmuldp f0, f0, v3
+; PWR10LE-NEXT:    xsmuldp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v5
+; PWR10LE-NEXT:    xsmuldp f0, f0, v4
+; PWR10LE-NEXT:    xsmuldp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v6
+; PWR10LE-NEXT:    xsmuldp f0, f0, v5
+; PWR10LE-NEXT:    xsmuldp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v7
+; PWR10LE-NEXT:    xsmuldp f0, f0, v6
+; PWR10LE-NEXT:    xsmuldp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v8
+; PWR10LE-NEXT:    xsmuldp f0, f0, v7
+; PWR10LE-NEXT:    xsmuldp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v9
+; PWR10LE-NEXT:    xsmuldp f0, f0, v8
+; PWR10LE-NEXT:    xsmuldp f0, f0, f1
+; PWR10LE-NEXT:    xsmuldp f1, f0, v9
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16f64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd vs0, v2
+; PWR10BE-NEXT:    xxswapd vs1, v3
+; PWR10BE-NEXT:    xsmuldp f0, v2, f0
+; PWR10BE-NEXT:    xsmuldp f0, f0, v3
+; PWR10BE-NEXT:    xsmuldp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v4
+; PWR10BE-NEXT:    xsmuldp f0, f0, v4
+; PWR10BE-NEXT:    xsmuldp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v5
+; PWR10BE-NEXT:    xsmuldp f0, f0, v5
+; PWR10BE-NEXT:    xsmuldp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v6
+; PWR10BE-NEXT:    xsmuldp f0, f0, v6
+; PWR10BE-NEXT:    xsmuldp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v7
+; PWR10BE-NEXT:    xsmuldp f0, f0, v7
+; PWR10BE-NEXT:    xsmuldp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v8
+; PWR10BE-NEXT:    xsmuldp f0, f0, v8
+; PWR10BE-NEXT:    xsmuldp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v9
+; PWR10BE-NEXT:    xsmuldp f0, f0, v9
+; PWR10BE-NEXT:    xsmuldp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fmul.v16f64(double 1.000000e+00, <16 x double> %a) ; no fast-math flags -> sequential scalar xsmuldp chain in the checks above
+  ret double %0
+}
+
+define dso_local double @v16f64_b(<16 x double> %a, double %b) local_unnamed_addr #0 { ; ordered f64 product of 16 lanes with caller-supplied start %b
+; PWR9LE-LABEL: v16f64_b:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd vs0, v2
+; PWR9LE-NEXT:    xsmuldp f0, f1, f0
+; PWR9LE-NEXT:    xxswapd vs1, v3
+; PWR9LE-NEXT:    xsmuldp f0, f0, v2
+; PWR9LE-NEXT:    xsmuldp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v4
+; PWR9LE-NEXT:    xsmuldp f0, f0, v3
+; PWR9LE-NEXT:    xsmuldp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v5
+; PWR9LE-NEXT:    xsmuldp f0, f0, v4
+; PWR9LE-NEXT:    xsmuldp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v6
+; PWR9LE-NEXT:    xsmuldp f0, f0, v5
+; PWR9LE-NEXT:    xsmuldp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v7
+; PWR9LE-NEXT:    xsmuldp f0, f0, v6
+; PWR9LE-NEXT:    xsmuldp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v8
+; PWR9LE-NEXT:    xsmuldp f0, f0, v7
+; PWR9LE-NEXT:    xsmuldp f0, f0, f1
+; PWR9LE-NEXT:    xxswapd vs1, v9
+; PWR9LE-NEXT:    xsmuldp f0, f0, v8
+; PWR9LE-NEXT:    xsmuldp f0, f0, f1
+; PWR9LE-NEXT:    xsmuldp f1, f0, v9
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16f64_b:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xsmuldp f0, f1, v2
+; PWR9BE-NEXT:    xxswapd vs1, v2
+; PWR9BE-NEXT:    xsmuldp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v3
+; PWR9BE-NEXT:    xsmuldp f0, f0, v3
+; PWR9BE-NEXT:    xsmuldp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v4
+; PWR9BE-NEXT:    xsmuldp f0, f0, v4
+; PWR9BE-NEXT:    xsmuldp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v5
+; PWR9BE-NEXT:    xsmuldp f0, f0, v5
+; PWR9BE-NEXT:    xsmuldp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v6
+; PWR9BE-NEXT:    xsmuldp f0, f0, v6
+; PWR9BE-NEXT:    xsmuldp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v7
+; PWR9BE-NEXT:    xsmuldp f0, f0, v7
+; PWR9BE-NEXT:    xsmuldp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v8
+; PWR9BE-NEXT:    xsmuldp f0, f0, v8
+; PWR9BE-NEXT:    xsmuldp f0, f0, f1
+; PWR9BE-NEXT:    xxswapd vs1, v9
+; PWR9BE-NEXT:    xsmuldp f0, f0, v9
+; PWR9BE-NEXT:    xsmuldp f1, f0, f1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16f64_b:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd vs0, v2
+; PWR10LE-NEXT:    xsmuldp f0, f1, f0
+; PWR10LE-NEXT:    xxswapd vs1, v3
+; PWR10LE-NEXT:    xsmuldp f0, f0, v2
+; PWR10LE-NEXT:    xsmuldp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v4
+; PWR10LE-NEXT:    xsmuldp f0, f0, v3
+; PWR10LE-NEXT:    xsmuldp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v5
+; PWR10LE-NEXT:    xsmuldp f0, f0, v4
+; PWR10LE-NEXT:    xsmuldp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v6
+; PWR10LE-NEXT:    xsmuldp f0, f0, v5
+; PWR10LE-NEXT:    xsmuldp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v7
+; PWR10LE-NEXT:    xsmuldp f0, f0, v6
+; PWR10LE-NEXT:    xsmuldp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v8
+; PWR10LE-NEXT:    xsmuldp f0, f0, v7
+; PWR10LE-NEXT:    xsmuldp f0, f0, f1
+; PWR10LE-NEXT:    xxswapd vs1, v9
+; PWR10LE-NEXT:    xsmuldp f0, f0, v8
+; PWR10LE-NEXT:    xsmuldp f0, f0, f1
+; PWR10LE-NEXT:    xsmuldp f1, f0, v9
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16f64_b:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xsmuldp f0, f1, v2
+; PWR10BE-NEXT:    xxswapd vs1, v2
+; PWR10BE-NEXT:    xsmuldp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v3
+; PWR10BE-NEXT:    xsmuldp f0, f0, v3
+; PWR10BE-NEXT:    xsmuldp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v4
+; PWR10BE-NEXT:    xsmuldp f0, f0, v4
+; PWR10BE-NEXT:    xsmuldp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v5
+; PWR10BE-NEXT:    xsmuldp f0, f0, v5
+; PWR10BE-NEXT:    xsmuldp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v6
+; PWR10BE-NEXT:    xsmuldp f0, f0, v6
+; PWR10BE-NEXT:    xsmuldp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v7
+; PWR10BE-NEXT:    xsmuldp f0, f0, v7
+; PWR10BE-NEXT:    xsmuldp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v8
+; PWR10BE-NEXT:    xsmuldp f0, f0, v8
+; PWR10BE-NEXT:    xsmuldp f0, f0, f1
+; PWR10BE-NEXT:    xxswapd vs1, v9
+; PWR10BE-NEXT:    xsmuldp f0, f0, v9
+; PWR10BE-NEXT:    xsmuldp f1, f0, f1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call double @llvm.vector.reduce.fmul.v16f64(double %b, <16 x double> %a) ; start value %b (f1) is multiplied in first; no FMF so order is fixed
+  ret double %0
+}
+
+define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 { ; fast-math f64 product of 16 lanes, identity start 1.0
+; PWR9LE-LABEL: v16f64_fast:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xvmuldp vs0, v4, v8
+; PWR9LE-NEXT:    xvmuldp vs1, v2, v6
+; PWR9LE-NEXT:    xvmuldp vs2, v5, v9
+; PWR9LE-NEXT:    xvmuldp vs3, v3, v7
+; PWR9LE-NEXT:    xvmuldp vs2, vs3, vs2
+; PWR9LE-NEXT:    xvmuldp vs0, vs1, vs0
+; PWR9LE-NEXT:    xvmuldp vs0, vs0, vs2
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    xvmuldp vs0, vs0, vs1
+; PWR9LE-NEXT:    xxswapd vs1, vs0
+; PWR9LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16f64_fast:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xvmuldp vs0, v4, v8
+; PWR9BE-NEXT:    xvmuldp vs1, v2, v6
+; PWR9BE-NEXT:    xvmuldp vs2, v5, v9
+; PWR9BE-NEXT:    xvmuldp vs3, v3, v7
+; PWR9BE-NEXT:    xvmuldp vs2, vs3, vs2
+; PWR9BE-NEXT:    xvmuldp vs0, vs1, vs0
+; PWR9BE-NEXT:    xvmuldp vs0, vs0, vs2
+; PWR9BE-NEXT:    xxswapd vs1, vs0
+; PWR9BE-NEXT:    xvmuldp vs1, vs0, vs1
+; PWR9BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16f64_fast:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xvmuldp vs0, v4, v8
+; PWR10LE-NEXT:    xvmuldp vs1, v2, v6
+; PWR10LE-NEXT:    xvmuldp vs2, v5, v9
+; PWR10LE-NEXT:    xvmuldp vs3, v3, v7
+; PWR10LE-NEXT:    xvmuldp vs2, vs3, vs2
+; PWR10LE-NEXT:    xvmuldp vs0, vs1, vs0
+; PWR10LE-NEXT:    xvmuldp vs0, vs0, vs2
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    xvmuldp vs0, vs0, vs1
+; PWR10LE-NEXT:    xxswapd vs1, vs0
+; PWR10LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16f64_fast:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xvmuldp vs0, v4, v8
+; PWR10BE-NEXT:    xvmuldp vs1, v2, v6
+; PWR10BE-NEXT:    xvmuldp vs2, v5, v9
+; PWR10BE-NEXT:    xvmuldp vs3, v3, v7
+; PWR10BE-NEXT:    xvmuldp vs2, vs3, vs2
+; PWR10BE-NEXT:    xvmuldp vs0, vs1, vs0
+; PWR10BE-NEXT:    xvmuldp vs0, vs0, vs2
+; PWR10BE-NEXT:    xxswapd vs1, vs0
+; PWR10BE-NEXT:    xvmuldp vs1, vs0, vs1
+; PWR10BE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call fast double @llvm.vector.reduce.fmul.v16f64(double 1.000000e+00, <16 x double> %a) ; 'fast' permits reassociation -> vector xvmuldp tree reduction in the checks above
+  ret double %0
+}
+
+declare double @llvm.vector.reduce.fmul.v2f64(double, <2 x double>) #0
+declare double @llvm.vector.reduce.fmul.v4f64(double, <4 x double>) #0
+declare double @llvm.vector.reduce.fmul.v8f64(double, <8 x double>) #0
+declare double @llvm.vector.reduce.fmul.v16f64(double, <16 x double>) #0
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-mul.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-mul.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-mul.ll
@@ -0,0 +1,204 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr10 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR10LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr10 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR10BE
+
+define dso_local i32 @v2i32(<2 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmuluwm v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmuluwm v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vmuluwm v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vmuluwm v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %a)
+  ret i32 %0
+}
+
+define dso_local i32 @v4i32(<4 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmuluwm v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vmuluwm v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmuluwm v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vmuluwm v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vmuluwm v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vmuluwm v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vmuluwm v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vmuluwm v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a)
+  ret i32 %0
+}
+
+define dso_local i32 @v8i32(<8 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vmuluwm v2, v2, v3
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vmuluwm v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vmuluwm v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vmuluwm v2, v2, v3
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vmuluwm v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vmuluwm v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vmuluwm v2, v2, v3
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vmuluwm v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vmuluwm v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vmuluwm v2, v2, v3
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vmuluwm v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vmuluwm v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %a)
+  ret i32 %0
+}
+
+define dso_local i32 @v16i32(<16 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vmuluwm v3, v3, v5
+; PWR9LE-NEXT:    vmuluwm v2, v2, v4
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmuluwm v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vmuluwm v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vmuluwm v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vmuluwm v3, v3, v5
+; PWR9BE-NEXT:    vmuluwm v2, v2, v4
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmuluwm v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vmuluwm v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vmuluwm v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vmuluwm v3, v3, v5
+; PWR10LE-NEXT:    vmuluwm v2, v2, v4
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vmuluwm v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vmuluwm v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vmuluwm v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vmuluwm v3, v3, v5
+; PWR10BE-NEXT:    vmuluwm v2, v2, v4
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vmuluwm v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vmuluwm v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vmuluwm v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %a)
+  ret i32 %0
+}
+
+declare i32 @llvm.vector.reduce.mul.v2i32(<2 x i32>) #0
+declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>) #0
+declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32>) #0
+declare i32 @llvm.vector.reduce.mul.v16i32(<16 x i32>) #0
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-or.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-or.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-or.ll
@@ -0,0 +1,392 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr10 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR10LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr10 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR10BE
+
+;;
+;; Vectors of type i32
+;;
+define dso_local i32 @v2i32(<2 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxspltw vs0, v2, 2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    xxlor v2, v2, vs0
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxspltw vs0, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    xxlor v2, v2, vs0
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxspltw vs0, v2, 2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    xxlor v2, v2, vs0
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxspltw vs0, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    xxlor v2, v2, vs0
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a)
+  ret i32 %0
+}
+
+define dso_local i32 @v4i32(<4 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    xxlor vs0, v2, v3
+; PWR9LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR9LE-NEXT:    xxlor v2, vs0, vs1
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    xxlor vs0, v2, v3
+; PWR9BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR9BE-NEXT:    xxlor v2, vs0, vs1
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    xxlor vs0, v2, v3
+; PWR10LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR10LE-NEXT:    xxlor v2, vs0, vs1
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    xxlor vs0, v2, v3
+; PWR10BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR10BE-NEXT:    xxlor v2, vs0, vs1
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
+  ret i32 %0
+}
+
+define dso_local i32 @v8i32(<8 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxlor vs0, v2, v3
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xxlor vs0, vs0, v2
+; PWR9LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR9LE-NEXT:    xxlor v2, vs0, vs1
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxlor vs0, v2, v3
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xxlor vs0, vs0, v2
+; PWR9BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR9BE-NEXT:    xxlor v2, vs0, vs1
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxlor vs0, v2, v3
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    xxswapd v2, vs0
+; PWR10LE-NEXT:    xxlor vs0, vs0, v2
+; PWR10LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR10LE-NEXT:    xxlor v2, vs0, vs1
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxlor vs0, v2, v3
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    xxswapd v2, vs0
+; PWR10BE-NEXT:    xxlor vs0, vs0, v2
+; PWR10BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR10BE-NEXT:    xxlor v2, vs0, vs1
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a)
+  ret i32 %0
+}
+
+define dso_local i32 @v16i32(<16 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxlor vs0, v3, v5
+; PWR9LE-NEXT:    xxlor vs1, v2, v4
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    xxlor vs0, vs1, vs0
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xxlor vs0, vs0, v2
+; PWR9LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR9LE-NEXT:    xxlor v2, vs0, vs1
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxlor vs0, v3, v5
+; PWR9BE-NEXT:    xxlor vs1, v2, v4
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    xxlor vs0, vs1, vs0
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xxlor vs0, vs0, v2
+; PWR9BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR9BE-NEXT:    xxlor v2, vs0, vs1
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxlor vs0, v3, v5
+; PWR10LE-NEXT:    xxlor vs1, v2, v4
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    xxlor vs0, vs1, vs0
+; PWR10LE-NEXT:    xxswapd v2, vs0
+; PWR10LE-NEXT:    xxlor vs0, vs0, v2
+; PWR10LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR10LE-NEXT:    xxlor v2, vs0, vs1
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxlor vs0, v3, v5
+; PWR10BE-NEXT:    xxlor vs1, v2, v4
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    xxlor vs0, vs1, vs0
+; PWR10BE-NEXT:    xxswapd v2, vs0
+; PWR10BE-NEXT:    xxlor vs0, vs0, v2
+; PWR10BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR10BE-NEXT:    xxlor v2, vs0, vs1
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %a)
+  ret i32 %0
+}
+
+declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>) #0
+declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>) #0
+declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>) #0
+declare i32 @llvm.vector.reduce.or.v16i32(<16 x i32>) #0
+
+;;
+;; Vectors of type i64
+;;
+define dso_local i64 @v2i64(<2 x i64> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    xxlor vs0, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    xxlor vs0, v2, v3
+; PWR9BE-NEXT:    mffprd r3, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    xxlor vs0, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    xxlor vs0, v2, v3
+; PWR10BE-NEXT:    mffprd r3, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a)
+  ret i64 %0
+}
+
+define dso_local i64 @v4i64(<4 x i64> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxlor vs0, v2, v3
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xxlor vs0, vs0, v2
+; PWR9LE-NEXT:    mfvsrld r3, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxlor vs0, v2, v3
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xxlor vs0, vs0, v2
+; PWR9BE-NEXT:    mffprd r3, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxlor vs0, v2, v3
+; PWR10LE-NEXT:    xxswapd v2, vs0
+; PWR10LE-NEXT:    xxlor vs0, vs0, v2
+; PWR10LE-NEXT:    mfvsrld r3, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxlor vs0, v2, v3
+; PWR10BE-NEXT:    xxswapd v2, vs0
+; PWR10BE-NEXT:    xxlor vs0, vs0, v2
+; PWR10BE-NEXT:    mffprd r3, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %a)
+  ret i64 %0
+}
+
+define dso_local i64 @v8i64(<8 x i64> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxlor vs0, v3, v5
+; PWR9LE-NEXT:    xxlor vs1, v2, v4
+; PWR9LE-NEXT:    xxlor vs0, vs1, vs0
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xxlor vs0, vs0, v2
+; PWR9LE-NEXT:    mfvsrld r3, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxlor vs0, v3, v5
+; PWR9BE-NEXT:    xxlor vs1, v2, v4
+; PWR9BE-NEXT:    xxlor vs0, vs1, vs0
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xxlor vs0, vs0, v2
+; PWR9BE-NEXT:    mffprd r3, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxlor vs0, v3, v5
+; PWR10LE-NEXT:    xxlor vs1, v2, v4
+; PWR10LE-NEXT:    xxlor vs0, vs1, vs0
+; PWR10LE-NEXT:    xxswapd v2, vs0
+; PWR10LE-NEXT:    xxlor vs0, vs0, v2
+; PWR10LE-NEXT:    mfvsrld r3, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxlor vs0, v3, v5
+; PWR10BE-NEXT:    xxlor vs1, v2, v4
+; PWR10BE-NEXT:    xxlor vs0, vs1, vs0
+; PWR10BE-NEXT:    xxswapd v2, vs0
+; PWR10BE-NEXT:    xxlor vs0, vs0, v2
+; PWR10BE-NEXT:    mffprd r3, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %a)
+  ret i64 %0
+}
+
+define dso_local i64 @v16i64(<16 x i64> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxlor vs0, v4, v8
+; PWR9LE-NEXT:    xxlor vs1, v2, v6
+; PWR9LE-NEXT:    xxlor vs2, v5, v9
+; PWR9LE-NEXT:    xxlor vs3, v3, v7
+; PWR9LE-NEXT:    xxlor vs2, vs3, vs2
+; PWR9LE-NEXT:    xxlor vs0, vs1, vs0
+; PWR9LE-NEXT:    xxlor vs0, vs0, vs2
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xxlor vs0, vs0, v2
+; PWR9LE-NEXT:    mfvsrld r3, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxlor vs0, v4, v8
+; PWR9BE-NEXT:    xxlor vs1, v2, v6
+; PWR9BE-NEXT:    xxlor vs2, v5, v9
+; PWR9BE-NEXT:    xxlor vs3, v3, v7
+; PWR9BE-NEXT:    xxlor vs2, vs3, vs2
+; PWR9BE-NEXT:    xxlor vs0, vs1, vs0
+; PWR9BE-NEXT:    xxlor vs0, vs0, vs2
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xxlor vs0, vs0, v2
+; PWR9BE-NEXT:    mffprd r3, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxlor vs0, v4, v8
+; PWR10LE-NEXT:    xxlor vs1, v2, v6
+; PWR10LE-NEXT:    xxlor vs2, v5, v9
+; PWR10LE-NEXT:    xxlor vs3, v3, v7
+; PWR10LE-NEXT:    xxlor vs2, vs3, vs2
+; PWR10LE-NEXT:    xxlor vs0, vs1, vs0
+; PWR10LE-NEXT:    xxlor vs0, vs0, vs2
+; PWR10LE-NEXT:    xxswapd v2, vs0
+; PWR10LE-NEXT:    xxlor vs0, vs0, v2
+; PWR10LE-NEXT:    mfvsrld r3, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxlor vs0, v4, v8
+; PWR10BE-NEXT:    xxlor vs1, v2, v6
+; PWR10BE-NEXT:    xxlor vs2, v5, v9
+; PWR10BE-NEXT:    xxlor vs3, v3, v7
+; PWR10BE-NEXT:    xxlor vs2, vs3, vs2
+; PWR10BE-NEXT:    xxlor vs0, vs1, vs0
+; PWR10BE-NEXT:    xxlor vs0, vs0, vs2
+; PWR10BE-NEXT:    xxswapd v2, vs0
+; PWR10BE-NEXT:    xxlor vs0, vs0, v2
+; PWR10BE-NEXT:    mffprd r3, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> %a)
+  ret i64 %0
+}
+
+declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>) #0
+declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>) #0
+declare i64 @llvm.vector.reduce.or.v8i64(<8 x i64>) #0
+declare i64 @llvm.vector.reduce.or.v16i64(<16 x i64>) #0
+
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-smax.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-smax.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-smax.ll
@@ -0,0 +1,796 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr10 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR10LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr10 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR10BE
+
+;;
+;; Vectors of type i8
+;;
+define dso_local i8 @v2i8(<2 x i8> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2i8:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vspltb v3, v2, 14
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmaxsb v2, v2, v3
+; PWR9LE-NEXT:    vextubrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i8:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vspltb v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmaxsb v2, v2, v3
+; PWR9BE-NEXT:    vextublx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i8:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vspltb v3, v2, 14
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vmaxsb v2, v2, v3
+; PWR10LE-NEXT:    vextubrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i8:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vspltb v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vmaxsb v2, v2, v3
+; PWR10BE-NEXT:    vextublx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> %a)
+  ret i8 %0
+}
+
+define dso_local i8 @v4i8(<4 x i8> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4i8:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmaxsb v2, v2, v3
+; PWR9LE-NEXT:    vspltb v3, v2, 14
+; PWR9LE-NEXT:    vmaxsb v2, v2, v3
+; PWR9LE-NEXT:    vextubrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i8:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmaxsb v2, v2, v3
+; PWR9BE-NEXT:    vspltb v3, v2, 1
+; PWR9BE-NEXT:    vmaxsb v2, v2, v3
+; PWR9BE-NEXT:    vextublx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i8:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vmaxsb v2, v2, v3
+; PWR10LE-NEXT:    vspltb v3, v2, 14
+; PWR10LE-NEXT:    vmaxsb v2, v2, v3
+; PWR10LE-NEXT:    vextubrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i8:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vmaxsb v2, v2, v3
+; PWR10BE-NEXT:    vspltb v3, v2, 1
+; PWR10BE-NEXT:    vmaxsb v2, v2, v3
+; PWR10BE-NEXT:    vextublx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> %a)
+  ret i8 %0
+}
+
+define dso_local i8 @v8i8(<8 x i8> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8i8:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmaxsb v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vmaxsb v2, v2, v3
+; PWR9LE-NEXT:    vspltb v3, v2, 14
+; PWR9LE-NEXT:    vmaxsb v2, v2, v3
+; PWR9LE-NEXT:    vextubrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i8:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmaxsb v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vmaxsb v2, v2, v3
+; PWR9BE-NEXT:    vspltb v3, v2, 1
+; PWR9BE-NEXT:    vmaxsb v2, v2, v3
+; PWR9BE-NEXT:    vextublx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i8:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vmaxsb v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vmaxsb v2, v2, v3
+; PWR10LE-NEXT:    vspltb v3, v2, 14
+; PWR10LE-NEXT:    vmaxsb v2, v2, v3
+; PWR10LE-NEXT:    vextubrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i8:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vmaxsb v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vmaxsb v2, v2, v3
+; PWR10BE-NEXT:    vspltb v3, v2, 1
+; PWR10BE-NEXT:    vmaxsb v2, v2, v3
+; PWR10BE-NEXT:    vextublx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a)
+  ret i8 %0
+}
+
+define dso_local i8 @v16i8(<16 x i8> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16i8:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmaxsb v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vmaxsb v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vmaxsb v2, v2, v3
+; PWR9LE-NEXT:    vspltb v3, v2, 14
+; PWR9LE-NEXT:    vmaxsb v2, v2, v3
+; PWR9LE-NEXT:    vextubrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i8:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmaxsb v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vmaxsb v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vmaxsb v2, v2, v3
+; PWR9BE-NEXT:    vspltb v3, v2, 1
+; PWR9BE-NEXT:    vmaxsb v2, v2, v3
+; PWR9BE-NEXT:    vextublx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i8:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vmaxsb v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vmaxsb v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vmaxsb v2, v2, v3
+; PWR10LE-NEXT:    vspltb v3, v2, 14
+; PWR10LE-NEXT:    vmaxsb v2, v2, v3
+; PWR10LE-NEXT:    vextubrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i8:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vmaxsb v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vmaxsb v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vmaxsb v2, v2, v3
+; PWR10BE-NEXT:    vspltb v3, v2, 1
+; PWR10BE-NEXT:    vmaxsb v2, v2, v3
+; PWR10BE-NEXT:    vextublx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a)
+  ret i8 %0
+}
+
+declare i8 @llvm.vector.reduce.smax.v2i8(<2 x i8>) #0
+declare i8 @llvm.vector.reduce.smax.v4i8(<4 x i8>) #0
+declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>) #0
+declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>) #0
+
+;;
+;; Vectors of type i16
+;;
+define dso_local i16 @v2i16(<2 x i16> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2i16:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmaxsh v2, v2, v3
+; PWR9LE-NEXT:    vextuhrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i16:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmaxsh v2, v2, v3
+; PWR9BE-NEXT:    vextuhlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i16:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vmaxsh v2, v2, v3
+; PWR10LE-NEXT:    vextuhrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i16:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vmaxsh v2, v2, v3
+; PWR10BE-NEXT:    vextuhlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> %a)
+  ret i16 %0
+}
+
+define dso_local i16 @v4i16(<4 x i16> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4i16:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmaxsh v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vmaxsh v2, v2, v3
+; PWR9LE-NEXT:    vextuhrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i16:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmaxsh v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vmaxsh v2, v2, v3
+; PWR9BE-NEXT:    vextuhlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i16:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vmaxsh v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vmaxsh v2, v2, v3
+; PWR10LE-NEXT:    vextuhrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i16:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vmaxsh v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vmaxsh v2, v2, v3
+; PWR10BE-NEXT:    vextuhlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a)
+  ret i16 %0
+}
+
+define dso_local i16 @v8i16(<8 x i16> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8i16:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmaxsh v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vmaxsh v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vmaxsh v2, v2, v3
+; PWR9LE-NEXT:    vextuhrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i16:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmaxsh v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vmaxsh v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vmaxsh v2, v2, v3
+; PWR9BE-NEXT:    vextuhlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i16:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vmaxsh v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vmaxsh v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vmaxsh v2, v2, v3
+; PWR10LE-NEXT:    vextuhrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i16:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vmaxsh v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vmaxsh v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vmaxsh v2, v2, v3
+; PWR10BE-NEXT:    vextuhlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a)
+  ret i16 %0
+}
+
+define dso_local i16 @v16i16(<16 x i16> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16i16:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vmaxsh v2, v2, v3
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vmaxsh v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vmaxsh v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vmaxsh v2, v2, v3
+; PWR9LE-NEXT:    vextuhrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i16:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vmaxsh v2, v2, v3
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vmaxsh v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vmaxsh v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vmaxsh v2, v2, v3
+; PWR9BE-NEXT:    vextuhlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i16:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vmaxsh v2, v2, v3
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vmaxsh v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vmaxsh v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vmaxsh v2, v2, v3
+; PWR10LE-NEXT:    vextuhrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i16:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vmaxsh v2, v2, v3
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vmaxsh v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vmaxsh v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vmaxsh v2, v2, v3
+; PWR10BE-NEXT:    vextuhlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %a) ; expect a vmaxsh shuffle-reduce tree; result halfword extracted via vextuhrx (LE) / vextuhlx (BE)
+  ret i16 %0
+}
+
+declare i16 @llvm.vector.reduce.smax.v2i16(<2 x i16>) #0
+declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>) #0
+declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>) #0
+declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>) #0
+
+;;
+;; Vectors of type i32
+;;
+define dso_local i32 @v2i32(<2 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmaxsw v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmaxsw v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vmaxsw v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vmaxsw v2, v2, v3
+; PWR10BE-NEXT:    vextuwrx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a) ; sub-register-width case: one splat + one vmaxsw, then word extract
+  ret i32 %0
+}
+
+define dso_local i32 @v4i32(<4 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmaxsw v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vmaxsw v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmaxsw v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vmaxsw v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vmaxsw v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vmaxsw v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vmaxsw v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vmaxsw v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a) ; full-register case: log2(4)=2 vmaxsw steps (xxswapd then xxspltw), then word extract
+  ret i32 %0
+}
+
+define dso_local i32 @v8i32(<8 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vmaxsw v2, v2, v3
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vmaxsw v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vmaxsw v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vmaxsw v2, v2, v3
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vmaxsw v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vmaxsw v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vmaxsw v2, v2, v3
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vmaxsw v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vmaxsw v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vmaxsw v2, v2, v3
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vmaxsw v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vmaxsw v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %a) ; two-register arg (v2/v3): one cross-register vmaxsw, then in-register reduce + extract
+  ret i32 %0
+}
+
+define dso_local i32 @v16i32(<16 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vmaxsw v3, v3, v5
+; PWR9LE-NEXT:    vmaxsw v2, v2, v4
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmaxsw v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vmaxsw v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vmaxsw v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vmaxsw v3, v3, v5
+; PWR9BE-NEXT:    vmaxsw v2, v2, v4
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmaxsw v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vmaxsw v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vmaxsw v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vmaxsw v3, v3, v5
+; PWR10LE-NEXT:    vmaxsw v2, v2, v4
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vmaxsw v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vmaxsw v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vmaxsw v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vmaxsw v3, v3, v5
+; PWR10BE-NEXT:    vmaxsw v2, v2, v4
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vmaxsw v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vmaxsw v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vmaxsw v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %a) ; four-register arg (v2..v5): pairwise vmaxsw tree, then in-register reduce + extract
+  ret i32 %0
+}
+
+declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>) #0
+declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) #0
+declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>) #0
+declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>) #0
+
+;;
+;; Vectors of type i64
+;;
+define dso_local i64 @v2i64(<2 x i64> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vmaxsd v2, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vmaxsd v2, v2, v3
+; PWR9BE-NEXT:    mfvsrd r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vmaxsd v2, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vmaxsd v2, v2, v3
+; PWR10BE-NEXT:    mfvsrd r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %a) ; i64 result leaves the VSR via mfvsrld (LE) / mfvsrd (BE) rather than an extract insn
+  ret i64 %0
+}
+
+define dso_local i64 @v4i64(<4 x i64> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vmaxsd v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vmaxsd v2, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vmaxsd v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vmaxsd v2, v2, v3
+; PWR9BE-NEXT:    mfvsrd r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vmaxsd v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vmaxsd v2, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vmaxsd v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vmaxsd v2, v2, v3
+; PWR10BE-NEXT:    mfvsrd r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %a) ; two-register arg: cross-register vmaxsd, halve in-register, move to r3
+  ret i64 %0
+}
+
+define dso_local i64 @v8i64(<8 x i64> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vmaxsd v2, v2, v4
+; PWR9LE-NEXT:    vmaxsd v3, v3, v5
+; PWR9LE-NEXT:    vmaxsd v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vmaxsd v2, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vmaxsd v2, v2, v4
+; PWR9BE-NEXT:    vmaxsd v3, v3, v5
+; PWR9BE-NEXT:    vmaxsd v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vmaxsd v2, v2, v3
+; PWR9BE-NEXT:    mfvsrd r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vmaxsd v2, v2, v4
+; PWR10LE-NEXT:    vmaxsd v3, v3, v5
+; PWR10LE-NEXT:    vmaxsd v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vmaxsd v2, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vmaxsd v2, v2, v4
+; PWR10BE-NEXT:    vmaxsd v3, v3, v5
+; PWR10BE-NEXT:    vmaxsd v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vmaxsd v2, v2, v3
+; PWR10BE-NEXT:    mfvsrd r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %a) ; four-register arg (v2..v5): pairwise vmaxsd tree before the in-register step
+  ret i64 %0
+}
+
+define dso_local i64 @v16i64(<16 x i64> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vmaxsd v3, v3, v7
+; PWR9LE-NEXT:    vmaxsd v5, v5, v9
+; PWR9LE-NEXT:    vmaxsd v2, v2, v6
+; PWR9LE-NEXT:    vmaxsd v4, v4, v8
+; PWR9LE-NEXT:    vmaxsd v2, v2, v4
+; PWR9LE-NEXT:    vmaxsd v3, v3, v5
+; PWR9LE-NEXT:    vmaxsd v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vmaxsd v2, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vmaxsd v3, v3, v7
+; PWR9BE-NEXT:    vmaxsd v5, v5, v9
+; PWR9BE-NEXT:    vmaxsd v2, v2, v6
+; PWR9BE-NEXT:    vmaxsd v4, v4, v8
+; PWR9BE-NEXT:    vmaxsd v2, v2, v4
+; PWR9BE-NEXT:    vmaxsd v3, v3, v5
+; PWR9BE-NEXT:    vmaxsd v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vmaxsd v2, v2, v3
+; PWR9BE-NEXT:    mfvsrd r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vmaxsd v3, v3, v7
+; PWR10LE-NEXT:    vmaxsd v5, v5, v9
+; PWR10LE-NEXT:    vmaxsd v2, v2, v6
+; PWR10LE-NEXT:    vmaxsd v4, v4, v8
+; PWR10LE-NEXT:    vmaxsd v2, v2, v4
+; PWR10LE-NEXT:    vmaxsd v3, v3, v5
+; PWR10LE-NEXT:    vmaxsd v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vmaxsd v2, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vmaxsd v3, v3, v7
+; PWR10BE-NEXT:    vmaxsd v5, v5, v9
+; PWR10BE-NEXT:    vmaxsd v2, v2, v6
+; PWR10BE-NEXT:    vmaxsd v4, v4, v8
+; PWR10BE-NEXT:    vmaxsd v2, v2, v4
+; PWR10BE-NEXT:    vmaxsd v3, v3, v5
+; PWR10BE-NEXT:    vmaxsd v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vmaxsd v2, v2, v3
+; PWR10BE-NEXT:    mfvsrd r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> %a) ; eight-register arg (v2..v9): three levels of pairwise vmaxsd, then in-register step
+  ret i64 %0
+}
+
+declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>) #0
+declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>) #0
+declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>) #0
+declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>) #0
+
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-smin.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-smin.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-smin.ll
@@ -0,0 +1,796 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr10 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR10LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr10 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR10BE
+
+;;
+;; Vectors of type i8
+;;
+define dso_local i8 @v2i8(<2 x i8> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2i8:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vspltb v3, v2, 14
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vminsb v2, v2, v3
+; PWR9LE-NEXT:    vextubrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i8:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vspltb v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vminsb v2, v2, v3
+; PWR9BE-NEXT:    vextublx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i8:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vspltb v3, v2, 14
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vminsb v2, v2, v3
+; PWR10LE-NEXT:    vextubrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i8:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vspltb v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vminsb v2, v2, v3
+; PWR10BE-NEXT:    vextublx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> %a) ; smallest case: one splat + one vminsb, byte 0 extracted via vextubrx/vextublx
+  ret i8 %0
+}
+
+define dso_local i8 @v4i8(<4 x i8> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4i8:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vminsb v2, v2, v3
+; PWR9LE-NEXT:    vspltb v3, v2, 14
+; PWR9LE-NEXT:    vminsb v2, v2, v3
+; PWR9LE-NEXT:    vextubrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i8:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vminsb v2, v2, v3
+; PWR9BE-NEXT:    vspltb v3, v2, 1
+; PWR9BE-NEXT:    vminsb v2, v2, v3
+; PWR9BE-NEXT:    vextublx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i8:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vminsb v2, v2, v3
+; PWR10LE-NEXT:    vspltb v3, v2, 14
+; PWR10LE-NEXT:    vminsb v2, v2, v3
+; PWR10LE-NEXT:    vextubrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i8:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vminsb v2, v2, v3
+; PWR10BE-NEXT:    vspltb v3, v2, 1
+; PWR10BE-NEXT:    vminsb v2, v2, v3
+; PWR10BE-NEXT:    vextublx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> %a) ; two vminsb steps (halfword splat, then byte splat), then byte extract
+  ret i8 %0
+}
+
+define dso_local i8 @v8i8(<8 x i8> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8i8:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vminsb v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vminsb v2, v2, v3
+; PWR9LE-NEXT:    vspltb v3, v2, 14
+; PWR9LE-NEXT:    vminsb v2, v2, v3
+; PWR9LE-NEXT:    vextubrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i8:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vminsb v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vminsb v2, v2, v3
+; PWR9BE-NEXT:    vspltb v3, v2, 1
+; PWR9BE-NEXT:    vminsb v2, v2, v3
+; PWR9BE-NEXT:    vextublx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i8:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vminsb v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vminsb v2, v2, v3
+; PWR10LE-NEXT:    vspltb v3, v2, 14
+; PWR10LE-NEXT:    vminsb v2, v2, v3
+; PWR10LE-NEXT:    vextubrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i8:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vminsb v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vminsb v2, v2, v3
+; PWR10BE-NEXT:    vspltb v3, v2, 1
+; PWR10BE-NEXT:    vminsb v2, v2, v3
+; PWR10BE-NEXT:    vextublx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a) ; three vminsb steps (word, halfword, byte splats), then byte extract
+  ret i8 %0
+}
+
+define dso_local i8 @v16i8(<16 x i8> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16i8:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vminsb v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vminsb v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vminsb v2, v2, v3
+; PWR9LE-NEXT:    vspltb v3, v2, 14
+; PWR9LE-NEXT:    vminsb v2, v2, v3
+; PWR9LE-NEXT:    vextubrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i8:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vminsb v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vminsb v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vminsb v2, v2, v3
+; PWR9BE-NEXT:    vspltb v3, v2, 1
+; PWR9BE-NEXT:    vminsb v2, v2, v3
+; PWR9BE-NEXT:    vextublx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i8:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vminsb v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vminsb v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vminsb v2, v2, v3
+; PWR10LE-NEXT:    vspltb v3, v2, 14
+; PWR10LE-NEXT:    vminsb v2, v2, v3
+; PWR10LE-NEXT:    vextubrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i8:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vminsb v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vminsb v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vminsb v2, v2, v3
+; PWR10BE-NEXT:    vspltb v3, v2, 1
+; PWR10BE-NEXT:    vminsb v2, v2, v3
+; PWR10BE-NEXT:    vextublx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a) ; full register: log2(16)=4 vminsb steps, then byte 0 extract
+  ret i8 %0
+}
+
+declare i8 @llvm.vector.reduce.smin.v2i8(<2 x i8>) #0
+declare i8 @llvm.vector.reduce.smin.v4i8(<4 x i8>) #0
+declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>) #0
+declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>) #0
+
+;;
+;; Vectors of type i16
+;;
+define dso_local i16 @v2i16(<2 x i16> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2i16:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vminsh v2, v2, v3
+; PWR9LE-NEXT:    vextuhrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i16:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vminsh v2, v2, v3
+; PWR9BE-NEXT:    vextuhlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i16:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vminsh v2, v2, v3
+; PWR10LE-NEXT:    vextuhrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i16:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vminsh v2, v2, v3
+; PWR10BE-NEXT:    vextuhlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> %a) ; one vsplth + vminsh, halfword 0 extracted via vextuhrx/vextuhlx
+  ret i16 %0
+}
+
+define dso_local i16 @v4i16(<4 x i16> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4i16:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vminsh v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vminsh v2, v2, v3
+; PWR9LE-NEXT:    vextuhrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i16:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vminsh v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vminsh v2, v2, v3
+; PWR9BE-NEXT:    vextuhlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i16:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vminsh v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vminsh v2, v2, v3
+; PWR10LE-NEXT:    vextuhrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i16:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vminsh v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vminsh v2, v2, v3
+; PWR10BE-NEXT:    vextuhlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a) ; two vminsh steps (word splat, then halfword splat), then extract
+  ret i16 %0
+}
+
+define dso_local i16 @v8i16(<8 x i16> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8i16:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vminsh v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vminsh v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vminsh v2, v2, v3
+; PWR9LE-NEXT:    vextuhrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i16:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vminsh v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vminsh v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vminsh v2, v2, v3
+; PWR9BE-NEXT:    vextuhlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i16:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vminsh v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vminsh v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vminsh v2, v2, v3
+; PWR10LE-NEXT:    vextuhrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i16:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vminsh v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vminsh v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vminsh v2, v2, v3
+; PWR10BE-NEXT:    vextuhlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a) ; full register: log2(8)=3 vminsh steps, then halfword 0 extract
+  ret i16 %0
+}
+
+define dso_local i16 @v16i16(<16 x i16> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16i16:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vminsh v2, v2, v3
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vminsh v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vminsh v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vminsh v2, v2, v3
+; PWR9LE-NEXT:    vextuhrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i16:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vminsh v2, v2, v3
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vminsh v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vminsh v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vminsh v2, v2, v3
+; PWR9BE-NEXT:    vextuhlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i16:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vminsh v2, v2, v3
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vminsh v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vminsh v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vminsh v2, v2, v3
+; PWR10LE-NEXT:    vextuhrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i16:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vminsh v2, v2, v3
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vminsh v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vminsh v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vminsh v2, v2, v3
+; PWR10BE-NEXT:    vextuhlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %a) ; two-register arg (v2/v3): cross-register vminsh first, then in-register reduce
+  ret i16 %0
+}
+
+declare i16 @llvm.vector.reduce.smin.v2i16(<2 x i16>) #0
+declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>) #0
+declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>) #0
+declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>) #0
+
+;;
+;; Vectors of type i32
+;;
+define dso_local i32 @v2i32(<2 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vminsw v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vminsw v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vminsw v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vminsw v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %a) ; one xxspltw + vminsw, then word extract
+  ret i32 %0
+}
+
+define dso_local i32 @v4i32(<4 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vminsw v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vminsw v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vminsw v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vminsw v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vminsw v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vminsw v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vminsw v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vminsw v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a) ; full register: xxswapd then xxspltw vminsw steps, then word extract
+  ret i32 %0
+}
+
+define dso_local i32 @v8i32(<8 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vminsw v2, v2, v3
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vminsw v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vminsw v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vminsw v2, v2, v3
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vminsw v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vminsw v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vminsw v2, v2, v3
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vminsw v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vminsw v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vminsw v2, v2, v3
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vminsw v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vminsw v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %a) ; two-register arg (v2/v3): cross-register vminsw first, then in-register reduce
+  ret i32 %0
+}
+
+define dso_local i32 @v16i32(<16 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vminsw v3, v3, v5
+; PWR9LE-NEXT:    vminsw v2, v2, v4
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vminsw v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vminsw v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vminsw v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vminsw v3, v3, v5
+; PWR9BE-NEXT:    vminsw v2, v2, v4
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vminsw v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vminsw v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vminsw v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vminsw v3, v3, v5
+; PWR10LE-NEXT:    vminsw v2, v2, v4
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vminsw v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vminsw v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vminsw v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vminsw v3, v3, v5
+; PWR10BE-NEXT:    vminsw v2, v2, v4
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vminsw v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vminsw v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vminsw v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %a) ; four-register arg (v2..v5): pairwise vminsw tree, then in-register reduce + extract
+  ret i32 %0
+}
+
+declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>) #0
+declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) #0
+declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>) #0
+declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>) #0
+
+;;
+;; Vectors of type i64
+;;
+define dso_local i64 @v2i64(<2 x i64> %a) local_unnamed_addr #0 { ; smin reduce of <2 x i64>: swap halves (xxswapd), vminsd, move low (LE, mfvsrld) or high (BE, mfvsrd) doubleword to r3
+; PWR9LE-LABEL: v2i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vminsd v2, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vminsd v2, v2, v3
+; PWR9BE-NEXT:    mfvsrd r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vminsd v2, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vminsd v2, v2, v3
+; PWR10BE-NEXT:    mfvsrd r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %a)
+  ret i64 %0
+}
+
+define dso_local i64 @v4i64(<4 x i64> %a) local_unnamed_addr #0 { ; smin reduce of <4 x i64>: vminsd the two input registers, then fold halves with xxswapd + vminsd and move the result to r3
+; PWR9LE-LABEL: v4i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vminsd v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vminsd v2, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vminsd v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vminsd v2, v2, v3
+; PWR9BE-NEXT:    mfvsrd r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vminsd v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vminsd v2, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vminsd v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vminsd v2, v2, v3
+; PWR10BE-NEXT:    mfvsrd r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %a)
+  ret i64 %0
+}
+
+define dso_local i64 @v8i64(<8 x i64> %a) local_unnamed_addr #0 { ; smin reduce of <8 x i64>: pairwise vminsd tree over 4 input registers, then xxswapd half-fold and move to r3
+; PWR9LE-LABEL: v8i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vminsd v2, v2, v4
+; PWR9LE-NEXT:    vminsd v3, v3, v5
+; PWR9LE-NEXT:    vminsd v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vminsd v2, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vminsd v2, v2, v4
+; PWR9BE-NEXT:    vminsd v3, v3, v5
+; PWR9BE-NEXT:    vminsd v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vminsd v2, v2, v3
+; PWR9BE-NEXT:    mfvsrd r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vminsd v2, v2, v4
+; PWR10LE-NEXT:    vminsd v3, v3, v5
+; PWR10LE-NEXT:    vminsd v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vminsd v2, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vminsd v2, v2, v4
+; PWR10BE-NEXT:    vminsd v3, v3, v5
+; PWR10BE-NEXT:    vminsd v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vminsd v2, v2, v3
+; PWR10BE-NEXT:    mfvsrd r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %a)
+  ret i64 %0
+}
+
+define dso_local i64 @v16i64(<16 x i64> %a) local_unnamed_addr #0 { ; smin reduce of <16 x i64>: three-level pairwise vminsd tree over 8 input registers, then xxswapd half-fold and move to r3
+; PWR9LE-LABEL: v16i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vminsd v3, v3, v7
+; PWR9LE-NEXT:    vminsd v5, v5, v9
+; PWR9LE-NEXT:    vminsd v2, v2, v6
+; PWR9LE-NEXT:    vminsd v4, v4, v8
+; PWR9LE-NEXT:    vminsd v2, v2, v4
+; PWR9LE-NEXT:    vminsd v3, v3, v5
+; PWR9LE-NEXT:    vminsd v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vminsd v2, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vminsd v3, v3, v7
+; PWR9BE-NEXT:    vminsd v5, v5, v9
+; PWR9BE-NEXT:    vminsd v2, v2, v6
+; PWR9BE-NEXT:    vminsd v4, v4, v8
+; PWR9BE-NEXT:    vminsd v2, v2, v4
+; PWR9BE-NEXT:    vminsd v3, v3, v5
+; PWR9BE-NEXT:    vminsd v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vminsd v2, v2, v3
+; PWR9BE-NEXT:    mfvsrd r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vminsd v3, v3, v7
+; PWR10LE-NEXT:    vminsd v5, v5, v9
+; PWR10LE-NEXT:    vminsd v2, v2, v6
+; PWR10LE-NEXT:    vminsd v4, v4, v8
+; PWR10LE-NEXT:    vminsd v2, v2, v4
+; PWR10LE-NEXT:    vminsd v3, v3, v5
+; PWR10LE-NEXT:    vminsd v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vminsd v2, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vminsd v3, v3, v7
+; PWR10BE-NEXT:    vminsd v5, v5, v9
+; PWR10BE-NEXT:    vminsd v2, v2, v6
+; PWR10BE-NEXT:    vminsd v4, v4, v8
+; PWR10BE-NEXT:    vminsd v2, v2, v4
+; PWR10BE-NEXT:    vminsd v3, v3, v5
+; PWR10BE-NEXT:    vminsd v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vminsd v2, v2, v3
+; PWR10BE-NEXT:    mfvsrd r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> %a)
+  ret i64 %0
+}
+
+declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>) #0
+declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>) #0
+declare i64 @llvm.vector.reduce.smin.v8i64(<8 x i64>) #0
+declare i64 @llvm.vector.reduce.smin.v16i64(<16 x i64>) #0
+
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-umax.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-umax.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-umax.ll
@@ -0,0 +1,796 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr10 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR10LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr10 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR10BE
+
+;;
+;; Vectors of type i8
+;;
+define dso_local i8 @v2i8(<2 x i8> %a) local_unnamed_addr #0 { ; umax reduce of <2 x i8>: splat the second element's byte lane (14 LE / 1 BE), vmaxub, extract byte 0 via vextubrx/vextublx
+; PWR9LE-LABEL: v2i8:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vspltb v3, v2, 14
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmaxub v2, v2, v3
+; PWR9LE-NEXT:    vextubrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i8:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vspltb v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmaxub v2, v2, v3
+; PWR9BE-NEXT:    vextublx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i8:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vspltb v3, v2, 14
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vmaxub v2, v2, v3
+; PWR10LE-NEXT:    vextubrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i8:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vspltb v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vmaxub v2, v2, v3
+; PWR10BE-NEXT:    vextublx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> %a)
+  ret i8 %0
+}
+
+define dso_local i8 @v4i8(<4 x i8> %a) local_unnamed_addr #0 { ; umax reduce of <4 x i8>: fold halfword pair then byte pair (vsplth/vspltb + vmaxub), extract byte 0
+; PWR9LE-LABEL: v4i8:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmaxub v2, v2, v3
+; PWR9LE-NEXT:    vspltb v3, v2, 14
+; PWR9LE-NEXT:    vmaxub v2, v2, v3
+; PWR9LE-NEXT:    vextubrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i8:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmaxub v2, v2, v3
+; PWR9BE-NEXT:    vspltb v3, v2, 1
+; PWR9BE-NEXT:    vmaxub v2, v2, v3
+; PWR9BE-NEXT:    vextublx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i8:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vmaxub v2, v2, v3
+; PWR10LE-NEXT:    vspltb v3, v2, 14
+; PWR10LE-NEXT:    vmaxub v2, v2, v3
+; PWR10LE-NEXT:    vextubrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i8:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vmaxub v2, v2, v3
+; PWR10BE-NEXT:    vspltb v3, v2, 1
+; PWR10BE-NEXT:    vmaxub v2, v2, v3
+; PWR10BE-NEXT:    vextublx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> %a)
+  ret i8 %0
+}
+
+define dso_local i8 @v8i8(<8 x i8> %a) local_unnamed_addr #0 { ; umax reduce of <8 x i8>: log2 fold — word (xxspltw), halfword (vsplth), byte (vspltb) splats each followed by vmaxub; extract byte 0
+; PWR9LE-LABEL: v8i8:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmaxub v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vmaxub v2, v2, v3
+; PWR9LE-NEXT:    vspltb v3, v2, 14
+; PWR9LE-NEXT:    vmaxub v2, v2, v3
+; PWR9LE-NEXT:    vextubrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i8:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmaxub v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vmaxub v2, v2, v3
+; PWR9BE-NEXT:    vspltb v3, v2, 1
+; PWR9BE-NEXT:    vmaxub v2, v2, v3
+; PWR9BE-NEXT:    vextublx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i8:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vmaxub v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vmaxub v2, v2, v3
+; PWR10LE-NEXT:    vspltb v3, v2, 14
+; PWR10LE-NEXT:    vmaxub v2, v2, v3
+; PWR10LE-NEXT:    vextubrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i8:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vmaxub v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vmaxub v2, v2, v3
+; PWR10BE-NEXT:    vspltb v3, v2, 1
+; PWR10BE-NEXT:    vmaxub v2, v2, v3
+; PWR10BE-NEXT:    vextublx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a)
+  ret i8 %0
+}
+
+define dso_local i8 @v16i8(<16 x i8> %a) local_unnamed_addr #0 { ; umax reduce of full <16 x i8> register: doubleword (xxswapd), word, halfword, byte folds each via vmaxub; extract byte 0
+; PWR9LE-LABEL: v16i8:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmaxub v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vmaxub v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vmaxub v2, v2, v3
+; PWR9LE-NEXT:    vspltb v3, v2, 14
+; PWR9LE-NEXT:    vmaxub v2, v2, v3
+; PWR9LE-NEXT:    vextubrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i8:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmaxub v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vmaxub v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vmaxub v2, v2, v3
+; PWR9BE-NEXT:    vspltb v3, v2, 1
+; PWR9BE-NEXT:    vmaxub v2, v2, v3
+; PWR9BE-NEXT:    vextublx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i8:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vmaxub v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vmaxub v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vmaxub v2, v2, v3
+; PWR10LE-NEXT:    vspltb v3, v2, 14
+; PWR10LE-NEXT:    vmaxub v2, v2, v3
+; PWR10LE-NEXT:    vextubrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i8:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vmaxub v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vmaxub v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vmaxub v2, v2, v3
+; PWR10BE-NEXT:    vspltb v3, v2, 1
+; PWR10BE-NEXT:    vmaxub v2, v2, v3
+; PWR10BE-NEXT:    vextublx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a)
+  ret i8 %0
+}
+
+declare i8 @llvm.vector.reduce.umax.v2i8(<2 x i8>) #0
+declare i8 @llvm.vector.reduce.umax.v4i8(<4 x i8>) #0
+declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>) #0
+declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>) #0
+
+;;
+;; Vectors of type i16
+;;
+define dso_local i16 @v2i16(<2 x i16> %a) local_unnamed_addr #0 { ; umax reduce of <2 x i16>: splat second halfword lane (6 LE / 1 BE), vmaxuh, extract halfword 0 via vextuhrx/vextuhlx
+; PWR9LE-LABEL: v2i16:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmaxuh v2, v2, v3
+; PWR9LE-NEXT:    vextuhrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i16:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmaxuh v2, v2, v3
+; PWR9BE-NEXT:    vextuhlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i16:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vmaxuh v2, v2, v3
+; PWR10LE-NEXT:    vextuhrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i16:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vmaxuh v2, v2, v3
+; PWR10BE-NEXT:    vextuhlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> %a)
+  ret i16 %0
+}
+
+define dso_local i16 @v4i16(<4 x i16> %a) local_unnamed_addr #0 { ; umax reduce of <4 x i16>: word fold (xxspltw) then halfword fold (vsplth), each via vmaxuh; extract halfword 0
+; PWR9LE-LABEL: v4i16:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmaxuh v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vmaxuh v2, v2, v3
+; PWR9LE-NEXT:    vextuhrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i16:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmaxuh v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vmaxuh v2, v2, v3
+; PWR9BE-NEXT:    vextuhlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i16:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vmaxuh v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vmaxuh v2, v2, v3
+; PWR10LE-NEXT:    vextuhrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i16:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vmaxuh v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vmaxuh v2, v2, v3
+; PWR10BE-NEXT:    vextuhlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a)
+  ret i16 %0
+}
+
+define dso_local i16 @v8i16(<8 x i16> %a) local_unnamed_addr #0 { ; umax reduce of full <8 x i16> register: doubleword, word, halfword folds each via vmaxuh; extract halfword 0
+; PWR9LE-LABEL: v8i16:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmaxuh v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vmaxuh v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vmaxuh v2, v2, v3
+; PWR9LE-NEXT:    vextuhrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i16:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmaxuh v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vmaxuh v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vmaxuh v2, v2, v3
+; PWR9BE-NEXT:    vextuhlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i16:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vmaxuh v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vmaxuh v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vmaxuh v2, v2, v3
+; PWR10LE-NEXT:    vextuhrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i16:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vmaxuh v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vmaxuh v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vmaxuh v2, v2, v3
+; PWR10BE-NEXT:    vextuhlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a)
+  ret i16 %0
+}
+
+define dso_local i16 @v16i16(<16 x i16> %a) local_unnamed_addr #0 { ; umax reduce of <16 x i16>: vmaxuh the two input registers first, then in-register doubleword/word/halfword folds; extract halfword 0
+; PWR9LE-LABEL: v16i16:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vmaxuh v2, v2, v3
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vmaxuh v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vmaxuh v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vmaxuh v2, v2, v3
+; PWR9LE-NEXT:    vextuhrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i16:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vmaxuh v2, v2, v3
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vmaxuh v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vmaxuh v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vmaxuh v2, v2, v3
+; PWR9BE-NEXT:    vextuhlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i16:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vmaxuh v2, v2, v3
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vmaxuh v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vmaxuh v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vmaxuh v2, v2, v3
+; PWR10LE-NEXT:    vextuhrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i16:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vmaxuh v2, v2, v3
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vmaxuh v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vmaxuh v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vmaxuh v2, v2, v3
+; PWR10BE-NEXT:    vextuhlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %a)
+  ret i16 %0
+}
+
+declare i16 @llvm.vector.reduce.umax.v2i16(<2 x i16>) #0
+declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>) #0
+declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>) #0
+declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>) #0
+
+;;
+;; Vectors of type i32
+;;
+define dso_local i32 @v2i32(<2 x i32> %a) local_unnamed_addr #0 { ; umax reduce of <2 x i32>: splat second word lane (2 LE / 1 BE), vmaxuw, extract word 0 via vextuwrx/vextuwlx
+; PWR9LE-LABEL: v2i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmaxuw v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmaxuw v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vmaxuw v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vmaxuw v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a)
+  ret i32 %0
+}
+
+define dso_local i32 @v4i32(<4 x i32> %a) local_unnamed_addr #0 { ; umax reduce of full <4 x i32> register: doubleword fold (xxswapd) then word fold (xxspltw), each via vmaxuw; extract word 0
+; PWR9LE-LABEL: v4i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmaxuw v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vmaxuw v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmaxuw v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vmaxuw v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vmaxuw v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vmaxuw v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vmaxuw v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vmaxuw v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a)
+  ret i32 %0
+}
+
+define dso_local i32 @v8i32(<8 x i32> %a) local_unnamed_addr #0 { ; umax reduce of <8 x i32>: vmaxuw the two input registers, then doubleword and word folds; extract word 0
+; PWR9LE-LABEL: v8i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vmaxuw v2, v2, v3
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vmaxuw v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vmaxuw v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vmaxuw v2, v2, v3
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vmaxuw v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vmaxuw v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vmaxuw v2, v2, v3
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vmaxuw v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vmaxuw v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vmaxuw v2, v2, v3
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vmaxuw v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vmaxuw v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %a)
+  ret i32 %0
+}
+
+define dso_local i32 @v16i32(<16 x i32> %a) local_unnamed_addr #0 { ; umax reduce of <16 x i32>: pairwise vmaxuw tree over 4 input registers, then doubleword and word folds; extract word 0
+; PWR9LE-LABEL: v16i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vmaxuw v3, v3, v5
+; PWR9LE-NEXT:    vmaxuw v2, v2, v4
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vmaxuw v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vmaxuw v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vmaxuw v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vmaxuw v3, v3, v5
+; PWR9BE-NEXT:    vmaxuw v2, v2, v4
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vmaxuw v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vmaxuw v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vmaxuw v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vmaxuw v3, v3, v5
+; PWR10LE-NEXT:    vmaxuw v2, v2, v4
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vmaxuw v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vmaxuw v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vmaxuw v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vmaxuw v3, v3, v5
+; PWR10BE-NEXT:    vmaxuw v2, v2, v4
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vmaxuw v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vmaxuw v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vmaxuw v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %a)
+  ret i32 %0
+}
+
+declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>) #0
+declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) #0
+declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>) #0
+declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>) #0
+
+;;
+;; Vectors of type i64
+;;
+define dso_local i64 @v2i64(<2 x i64> %a) local_unnamed_addr #0 { ; umax reduce of <2 x i64>: swap halves (xxswapd), vmaxud, move low (LE, mfvsrld) or high (BE, mfvsrd) doubleword to r3
+; PWR9LE-LABEL: v2i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vmaxud v2, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vmaxud v2, v2, v3
+; PWR9BE-NEXT:    mfvsrd r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vmaxud v2, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vmaxud v2, v2, v3
+; PWR10BE-NEXT:    mfvsrd r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a)
+  ret i64 %0
+}
+
+define dso_local i64 @v4i64(<4 x i64> %a) local_unnamed_addr #0 { ; umax reduce of <4 x i64>: vmaxud the two input registers, then xxswapd half-fold and move the result to r3
+; PWR9LE-LABEL: v4i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vmaxud v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vmaxud v2, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vmaxud v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vmaxud v2, v2, v3
+; PWR9BE-NEXT:    mfvsrd r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vmaxud v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vmaxud v2, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vmaxud v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vmaxud v2, v2, v3
+; PWR10BE-NEXT:    mfvsrd r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %a)
+  ret i64 %0
+}
+
+define dso_local i64 @v8i64(<8 x i64> %a) local_unnamed_addr #0 { ; umax reduce of <8 x i64>: pairwise vmaxud tree over 4 input registers, then xxswapd half-fold and move to r3
+; PWR9LE-LABEL: v8i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vmaxud v2, v2, v4
+; PWR9LE-NEXT:    vmaxud v3, v3, v5
+; PWR9LE-NEXT:    vmaxud v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vmaxud v2, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vmaxud v2, v2, v4
+; PWR9BE-NEXT:    vmaxud v3, v3, v5
+; PWR9BE-NEXT:    vmaxud v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vmaxud v2, v2, v3
+; PWR9BE-NEXT:    mfvsrd r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vmaxud v2, v2, v4
+; PWR10LE-NEXT:    vmaxud v3, v3, v5
+; PWR10LE-NEXT:    vmaxud v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vmaxud v2, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vmaxud v2, v2, v4
+; PWR10BE-NEXT:    vmaxud v3, v3, v5
+; PWR10BE-NEXT:    vmaxud v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vmaxud v2, v2, v3
+; PWR10BE-NEXT:    mfvsrd r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %a)
+  ret i64 %0
+}
+
+define dso_local i64 @v16i64(<16 x i64> %a) local_unnamed_addr #0 { ; umax reduce of <16 x i64>: three-level pairwise vmaxud tree over 8 input registers, then xxswapd half-fold and move to r3
+; PWR9LE-LABEL: v16i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vmaxud v3, v3, v7
+; PWR9LE-NEXT:    vmaxud v5, v5, v9
+; PWR9LE-NEXT:    vmaxud v2, v2, v6
+; PWR9LE-NEXT:    vmaxud v4, v4, v8
+; PWR9LE-NEXT:    vmaxud v2, v2, v4
+; PWR9LE-NEXT:    vmaxud v3, v3, v5
+; PWR9LE-NEXT:    vmaxud v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vmaxud v2, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vmaxud v3, v3, v7
+; PWR9BE-NEXT:    vmaxud v5, v5, v9
+; PWR9BE-NEXT:    vmaxud v2, v2, v6
+; PWR9BE-NEXT:    vmaxud v4, v4, v8
+; PWR9BE-NEXT:    vmaxud v2, v2, v4
+; PWR9BE-NEXT:    vmaxud v3, v3, v5
+; PWR9BE-NEXT:    vmaxud v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vmaxud v2, v2, v3
+; PWR9BE-NEXT:    mfvsrd r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vmaxud v3, v3, v7
+; PWR10LE-NEXT:    vmaxud v5, v5, v9
+; PWR10LE-NEXT:    vmaxud v2, v2, v6
+; PWR10LE-NEXT:    vmaxud v4, v4, v8
+; PWR10LE-NEXT:    vmaxud v2, v2, v4
+; PWR10LE-NEXT:    vmaxud v3, v3, v5
+; PWR10LE-NEXT:    vmaxud v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vmaxud v2, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vmaxud v3, v3, v7
+; PWR10BE-NEXT:    vmaxud v5, v5, v9
+; PWR10BE-NEXT:    vmaxud v2, v2, v6
+; PWR10BE-NEXT:    vmaxud v4, v4, v8
+; PWR10BE-NEXT:    vmaxud v2, v2, v4
+; PWR10BE-NEXT:    vmaxud v3, v3, v5
+; PWR10BE-NEXT:    vmaxud v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vmaxud v2, v2, v3
+; PWR10BE-NEXT:    mfvsrd r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> %a)
+  ret i64 %0
+}
+
+declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>) #0
+declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>) #0
+declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>) #0
+declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>) #0
+
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-umin.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-umin.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-umin.ll
@@ -0,0 +1,796 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr10 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR10LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr10 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR10BE
+
+;;
+;; Vectors of type i8
+;;
+define dso_local i8 @v2i8(<2 x i8> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2i8:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vspltb v3, v2, 14
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vminub v2, v2, v3
+; PWR9LE-NEXT:    vextubrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i8:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vspltb v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vminub v2, v2, v3
+; PWR9BE-NEXT:    vextublx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i8:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vspltb v3, v2, 14
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vminub v2, v2, v3
+; PWR10LE-NEXT:    vextubrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i8:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vspltb v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vminub v2, v2, v3
+; PWR10BE-NEXT:    vextublx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> %a)
+  ret i8 %0
+}
+
+define dso_local i8 @v4i8(<4 x i8> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4i8:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vminub v2, v2, v3
+; PWR9LE-NEXT:    vspltb v3, v2, 14
+; PWR9LE-NEXT:    vminub v2, v2, v3
+; PWR9LE-NEXT:    vextubrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i8:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vminub v2, v2, v3
+; PWR9BE-NEXT:    vspltb v3, v2, 1
+; PWR9BE-NEXT:    vminub v2, v2, v3
+; PWR9BE-NEXT:    vextublx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i8:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vminub v2, v2, v3
+; PWR10LE-NEXT:    vspltb v3, v2, 14
+; PWR10LE-NEXT:    vminub v2, v2, v3
+; PWR10LE-NEXT:    vextubrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i8:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vminub v2, v2, v3
+; PWR10BE-NEXT:    vspltb v3, v2, 1
+; PWR10BE-NEXT:    vminub v2, v2, v3
+; PWR10BE-NEXT:    vextublx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> %a)
+  ret i8 %0
+}
+
+define dso_local i8 @v8i8(<8 x i8> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8i8:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vminub v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vminub v2, v2, v3
+; PWR9LE-NEXT:    vspltb v3, v2, 14
+; PWR9LE-NEXT:    vminub v2, v2, v3
+; PWR9LE-NEXT:    vextubrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i8:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vminub v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vminub v2, v2, v3
+; PWR9BE-NEXT:    vspltb v3, v2, 1
+; PWR9BE-NEXT:    vminub v2, v2, v3
+; PWR9BE-NEXT:    vextublx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i8:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vminub v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vminub v2, v2, v3
+; PWR10LE-NEXT:    vspltb v3, v2, 14
+; PWR10LE-NEXT:    vminub v2, v2, v3
+; PWR10LE-NEXT:    vextubrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i8:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vminub v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vminub v2, v2, v3
+; PWR10BE-NEXT:    vspltb v3, v2, 1
+; PWR10BE-NEXT:    vminub v2, v2, v3
+; PWR10BE-NEXT:    vextublx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a)
+  ret i8 %0
+}
+
+define dso_local i8 @v16i8(<16 x i8> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16i8:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vminub v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vminub v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vminub v2, v2, v3
+; PWR9LE-NEXT:    vspltb v3, v2, 14
+; PWR9LE-NEXT:    vminub v2, v2, v3
+; PWR9LE-NEXT:    vextubrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i8:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vminub v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vminub v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vminub v2, v2, v3
+; PWR9BE-NEXT:    vspltb v3, v2, 1
+; PWR9BE-NEXT:    vminub v2, v2, v3
+; PWR9BE-NEXT:    vextublx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i8:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vminub v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vminub v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vminub v2, v2, v3
+; PWR10LE-NEXT:    vspltb v3, v2, 14
+; PWR10LE-NEXT:    vminub v2, v2, v3
+; PWR10LE-NEXT:    vextubrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i8:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vminub v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vminub v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vminub v2, v2, v3
+; PWR10BE-NEXT:    vspltb v3, v2, 1
+; PWR10BE-NEXT:    vminub v2, v2, v3
+; PWR10BE-NEXT:    vextublx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a)
+  ret i8 %0
+}
+
+declare i8 @llvm.vector.reduce.umin.v2i8(<2 x i8>) #0
+declare i8 @llvm.vector.reduce.umin.v4i8(<4 x i8>) #0
+declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>) #0
+declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>) #0
+
+;;
+;; Vectors of type i16
+;;
+define dso_local i16 @v2i16(<2 x i16> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2i16:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vminuh v2, v2, v3
+; PWR9LE-NEXT:    vextuhrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i16:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vminuh v2, v2, v3
+; PWR9BE-NEXT:    vextuhlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i16:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vminuh v2, v2, v3
+; PWR10LE-NEXT:    vextuhrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i16:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vminuh v2, v2, v3
+; PWR10BE-NEXT:    vextuhlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> %a)
+  ret i16 %0
+}
+
+define dso_local i16 @v4i16(<4 x i16> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4i16:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vminuh v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vminuh v2, v2, v3
+; PWR9LE-NEXT:    vextuhrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i16:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vminuh v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vminuh v2, v2, v3
+; PWR9BE-NEXT:    vextuhlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i16:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vminuh v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vminuh v2, v2, v3
+; PWR10LE-NEXT:    vextuhrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i16:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vminuh v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vminuh v2, v2, v3
+; PWR10BE-NEXT:    vextuhlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a)
+  ret i16 %0
+}
+
+define dso_local i16 @v8i16(<8 x i16> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8i16:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vminuh v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vminuh v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vminuh v2, v2, v3
+; PWR9LE-NEXT:    vextuhrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i16:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vminuh v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vminuh v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vminuh v2, v2, v3
+; PWR9BE-NEXT:    vextuhlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i16:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vminuh v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vminuh v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vminuh v2, v2, v3
+; PWR10LE-NEXT:    vextuhrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i16:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vminuh v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vminuh v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vminuh v2, v2, v3
+; PWR10BE-NEXT:    vextuhlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a)
+  ret i16 %0
+}
+
+define dso_local i16 @v16i16(<16 x i16> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16i16:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vminuh v2, v2, v3
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vminuh v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vminuh v2, v2, v3
+; PWR9LE-NEXT:    vsplth v3, v2, 6
+; PWR9LE-NEXT:    vminuh v2, v2, v3
+; PWR9LE-NEXT:    vextuhrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i16:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vminuh v2, v2, v3
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vminuh v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vminuh v2, v2, v3
+; PWR9BE-NEXT:    vsplth v3, v2, 1
+; PWR9BE-NEXT:    vminuh v2, v2, v3
+; PWR9BE-NEXT:    vextuhlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i16:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vminuh v2, v2, v3
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vminuh v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vminuh v2, v2, v3
+; PWR10LE-NEXT:    vsplth v3, v2, 6
+; PWR10LE-NEXT:    vminuh v2, v2, v3
+; PWR10LE-NEXT:    vextuhrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i16:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vminuh v2, v2, v3
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vminuh v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vminuh v2, v2, v3
+; PWR10BE-NEXT:    vsplth v3, v2, 1
+; PWR10BE-NEXT:    vminuh v2, v2, v3
+; PWR10BE-NEXT:    vextuhlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %a)
+  ret i16 %0
+}
+
+declare i16 @llvm.vector.reduce.umin.v2i16(<2 x i16>) #0
+declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>) #0
+declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>) #0
+declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>) #0
+
+;;
+;; Vectors of type i32
+;;
+define dso_local i32 @v2i32(<2 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vminuw v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vminuw v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vminuw v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vminuw v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a)
+  ret i32 %0
+}
+
+define dso_local i32 @v4i32(<4 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vminuw v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vminuw v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vminuw v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vminuw v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vminuw v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vminuw v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vminuw v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vminuw v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a)
+  ret i32 %0
+}
+
+define dso_local i32 @v8i32(<8 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vminuw v2, v2, v3
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vminuw v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vminuw v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vminuw v2, v2, v3
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vminuw v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vminuw v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vminuw v2, v2, v3
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vminuw v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vminuw v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vminuw v2, v2, v3
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vminuw v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vminuw v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %a)
+  ret i32 %0
+}
+
+define dso_local i32 @v16i32(<16 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vminuw v3, v3, v5
+; PWR9LE-NEXT:    vminuw v2, v2, v4
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    vminuw v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vminuw v2, v2, v3
+; PWR9LE-NEXT:    xxspltw v3, v2, 2
+; PWR9LE-NEXT:    vminuw v2, v2, v3
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vminuw v3, v3, v5
+; PWR9BE-NEXT:    vminuw v2, v2, v4
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    vminuw v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vminuw v2, v2, v3
+; PWR9BE-NEXT:    xxspltw v3, v2, 1
+; PWR9BE-NEXT:    vminuw v2, v2, v3
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vminuw v3, v3, v5
+; PWR10LE-NEXT:    vminuw v2, v2, v4
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    vminuw v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vminuw v2, v2, v3
+; PWR10LE-NEXT:    xxspltw v3, v2, 2
+; PWR10LE-NEXT:    vminuw v2, v2, v3
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vminuw v3, v3, v5
+; PWR10BE-NEXT:    vminuw v2, v2, v4
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    vminuw v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vminuw v2, v2, v3
+; PWR10BE-NEXT:    xxspltw v3, v2, 1
+; PWR10BE-NEXT:    vminuw v2, v2, v3
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %a)
+  ret i32 %0
+}
+
+declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>) #0
+declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) #0
+declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>) #0
+declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>) #0
+
+;;
+;; Vectors of type i64
+;;
+define dso_local i64 @v2i64(<2 x i64> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vminud v2, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vminud v2, v2, v3
+; PWR9BE-NEXT:    mfvsrd r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vminud v2, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vminud v2, v2, v3
+; PWR10BE-NEXT:    mfvsrd r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %a)
+  ret i64 %0
+}
+
+define dso_local i64 @v4i64(<4 x i64> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vminud v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vminud v2, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vminud v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vminud v2, v2, v3
+; PWR9BE-NEXT:    mfvsrd r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vminud v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vminud v2, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vminud v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vminud v2, v2, v3
+; PWR10BE-NEXT:    mfvsrd r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %a)
+  ret i64 %0
+}
+
+define dso_local i64 @v8i64(<8 x i64> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vminud v2, v2, v4
+; PWR9LE-NEXT:    vminud v3, v3, v5
+; PWR9LE-NEXT:    vminud v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vminud v2, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vminud v2, v2, v4
+; PWR9BE-NEXT:    vminud v3, v3, v5
+; PWR9BE-NEXT:    vminud v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vminud v2, v2, v3
+; PWR9BE-NEXT:    mfvsrd r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vminud v2, v2, v4
+; PWR10LE-NEXT:    vminud v3, v3, v5
+; PWR10LE-NEXT:    vminud v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vminud v2, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vminud v2, v2, v4
+; PWR10BE-NEXT:    vminud v3, v3, v5
+; PWR10BE-NEXT:    vminud v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vminud v2, v2, v3
+; PWR10BE-NEXT:    mfvsrd r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %a)
+  ret i64 %0
+}
+
+define dso_local i64 @v16i64(<16 x i64> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    vminud v3, v3, v7
+; PWR9LE-NEXT:    vminud v5, v5, v9
+; PWR9LE-NEXT:    vminud v2, v2, v6
+; PWR9LE-NEXT:    vminud v4, v4, v8
+; PWR9LE-NEXT:    vminud v2, v2, v4
+; PWR9LE-NEXT:    vminud v3, v3, v5
+; PWR9LE-NEXT:    vminud v2, v2, v3
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    vminud v2, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    vminud v3, v3, v7
+; PWR9BE-NEXT:    vminud v5, v5, v9
+; PWR9BE-NEXT:    vminud v2, v2, v6
+; PWR9BE-NEXT:    vminud v4, v4, v8
+; PWR9BE-NEXT:    vminud v2, v2, v4
+; PWR9BE-NEXT:    vminud v3, v3, v5
+; PWR9BE-NEXT:    vminud v2, v2, v3
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    vminud v2, v2, v3
+; PWR9BE-NEXT:    mfvsrd r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    vminud v3, v3, v7
+; PWR10LE-NEXT:    vminud v5, v5, v9
+; PWR10LE-NEXT:    vminud v2, v2, v6
+; PWR10LE-NEXT:    vminud v4, v4, v8
+; PWR10LE-NEXT:    vminud v2, v2, v4
+; PWR10LE-NEXT:    vminud v3, v3, v5
+; PWR10LE-NEXT:    vminud v2, v2, v3
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    vminud v2, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    vminud v3, v3, v7
+; PWR10BE-NEXT:    vminud v5, v5, v9
+; PWR10BE-NEXT:    vminud v2, v2, v6
+; PWR10BE-NEXT:    vminud v4, v4, v8
+; PWR10BE-NEXT:    vminud v2, v2, v4
+; PWR10BE-NEXT:    vminud v3, v3, v5
+; PWR10BE-NEXT:    vminud v2, v2, v3
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    vminud v2, v2, v3
+; PWR10BE-NEXT:    mfvsrd r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> %a)
+  ret i64 %0
+}
+
+declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>) #0
+declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>) #0
+declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>) #0
+declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>) #0
+
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-xor.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-xor.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-xor.ll
@@ -0,0 +1,392 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr10 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR10LE
+; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mcpu=pwr10 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR10BE
+
+;;
+;; Vectors of type i32
+;;
+define dso_local i32 @v2i32(<2 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxspltw vs0, v2, 2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    xxlxor v2, v2, vs0
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxspltw vs0, v2, 1
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    xxlxor v2, v2, vs0
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxspltw vs0, v2, 2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    xxlxor v2, v2, vs0
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxspltw vs0, v2, 1
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    xxlxor v2, v2, vs0
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a)
+  ret i32 %0
+}
+
+define dso_local i32 @v4i32(<4 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    xxlxor vs0, v2, v3
+; PWR9LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR9LE-NEXT:    xxlxor v2, vs0, vs1
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    xxlxor vs0, v2, v3
+; PWR9BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR9BE-NEXT:    xxlxor v2, vs0, vs1
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    xxlxor vs0, v2, v3
+; PWR10LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR10LE-NEXT:    xxlxor v2, vs0, vs1
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    xxlxor vs0, v2, v3
+; PWR10BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR10BE-NEXT:    xxlxor v2, vs0, vs1
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
+  ret i32 %0
+}
+
+define dso_local i32 @v8i32(<8 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxlxor vs0, v2, v3
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xxlxor vs0, vs0, v2
+; PWR9LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR9LE-NEXT:    xxlxor v2, vs0, vs1
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxlxor vs0, v2, v3
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xxlxor vs0, vs0, v2
+; PWR9BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR9BE-NEXT:    xxlxor v2, vs0, vs1
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxlxor vs0, v2, v3
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    xxswapd v2, vs0
+; PWR10LE-NEXT:    xxlxor vs0, vs0, v2
+; PWR10LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR10LE-NEXT:    xxlxor v2, vs0, vs1
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxlxor vs0, v2, v3
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    xxswapd v2, vs0
+; PWR10BE-NEXT:    xxlxor vs0, vs0, v2
+; PWR10BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR10BE-NEXT:    xxlxor v2, vs0, vs1
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %a)
+  ret i32 %0
+}
+
+define dso_local i32 @v16i32(<16 x i32> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16i32:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxlxor vs0, v3, v5
+; PWR9LE-NEXT:    xxlxor vs1, v2, v4
+; PWR9LE-NEXT:    li r3, 0
+; PWR9LE-NEXT:    xxlxor vs0, vs1, vs0
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xxlxor vs0, vs0, v2
+; PWR9LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR9LE-NEXT:    xxlxor v2, vs0, vs1
+; PWR9LE-NEXT:    vextuwrx r3, r3, v2
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i32:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxlxor vs0, v3, v5
+; PWR9BE-NEXT:    xxlxor vs1, v2, v4
+; PWR9BE-NEXT:    li r3, 0
+; PWR9BE-NEXT:    xxlxor vs0, vs1, vs0
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xxlxor vs0, vs0, v2
+; PWR9BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR9BE-NEXT:    xxlxor v2, vs0, vs1
+; PWR9BE-NEXT:    vextuwlx r3, r3, v2
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i32:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxlxor vs0, v3, v5
+; PWR10LE-NEXT:    xxlxor vs1, v2, v4
+; PWR10LE-NEXT:    li r3, 0
+; PWR10LE-NEXT:    xxlxor vs0, vs1, vs0
+; PWR10LE-NEXT:    xxswapd v2, vs0
+; PWR10LE-NEXT:    xxlxor vs0, vs0, v2
+; PWR10LE-NEXT:    xxspltw vs1, vs0, 2
+; PWR10LE-NEXT:    xxlxor v2, vs0, vs1
+; PWR10LE-NEXT:    vextuwrx r3, r3, v2
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i32:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxlxor vs0, v3, v5
+; PWR10BE-NEXT:    xxlxor vs1, v2, v4
+; PWR10BE-NEXT:    li r3, 0
+; PWR10BE-NEXT:    xxlxor vs0, vs1, vs0
+; PWR10BE-NEXT:    xxswapd v2, vs0
+; PWR10BE-NEXT:    xxlxor vs0, vs0, v2
+; PWR10BE-NEXT:    xxspltw vs1, vs0, 1
+; PWR10BE-NEXT:    xxlxor v2, vs0, vs1
+; PWR10BE-NEXT:    vextuwlx r3, r3, v2
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %a)
+  ret i32 %0
+}
+
+declare i32 @llvm.vector.reduce.xor.v2i32(<2 x i32>) #0
+declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>) #0
+declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32>) #0
+declare i32 @llvm.vector.reduce.xor.v16i32(<16 x i32>) #0
+
+;;
+;; Vectors of type i64
+;;
+; xor-reduce a single <2 x i64>: xxswapd pairs the two doublewords, one
+; xxlxor combines them, and the scalar lands in r3 (mfvsrld on LE,
+; mffprd on BE). Checks autogenerated by update_llc_test_checks.py --
+; regenerate rather than hand-edit.
+define dso_local i64 @v2i64(<2 x i64> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v2i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxswapd v3, v2
+; PWR9LE-NEXT:    xxlxor vs0, v2, v3
+; PWR9LE-NEXT:    mfvsrld r3, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v2i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxswapd v3, v2
+; PWR9BE-NEXT:    xxlxor vs0, v2, v3
+; PWR9BE-NEXT:    mffprd r3, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v2i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxswapd v3, v2
+; PWR10LE-NEXT:    xxlxor vs0, v2, v3
+; PWR10LE-NEXT:    mfvsrld r3, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v2i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxswapd v3, v2
+; PWR10BE-NEXT:    xxlxor vs0, v2, v3
+; PWR10BE-NEXT:    mffprd r3, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %a)
+  ret i64 %0
+}
+
+; xor-reduce <4 x i64> (two vector regs, v2/v3): one xxlxor folds the two
+; halves, then the same swap+xor+move-to-r3 tail as the v2i64 case.
+; Checks autogenerated by update_llc_test_checks.py.
+define dso_local i64 @v4i64(<4 x i64> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v4i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxlxor vs0, v2, v3
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xxlxor vs0, vs0, v2
+; PWR9LE-NEXT:    mfvsrld r3, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v4i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxlxor vs0, v2, v3
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xxlxor vs0, vs0, v2
+; PWR9BE-NEXT:    mffprd r3, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v4i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxlxor vs0, v2, v3
+; PWR10LE-NEXT:    xxswapd v2, vs0
+; PWR10LE-NEXT:    xxlxor vs0, vs0, v2
+; PWR10LE-NEXT:    mfvsrld r3, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v4i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxlxor vs0, v2, v3
+; PWR10BE-NEXT:    xxswapd v2, vs0
+; PWR10BE-NEXT:    xxlxor vs0, vs0, v2
+; PWR10BE-NEXT:    mffprd r3, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %a)
+  ret i64 %0
+}
+
+; xor-reduce <8 x i64> (four vector regs, v2-v5): a two-level xxlxor tree
+; (v3^v5, v2^v4, then combine) narrows to one register before the
+; swap+xor+move-to-r3 tail. Checks autogenerated by
+; update_llc_test_checks.py.
+define dso_local i64 @v8i64(<8 x i64> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v8i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxlxor vs0, v3, v5
+; PWR9LE-NEXT:    xxlxor vs1, v2, v4
+; PWR9LE-NEXT:    xxlxor vs0, vs1, vs0
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xxlxor vs0, vs0, v2
+; PWR9LE-NEXT:    mfvsrld r3, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v8i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxlxor vs0, v3, v5
+; PWR9BE-NEXT:    xxlxor vs1, v2, v4
+; PWR9BE-NEXT:    xxlxor vs0, vs1, vs0
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xxlxor vs0, vs0, v2
+; PWR9BE-NEXT:    mffprd r3, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v8i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxlxor vs0, v3, v5
+; PWR10LE-NEXT:    xxlxor vs1, v2, v4
+; PWR10LE-NEXT:    xxlxor vs0, vs1, vs0
+; PWR10LE-NEXT:    xxswapd v2, vs0
+; PWR10LE-NEXT:    xxlxor vs0, vs0, v2
+; PWR10LE-NEXT:    mfvsrld r3, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v8i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxlxor vs0, v3, v5
+; PWR10BE-NEXT:    xxlxor vs1, v2, v4
+; PWR10BE-NEXT:    xxlxor vs0, vs1, vs0
+; PWR10BE-NEXT:    xxswapd v2, vs0
+; PWR10BE-NEXT:    xxlxor vs0, vs0, v2
+; PWR10BE-NEXT:    mffprd r3, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> %a)
+  ret i64 %0
+}
+
+; xor-reduce <16 x i64> (eight vector regs, v2-v9): a three-level xxlxor
+; tree folds the eight registers into one, then the swap+xor+move-to-r3
+; tail extracts the scalar. Checks autogenerated by
+; update_llc_test_checks.py.
+define dso_local i64 @v16i64(<16 x i64> %a) local_unnamed_addr #0 {
+; PWR9LE-LABEL: v16i64:
+; PWR9LE:       # %bb.0: # %entry
+; PWR9LE-NEXT:    xxlxor vs0, v4, v8
+; PWR9LE-NEXT:    xxlxor vs1, v2, v6
+; PWR9LE-NEXT:    xxlxor vs2, v5, v9
+; PWR9LE-NEXT:    xxlxor vs3, v3, v7
+; PWR9LE-NEXT:    xxlxor vs2, vs3, vs2
+; PWR9LE-NEXT:    xxlxor vs0, vs1, vs0
+; PWR9LE-NEXT:    xxlxor vs0, vs0, vs2
+; PWR9LE-NEXT:    xxswapd v2, vs0
+; PWR9LE-NEXT:    xxlxor vs0, vs0, v2
+; PWR9LE-NEXT:    mfvsrld r3, vs0
+; PWR9LE-NEXT:    blr
+;
+; PWR9BE-LABEL: v16i64:
+; PWR9BE:       # %bb.0: # %entry
+; PWR9BE-NEXT:    xxlxor vs0, v4, v8
+; PWR9BE-NEXT:    xxlxor vs1, v2, v6
+; PWR9BE-NEXT:    xxlxor vs2, v5, v9
+; PWR9BE-NEXT:    xxlxor vs3, v3, v7
+; PWR9BE-NEXT:    xxlxor vs2, vs3, vs2
+; PWR9BE-NEXT:    xxlxor vs0, vs1, vs0
+; PWR9BE-NEXT:    xxlxor vs0, vs0, vs2
+; PWR9BE-NEXT:    xxswapd v2, vs0
+; PWR9BE-NEXT:    xxlxor vs0, vs0, v2
+; PWR9BE-NEXT:    mffprd r3, f0
+; PWR9BE-NEXT:    blr
+;
+; PWR10LE-LABEL: v16i64:
+; PWR10LE:       # %bb.0: # %entry
+; PWR10LE-NEXT:    xxlxor vs0, v4, v8
+; PWR10LE-NEXT:    xxlxor vs1, v2, v6
+; PWR10LE-NEXT:    xxlxor vs2, v5, v9
+; PWR10LE-NEXT:    xxlxor vs3, v3, v7
+; PWR10LE-NEXT:    xxlxor vs2, vs3, vs2
+; PWR10LE-NEXT:    xxlxor vs0, vs1, vs0
+; PWR10LE-NEXT:    xxlxor vs0, vs0, vs2
+; PWR10LE-NEXT:    xxswapd v2, vs0
+; PWR10LE-NEXT:    xxlxor vs0, vs0, v2
+; PWR10LE-NEXT:    mfvsrld r3, vs0
+; PWR10LE-NEXT:    blr
+;
+; PWR10BE-LABEL: v16i64:
+; PWR10BE:       # %bb.0: # %entry
+; PWR10BE-NEXT:    xxlxor vs0, v4, v8
+; PWR10BE-NEXT:    xxlxor vs1, v2, v6
+; PWR10BE-NEXT:    xxlxor vs2, v5, v9
+; PWR10BE-NEXT:    xxlxor vs3, v3, v7
+; PWR10BE-NEXT:    xxlxor vs2, vs3, vs2
+; PWR10BE-NEXT:    xxlxor vs0, vs1, vs0
+; PWR10BE-NEXT:    xxlxor vs0, vs0, vs2
+; PWR10BE-NEXT:    xxswapd v2, vs0
+; PWR10BE-NEXT:    xxlxor vs0, vs0, v2
+; PWR10BE-NEXT:    mffprd r3, f0
+; PWR10BE-NEXT:    blr
+entry:
+  %0 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> %a)
+  ret i64 %0
+}
+
+; Declarations of the i64 xor-reduction intrinsics exercised by the
+; v2i64/v4i64/v8i64/v16i64 tests above.
+declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>) #0
+declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>) #0
+declare i64 @llvm.vector.reduce.xor.v8i64(<8 x i64>) #0
+declare i64 @llvm.vector.reduce.xor.v16i64(<16 x i64>) #0
+
+
+attributes #0 = { nounwind }