Skip to content

Commit b1ce776

Browse files
committed Jun 19, 2018
[X86] VRNDSCALE* folding from masked and scalar ffloor and fceil patterns
This patch handles back-end folding of generic patterns created by lowering the X86 rounding intrinsics to native IR in cases where the instruction isn't a straightforward packed values rounding operation, but a masked operation or a scalar operation. Differential Revision: https://reviews.llvm.org/D45203 llvm-svn: 335037
1 parent e6a9c24 commit b1ce776

File tree

4 files changed

+2252
-6
lines changed

4 files changed

+2252
-6
lines changed
 

‎llvm/lib/Target/X86/X86ISelLowering.cpp

+25-3
Original file line numberDiff line numberDiff line change
@@ -39121,9 +39121,31 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
3912139121
// TODO: SimplifyDemandedBits instead?
3912239122
if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
3912339123
if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
39124-
if (C->getAPIntValue().isOneValue())
39125-
return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1,
39126-
Src.getOperand(0));
39124+
if (C->getAPIntValue().isOneValue()) {
39125+
SDValue Mask = Src.getOperand(0);
39126+
if (Mask.getOpcode() == ISD::TRUNCATE &&
39127+
Mask.getOperand(0).getValueType() != MVT::i16)
39128+
Mask = Mask.getOperand(0);
39129+
return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1, Mask);
39130+
}
39131+
39132+
// The result of AND may also be truncated. This occurs in code for lowered
39133+
// masked scalar intrinsics.
39134+
if (VT == MVT::v1i1 && Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
39135+
Src.getOperand(0).getOpcode() == ISD::AND &&
39136+
Src.getOperand(0).hasOneUse())
39137+
if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(0).getOperand(1)))
39138+
if (C->getAPIntValue().isOneValue()) {
39139+
SDValue Mask = Src.getOperand(0).getOperand(0);
39140+
if (Mask.getOpcode() == ISD::TRUNCATE &&
39141+
Mask.getOperand(0).getValueType() != MVT::i16)
39142+
Mask = Mask.getOperand(0);
39143+
// Check if the initial value is an i16. scalar_to_vector fails to
39144+
// select for that type, so the combine should be aborted.
39145+
if (Mask.getValueType() == MVT::i16)
39146+
return SDValue();
39147+
return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1, Mask);
39148+
}
3912739149

3912839150
return SDValue();
3912939151
}

‎llvm/lib/Target/X86/X86InstrAVX512.td

+84-2
Original file line numberDiff line numberDiff line change
@@ -8781,16 +8781,50 @@ multiclass avx512_masked_scalar_imm<SDNode OpNode, string OpcPrefix, SDNode Move
87818781
def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask,
87828782
(OpNode (extractelt _.VT:$src2, (iPTR 0))),
87838783
(extractelt _.VT:$dst, (iPTR 0))))),
8784-
(!cast<Instruction>("V"#OpcPrefix#r_Intk)
8784+
(!cast<Instruction>("V"#OpcPrefix#Zr_Intk)
87858785
_.VT:$dst, OutMask, _.VT:$src1, _.VT:$src2, (i32 ImmV))>;
87868786

87878787
def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask,
87888788
(OpNode (extractelt _.VT:$src2, (iPTR 0))), ZeroFP))),
8789-
(!cast<Instruction>("V"#OpcPrefix#r_Intkz)
8789+
(!cast<Instruction>("V"#OpcPrefix#Zr_Intkz)
87908790
OutMask, _.VT:$src1, _.VT:$src2, (i32 ImmV))>;
87918791
}
87928792
}
87938793

8794+
defm : avx512_masked_scalar_imm<ffloor, "RNDSCALESS", X86Movss,
8795+
(v1i1 (scalar_to_vector GR32:$mask)),
8796+
v4f32x_info, fp32imm0, 0x01,
8797+
(COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
8798+
defm : avx512_masked_scalar_imm<ffloor, "RNDSCALESS", X86Movss,
8799+
(v1i1 (scalar_to_vector GR8:$mask)),
8800+
v4f32x_info, fp32imm0, 0x01,
8801+
(COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
8802+
defm : avx512_masked_scalar_imm<fceil, "RNDSCALESS", X86Movss,
8803+
(v1i1 (scalar_to_vector GR32:$mask)),
8804+
v4f32x_info, fp32imm0, 0x02,
8805+
(COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
8806+
defm : avx512_masked_scalar_imm<fceil, "RNDSCALESS", X86Movss,
8807+
(v1i1 (scalar_to_vector GR8:$mask)),
8808+
v4f32x_info, fp32imm0, 0x02,
8809+
(COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
8810+
defm : avx512_masked_scalar_imm<ffloor, "RNDSCALESD", X86Movsd,
8811+
(v1i1 (scalar_to_vector GR32:$mask)),
8812+
v2f64x_info, fp64imm0, 0x01,
8813+
(COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
8814+
defm : avx512_masked_scalar_imm<ffloor, "RNDSCALESD", X86Movsd,
8815+
(v1i1 (scalar_to_vector GR8:$mask)),
8816+
v2f64x_info, fp64imm0, 0x01,
8817+
(COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
8818+
defm : avx512_masked_scalar_imm<fceil, "RNDSCALESD", X86Movsd,
8819+
(v1i1 (scalar_to_vector GR32:$mask)),
8820+
v2f64x_info, fp64imm0, 0x02,
8821+
(COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
8822+
defm : avx512_masked_scalar_imm<fceil, "RNDSCALESD", X86Movsd,
8823+
(v1i1 (scalar_to_vector GR8:$mask)),
8824+
v2f64x_info, fp64imm0, 0x02,
8825+
(COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
8826+
8827+
87948828
//-------------------------------------------------
87958829
// Integer truncate and extend operations
87968830
//-------------------------------------------------
@@ -9936,10 +9970,18 @@ defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info,
99369970
let Predicates = [HasAVX512] in {
99379971
def : Pat<(v16f32 (ffloor VR512:$src)),
99389972
(VRNDSCALEPSZrri VR512:$src, (i32 0x9))>;
9973+
def : Pat<(v16f32 (vselect VK16WM:$mask, (ffloor VR512:$src), VR512:$dst)),
9974+
(VRNDSCALEPSZrrik VR512:$dst, VK16WM:$mask, VR512:$src, (i32 0x9))>;
9975+
def : Pat<(v16f32 (vselect VK16WM:$mask, (ffloor VR512:$src), v16f32_info.ImmAllZerosV)),
9976+
(VRNDSCALEPSZrrikz VK16WM:$mask, VR512:$src, (i32 0x9))>;
99399977
def : Pat<(v16f32 (fnearbyint VR512:$src)),
99409978
(VRNDSCALEPSZrri VR512:$src, (i32 0xC))>;
99419979
def : Pat<(v16f32 (fceil VR512:$src)),
99429980
(VRNDSCALEPSZrri VR512:$src, (i32 0xA))>;
9981+
def : Pat<(v16f32 (vselect VK16WM:$mask, (fceil VR512:$src), VR512:$dst)),
9982+
(VRNDSCALEPSZrrik VR512:$dst, VK16WM:$mask, VR512:$src, (i32 0xA))>;
9983+
def : Pat<(v16f32 (vselect VK16WM:$mask, (fceil VR512:$src), v16f32_info.ImmAllZerosV)),
9984+
(VRNDSCALEPSZrrikz VK16WM:$mask, VR512:$src, (i32 0xA))>;
99439985
def : Pat<(v16f32 (frint VR512:$src)),
99449986
(VRNDSCALEPSZrri VR512:$src, (i32 0x4))>;
99459987
def : Pat<(v16f32 (ftrunc VR512:$src)),
@@ -9958,10 +10000,18 @@ def : Pat<(v16f32 (ftrunc (loadv16f32 addr:$src))),
995810000

995910001
def : Pat<(v8f64 (ffloor VR512:$src)),
996010002
(VRNDSCALEPDZrri VR512:$src, (i32 0x9))>;
10003+
def : Pat<(v8f64 (vselect VK8WM:$mask, (ffloor VR512:$src), VR512:$dst)),
10004+
(VRNDSCALEPDZrrik VR512:$dst, VK8WM:$mask, VR512:$src, (i32 0x9))>;
10005+
def : Pat<(v8f64 (vselect VK8WM:$mask, (ffloor VR512:$src), v8f64_info.ImmAllZerosV)),
10006+
(VRNDSCALEPDZrrikz VK8WM:$mask, VR512:$src, (i32 0x9))>;
996110007
def : Pat<(v8f64 (fnearbyint VR512:$src)),
996210008
(VRNDSCALEPDZrri VR512:$src, (i32 0xC))>;
996310009
def : Pat<(v8f64 (fceil VR512:$src)),
996410010
(VRNDSCALEPDZrri VR512:$src, (i32 0xA))>;
10011+
def : Pat<(v8f64 (vselect VK8WM:$mask, (fceil VR512:$src), VR512:$dst)),
10012+
(VRNDSCALEPDZrrik VR512:$dst, VK8WM:$mask, VR512:$src, (i32 0xA))>;
10013+
def : Pat<(v8f64 (vselect VK8WM:$mask, (fceil VR512:$src), v8f64_info.ImmAllZerosV)),
10014+
(VRNDSCALEPDZrrikz VK8WM:$mask, VR512:$src, (i32 0xA))>;
996510015
def : Pat<(v8f64 (frint VR512:$src)),
996610016
(VRNDSCALEPDZrri VR512:$src, (i32 0x4))>;
996710017
def : Pat<(v8f64 (ftrunc VR512:$src)),
@@ -9982,10 +10032,18 @@ def : Pat<(v8f64 (ftrunc (loadv8f64 addr:$src))),
998210032
let Predicates = [HasVLX] in {
998310033
def : Pat<(v4f32 (ffloor VR128X:$src)),
998410034
(VRNDSCALEPSZ128rri VR128X:$src, (i32 0x9))>;
10035+
def : Pat<(v4f32 (vselect VK4WM:$mask, (ffloor VR128X:$src), VR128X:$dst)),
10036+
(VRNDSCALEPSZ128rrik VR128X:$dst, VK4WM:$mask, VR128X:$src, (i32 0x9))>;
10037+
def : Pat<(v4f32 (vselect VK4WM:$mask, (ffloor VR128X:$src), v4f32x_info.ImmAllZerosV)),
10038+
(VRNDSCALEPSZ128rrikz VK4WM:$mask, VR128X:$src, (i32 0x9))>;
998510039
def : Pat<(v4f32 (fnearbyint VR128X:$src)),
998610040
(VRNDSCALEPSZ128rri VR128X:$src, (i32 0xC))>;
998710041
def : Pat<(v4f32 (fceil VR128X:$src)),
998810042
(VRNDSCALEPSZ128rri VR128X:$src, (i32 0xA))>;
10043+
def : Pat<(v4f32 (vselect VK4WM:$mask, (fceil VR128X:$src), VR128X:$dst)),
10044+
(VRNDSCALEPSZ128rrik VR128X:$dst, VK4WM:$mask, VR128X:$src, (i32 0xA))>;
10045+
def : Pat<(v4f32 (vselect VK4WM:$mask, (fceil VR128X:$src), v4f32x_info.ImmAllZerosV)),
10046+
(VRNDSCALEPSZ128rrikz VK4WM:$mask, VR128X:$src, (i32 0xA))>;
998910047
def : Pat<(v4f32 (frint VR128X:$src)),
999010048
(VRNDSCALEPSZ128rri VR128X:$src, (i32 0x4))>;
999110049
def : Pat<(v4f32 (ftrunc VR128X:$src)),
@@ -10004,10 +10062,18 @@ def : Pat<(v4f32 (ftrunc (loadv4f32 addr:$src))),
1000410062

1000510063
def : Pat<(v2f64 (ffloor VR128X:$src)),
1000610064
(VRNDSCALEPDZ128rri VR128X:$src, (i32 0x9))>;
10065+
def : Pat<(v2f64 (vselect VK2WM:$mask, (ffloor VR128X:$src), VR128X:$dst)),
10066+
(VRNDSCALEPDZ128rrik VR128X:$dst, VK2WM:$mask, VR128X:$src, (i32 0x9))>;
10067+
def : Pat<(v2f64 (vselect VK2WM:$mask, (ffloor VR128X:$src), v2f64x_info.ImmAllZerosV)),
10068+
(VRNDSCALEPDZ128rrikz VK2WM:$mask, VR128X:$src, (i32 0x9))>;
1000710069
def : Pat<(v2f64 (fnearbyint VR128X:$src)),
1000810070
(VRNDSCALEPDZ128rri VR128X:$src, (i32 0xC))>;
1000910071
def : Pat<(v2f64 (fceil VR128X:$src)),
1001010072
(VRNDSCALEPDZ128rri VR128X:$src, (i32 0xA))>;
10073+
def : Pat<(v2f64 (vselect VK2WM:$mask, (fceil VR128X:$src), VR128X:$dst)),
10074+
(VRNDSCALEPDZ128rrik VR128X:$dst, VK2WM:$mask, VR128X:$src, (i32 0xA))>;
10075+
def : Pat<(v2f64 (vselect VK2WM:$mask, (fceil VR128X:$src), v2f64x_info.ImmAllZerosV)),
10076+
(VRNDSCALEPDZ128rrikz VK2WM:$mask, VR128X:$src, (i32 0xA))>;
1001110077
def : Pat<(v2f64 (frint VR128X:$src)),
1001210078
(VRNDSCALEPDZ128rri VR128X:$src, (i32 0x4))>;
1001310079
def : Pat<(v2f64 (ftrunc VR128X:$src)),
@@ -10026,10 +10092,18 @@ def : Pat<(v2f64 (ftrunc (loadv2f64 addr:$src))),
1002610092

1002710093
def : Pat<(v8f32 (ffloor VR256X:$src)),
1002810094
(VRNDSCALEPSZ256rri VR256X:$src, (i32 0x9))>;
10095+
def : Pat<(v8f32 (vselect VK8WM:$mask, (ffloor VR256X:$src), VR256X:$dst)),
10096+
(VRNDSCALEPSZ256rrik VR256X:$dst, VK8WM:$mask, VR256X:$src, (i32 0x9))>;
10097+
def : Pat<(v8f32 (vselect VK8WM:$mask, (ffloor VR256X:$src), v8f32x_info.ImmAllZerosV)),
10098+
(VRNDSCALEPSZ256rrikz VK8WM:$mask, VR256X:$src, (i32 0x9))>;
1002910099
def : Pat<(v8f32 (fnearbyint VR256X:$src)),
1003010100
(VRNDSCALEPSZ256rri VR256X:$src, (i32 0xC))>;
1003110101
def : Pat<(v8f32 (fceil VR256X:$src)),
1003210102
(VRNDSCALEPSZ256rri VR256X:$src, (i32 0xA))>;
10103+
def : Pat<(v8f32 (vselect VK8WM:$mask, (fceil VR256X:$src), VR256X:$dst)),
10104+
(VRNDSCALEPSZ256rrik VR256X:$dst, VK8WM:$mask, VR256X:$src, (i32 0xA))>;
10105+
def : Pat<(v8f32 (vselect VK8WM:$mask, (fceil VR256X:$src), v8f32x_info.ImmAllZerosV)),
10106+
(VRNDSCALEPSZ256rrikz VK8WM:$mask, VR256X:$src, (i32 0xA))>;
1003310107
def : Pat<(v8f32 (frint VR256X:$src)),
1003410108
(VRNDSCALEPSZ256rri VR256X:$src, (i32 0x4))>;
1003510109
def : Pat<(v8f32 (ftrunc VR256X:$src)),
@@ -10048,10 +10122,18 @@ def : Pat<(v8f32 (ftrunc (loadv8f32 addr:$src))),
1004810122

1004910123
def : Pat<(v4f64 (ffloor VR256X:$src)),
1005010124
(VRNDSCALEPDZ256rri VR256X:$src, (i32 0x9))>;
10125+
def : Pat<(v4f64 (vselect VK4WM:$mask, (ffloor VR256X:$src), VR256X:$dst)),
10126+
(VRNDSCALEPDZ256rrik VR256X:$dst, VK4WM:$mask, VR256X:$src, (i32 0x9))>;
10127+
def : Pat<(v4f64 (vselect VK4WM:$mask, (ffloor VR256X:$src), v4f64x_info.ImmAllZerosV)),
10128+
(VRNDSCALEPDZ256rrikz VK4WM:$mask, VR256X:$src, (i32 0x9))>;
1005110129
def : Pat<(v4f64 (fnearbyint VR256X:$src)),
1005210130
(VRNDSCALEPDZ256rri VR256X:$src, (i32 0xC))>;
1005310131
def : Pat<(v4f64 (fceil VR256X:$src)),
1005410132
(VRNDSCALEPDZ256rri VR256X:$src, (i32 0xA))>;
10133+
def : Pat<(v4f64 (vselect VK4WM:$mask, (fceil VR256X:$src), VR256X:$dst)),
10134+
(VRNDSCALEPDZ256rrik VR256X:$dst, VK4WM:$mask, VR256X:$src, (i32 0xA))>;
10135+
def : Pat<(v4f64 (vselect VK4WM:$mask, (fceil VR256X:$src), v4f64x_info.ImmAllZerosV)),
10136+
(VRNDSCALEPDZ256rrikz VK4WM:$mask, VR256X:$src, (i32 0xA))>;
1005510137
def : Pat<(v4f64 (frint VR256X:$src)),
1005610138
(VRNDSCALEPDZ256rri VR256X:$src, (i32 0x4))>;
1005710139
def : Pat<(v4f64 (ftrunc VR256X:$src)),

‎llvm/lib/Target/X86/X86InstrSSE.td

+9
Original file line numberDiff line numberDiff line change
@@ -5944,6 +5944,15 @@ let Predicates = [UseSSE41] in {
59445944
(ROUNDPDm addr:$src, (i32 0xB))>;
59455945
}
59465946

5947+
defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSS", X86Movss,
5948+
v4f32, 0x01, UseSSE41>;
5949+
defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSS", X86Movss,
5950+
v4f32, 0x02, UseSSE41>;
5951+
defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSD", X86Movsd,
5952+
v2f64, 0x01, UseSSE41>;
5953+
defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSD", X86Movsd,
5954+
v2f64, 0x02, UseSSE41>;
5955+
59475956
//===----------------------------------------------------------------------===//
59485957
// SSE4.1 - Packed Bit Test
59495958
//===----------------------------------------------------------------------===//

‎llvm/test/CodeGen/X86/vec_floor.ll

+2,134-1
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)
Please sign in to comment.