@@ -50,6 +50,8 @@ using namespace llvm;
50
50
51
51
#define DEBUG_TYPE " x86tti"
52
52
53
+ extern cl::opt<bool > ExperimentalVectorWideningLegalization;
54
+
53
55
// ===----------------------------------------------------------------------===//
54
56
//
55
57
// X86 cost model.
@@ -918,7 +920,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
918
920
// FIXME: We can use permq for 64-bit or larger extracts from 256-bit
919
921
// vectors.
920
922
int OrigSubElts = SubTp->getVectorNumElements ();
921
- if (NumSubElts > OrigSubElts &&
923
+ if (ExperimentalVectorWideningLegalization &&
924
+ NumSubElts > OrigSubElts &&
922
925
(Index % OrigSubElts) == 0 && (NumSubElts % OrigSubElts) == 0 &&
923
926
LT.second .getVectorElementType () ==
924
927
SubLT.second .getVectorElementType () &&
@@ -1330,6 +1333,12 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1330
1333
// TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1331
1334
// 256-bit wide vectors.
1332
1335
1336
+ // Consulted ahead of AVX512FConversionTbl, and only under widening legalization.
1337
+ static const TypeConversionCostTblEntry AVX512FConversionTblWide[] = {
1338
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
1339
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
1340
+ };
1341
+
1333
1342
static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1334
1343
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
1335
1344
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
@@ -1347,8 +1356,6 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1347
1356
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
1348
1357
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
1349
1358
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
1350
- { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
1351
- { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
1352
1359
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
1353
1360
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
1354
1361
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
@@ -1401,19 +1408,28 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1401
1408
{ ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 2 },
1402
1409
};
1403
1410
1411
+ static const TypeConversionCostTblEntry AVX2ConversionTblWide[] = {
1412
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
1413
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
1414
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
1415
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
1416
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
1417
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
1418
+ };
1419
+
1404
1420
static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1405
1421
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
1406
1422
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
1407
1423
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
1408
1424
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
1409
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
1410
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
1411
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
1412
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
1425
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
1426
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
1427
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
1428
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
1413
1429
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
1414
1430
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
1415
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
1416
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
1431
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
1432
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
1417
1433
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
1418
1434
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
1419
1435
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
@@ -1432,18 +1448,24 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1432
1448
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
1433
1449
};
1434
1450
1451
+ static const TypeConversionCostTblEntry AVXConversionTblWide[] = {
1452
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
1453
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
1454
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 },
1455
+ };
1456
+
1435
1457
static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1436
1458
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
1437
1459
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
1438
1460
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
1439
1461
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
1440
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
1462
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 },
1441
1463
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
1442
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
1464
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 },
1443
1465
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
1444
1466
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
1445
1467
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
1446
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 },
1468
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 },
1447
1469
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
1448
1470
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
1449
1471
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
@@ -1642,18 +1664,35 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1642
1664
SimpleDstTy, SimpleSrcTy))
1643
1665
return Entry->Cost ;
1644
1666
1667
+ if (ST->hasAVX512 () && ExperimentalVectorWideningLegalization)
1668
+ if (const auto *Entry = ConvertCostTableLookup (AVX512FConversionTblWide, ISD,
1669
+ SimpleDstTy, SimpleSrcTy))
1670
+ return Entry->Cost ;
1671
+
1645
1672
if (ST->hasAVX512 ())
1646
1673
if (const auto *Entry = ConvertCostTableLookup (AVX512FConversionTbl, ISD,
1647
1674
SimpleDstTy, SimpleSrcTy))
1648
1675
return Entry->Cost ;
1649
1676
}
1650
1677
1678
+ if (ST->hasAVX2 () && ExperimentalVectorWideningLegalization) {
1679
+ if (const auto *Entry = ConvertCostTableLookup (AVX2ConversionTblWide, ISD,
1680
+ SimpleDstTy, SimpleSrcTy))
1681
+ return Entry->Cost ;
1682
+ }
1683
+
1651
1684
if (ST->hasAVX2 ()) {
1652
1685
if (const auto *Entry = ConvertCostTableLookup (AVX2ConversionTbl, ISD,
1653
1686
SimpleDstTy, SimpleSrcTy))
1654
1687
return Entry->Cost ;
1655
1688
}
1656
1689
1690
+ if (ST->hasAVX () && ExperimentalVectorWideningLegalization) {
1691
+ if (const auto *Entry = ConvertCostTableLookup (AVXConversionTblWide, ISD,
1692
+ SimpleDstTy, SimpleSrcTy))
1693
+ return Entry->Cost ;
1694
+ }
1695
+
1657
1696
if (ST->hasAVX ()) {
1658
1697
if (const auto *Entry = ConvertCostTableLookup (AVXConversionTbl, ISD,
1659
1698
SimpleDstTy, SimpleSrcTy))
@@ -2520,7 +2559,7 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
2520
2559
// in the table.
2521
2560
// FIXME: Is there a better way to do this?
2522
2561
EVT VT = TLI->getValueType (DL, ValTy);
2523
- if (VT.isSimple ()) {
2562
+ if (VT.isSimple () && ExperimentalVectorWideningLegalization ) {
2524
2563
MVT MTy = VT.getSimpleVT ();
2525
2564
if (IsPairwise) {
2526
2565
if (ST->hasAVX ())
0 commit comments