-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.cpp |  81
-rw-r--r--  llvm/test/Analysis/CostModel/X86/sitofp.ll     |   6
-rw-r--r--  llvm/test/Analysis/CostModel/X86/sse-itoi.ll   | 353
3 files changed, 435 insertions, 5 deletions
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index cf7a826ea85..d33d5758412 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -528,6 +528,9 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
+  // FIXME: Need a better design of the cost table to handle non-simple types of
+  // potential massive combinations (elem_num x src_type x dst_type).
+
   static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
     { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
@@ -705,7 +708,38 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
     { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 },
   };
 
-  static const TypeConversionCostTblEntry SSE2ConvTbl[] = {
+  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
+    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
+    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
+    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+    { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
+    { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 },
+
+    { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
+    { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 },
+    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
+    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 30 },
+    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
+    { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 },
+    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
+    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 },
+    { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 },
+  };
+
+  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
     // These are somewhat magic numbers justified by looking at the output of
     // Intel's IACA, running some kernels and making sure when we take
     // legalization into account the throughput will be overestimated.
@@ -726,13 +760,42 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
+
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 },
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
+    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 },
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
+    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
+    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
+    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 },
+    { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
+    { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },
+
+    { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 14 },
+    { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 7 },
+    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 },
+    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 31 },
+    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
+    { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 },
+    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
+    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 },
+    { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 },
   };
 
   std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
   std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
 
   if (ST->hasSSE2() && !ST->hasAVX()) {
-    if (const auto *Entry = ConvertCostTableLookup(SSE2ConvTbl, ISD,
+    if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
                                                    LTDest.second, LTSrc.second))
       return LTSrc.first * Entry->Cost;
   }
@@ -770,6 +833,20 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
       return Entry->Cost;
   }
 
+  if (ST->hasSSE41()) {
+    if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
+                                                   DstTy.getSimpleVT(),
+                                                   SrcTy.getSimpleVT()))
+      return Entry->Cost;
+  }
+
+  if (ST->hasSSE2()) {
+    if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
+                                                   DstTy.getSimpleVT(),
+                                                   SrcTy.getSimpleVT()))
+      return Entry->Cost;
+  }
+
   return BaseT::getCastInstrCost(Opcode, Dst, Src);
 }
 
diff --git a/llvm/test/Analysis/CostModel/X86/sitofp.ll b/llvm/test/Analysis/CostModel/X86/sitofp.ll
index 9913a489691..9f0c4065c17 100644
--- a/llvm/test/Analysis/CostModel/X86/sitofp.ll
+++ b/llvm/test/Analysis/CostModel/X86/sitofp.ll
@@ -248,13 +248,13 @@ define <2 x double> @sitofpv2i64v2double(<2 x i64> %a) {
   ; SSE2: cost of 20 {{.*}} sitofp
   ;
   ; AVX1-LABEL: sitofpv2i64v2double
-  ; AVX1: cost of 4 {{.*}} sitofp
+  ; AVX1: cost of 20 {{.*}} sitofp
   ;
   ; AVX2-LABEL: sitofpv2i64v2double
-  ; AVX2: cost of 4 {{.*}} sitofp
+  ; AVX2: cost of 20 {{.*}} sitofp
   ;
   ; AVX512F-LABEL: sitofpv2i64v2double
-  ; AVX512F: cost of 4 {{.*}} sitofp
+  ; AVX512F: cost of 20 {{.*}} sitofp
   %1 = sitofp <2 x i64> %a to <2 x double>
   ret <2 x double> %1
 }
diff --git a/llvm/test/Analysis/CostModel/X86/sse-itoi.ll b/llvm/test/Analysis/CostModel/X86/sse-itoi.ll
new file mode 100644
index 00000000000..6429e4fae09
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/sse-itoi.ll
@@ -0,0 +1,353 @@
+; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE2 %s
+; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse4.1 -cost-model -analyze < %s | FileCheck --check-prefix=SSE41 %s
+
+define void @zext_v16i16_to_v16i32(<16 x i16>* %a) {
+; SSE2: zext_v16i16_to_v16i32
+; SSE2: cost of 6 {{.*}} zext
+;
+; SSE41: zext_v16i16_to_v16i32
+; SSE41: cost of 4 {{.*}} zext
+;
+  %1 = load <16 x i16>, <16 x i16>* %a
+  %2 = zext <16 x i16> %1 to <16 x i32>
+  store <16 x i32> %2, <16 x i32>* undef, align 4
+  ret void
+}
+
+define void @sext_v16i16_to_v16i32(<16 x i16>* %a) {
+; SSE2: sext_v16i16_to_v16i32
+; SSE2: cost of 8 {{.*}} sext
+;
+; SSE41: sext_v16i16_to_v16i32
+; SSE41: cost of 4 {{.*}} sext
+;
+  %1 = load <16 x i16>, <16 x i16>* %a
+  %2 = sext <16 x i16> %1 to <16 x i32>
+  store <16 x i32> %2, <16 x i32>* undef, align 4
+  ret void
+}
+
+define void @zext_v8i16_to_v8i32(<8 x i16>* %a) {
+; SSE2: zext_v8i16_to_v8i32
+; SSE2: cost of 3 {{.*}} zext
+;
+; SSE41: zext_v8i16_to_v8i32
+; SSE41: cost of 2 {{.*}} zext
+;
+  %1 = load <8 x i16>, <8 x i16>* %a
+  %2 = zext <8 x i16> %1 to <8 x i32>
+  store <8 x i32> %2, <8 x i32>* undef, align 4
+  ret void
+}
+
+define void @sext_v8i16_to_v8i32(<8 x i16>* %a) {
+; SSE2: sext_v8i16_to_v8i32
+; SSE2: cost of 4 {{.*}} sext
+;
+; SSE41: sext_v8i16_to_v8i32
+; SSE41: cost of 2 {{.*}} sext
+;
+  %1 = load <8 x i16>, <8 x i16>* %a
+  %2 = sext <8 x i16> %1 to <8 x i32>
+  store <8 x i32> %2, <8 x i32>* undef, align 4
+  ret void
+}
+
+define void @zext_v4i16_to_v4i32(<4 x i16>* %a) {
+; SSE2: zext_v4i16_to_v4i32
+; SSE2: cost of 1 {{.*}} zext
+;
+; SSE41: zext_v4i16_to_v4i32
+; SSE41: cost of 1 {{.*}} zext
+;
+  %1 = load <4 x i16>, <4 x i16>* %a
+  %2 = zext <4 x i16> %1 to <4 x i32>
+  store <4 x i32> %2, <4 x i32>* undef, align 4
+  ret void
+}
+
+define void @sext_v4i16_to_v4i32(<4 x i16>* %a) {
+; SSE2: sext_v4i16_to_v4i32
+; SSE2: cost of 2 {{.*}} sext
+;
+; SSE41: sext_v4i16_to_v4i32
+; SSE41: cost of 1 {{.*}} sext
+;
+  %1 = load <4 x i16>, <4 x i16>* %a
+  %2 = sext <4 x i16> %1 to <4 x i32>
+  store <4 x i32> %2, <4 x i32>* undef, align 4
+  ret void
+}
+
+define void @zext_v16i8_to_v16i32(<16 x i8>* %a) {
+; SSE2: zext_v16i8_to_v16i32
+; SSE2: cost of 9 {{.*}} zext
+;
+; SSE41: zext_v16i8_to_v16i32
+; SSE41: cost of 4 {{.*}} zext
+;
+  %1 = load <16 x i8>, <16 x i8>* %a
+  %2 = zext <16 x i8> %1 to <16 x i32>
+  store <16 x i32> %2, <16 x i32>* undef, align 4
+  ret void
+}
+
+define void @sext_v16i8_to_v16i32(<16 x i8>* %a) {
+; SSE2: sext_v16i8_to_v16i32
+; SSE2: cost of 12 {{.*}} sext
+;
+; SSE41: sext_v16i8_to_v16i32
+; SSE41: cost of 4 {{.*}} sext
+;
+  %1 = load <16 x i8>, <16 x i8>* %a
+  %2 = sext <16 x i8> %1 to <16 x i32>
+  store <16 x i32> %2, <16 x i32>* undef, align 4
+  ret void
+}
+
+define void @zext_v8i8_to_v8i32(<8 x i8>* %a) {
+; SSE2: zext_v8i8_to_v8i32
+; SSE2: cost of 6 {{.*}} zext
+;
+; SSE41: zext_v8i8_to_v8i32
+; SSE41: cost of 2 {{.*}} zext
+;
+  %1 = load <8 x i8>, <8 x i8>* %a
+  %2 = zext <8 x i8> %1 to <8 x i32>
+  store <8 x i32> %2, <8 x i32>* undef, align 4
+  ret void
+}
+
+define void @sext_v8i8_to_v8i32(<8 x i8>* %a) {
+; SSE2: sext_v8i8_to_v8i32
+; SSE2: cost of 6 {{.*}} sext
+;
+; SSE41: sext_v8i8_to_v8i32
+; SSE41: cost of 2 {{.*}} sext
+;
+  %1 = load <8 x i8>, <8 x i8>* %a
+  %2 = sext <8 x i8> %1 to <8 x i32>
+  store <8 x i32> %2, <8 x i32>* undef, align 4
+  ret void
+}
+
+define void @zext_v4i8_to_v4i32(<4 x i8>* %a) {
+; SSE2: zext_v4i8_to_v4i32
+; SSE2: cost of 2 {{.*}} zext
+;
+; SSE41: zext_v4i8_to_v4i32
+; SSE41: cost of 1 {{.*}} zext
+;
+  %1 = load <4 x i8>, <4 x i8>* %a
+  %2 = zext <4 x i8> %1 to <4 x i32>
+  store <4 x i32> %2, <4 x i32>* undef, align 4
+  ret void
+}
+
+define void @sext_v4i8_to_v4i32(<4 x i8>* %a) {
+; SSE2: sext_v4i8_to_v4i32
+; SSE2: cost of 3 {{.*}} sext
+;
+; SSE41: sext_v4i8_to_v4i32
+; SSE41: cost of 1 {{.*}} sext
+;
+  %1 = load <4 x i8>, <4 x i8>* %a
+  %2 = sext <4 x i8> %1 to <4 x i32>
+  store <4 x i32> %2, <4 x i32>* undef, align 4
+  ret void
+}
+
+define void @zext_v16i8_to_v16i16(<16 x i8>* %a) {
+; SSE2: zext_v16i8_to_v16i16
+; SSE2: cost of 3 {{.*}} zext
+;
+; SSE41: zext_v16i8_to_v16i16
+; SSE41: cost of 2 {{.*}} zext
+;
+  %1 = load <16 x i8>, <16 x i8>* %a
+  %2 = zext <16 x i8> %1 to <16 x i16>
+  store <16 x i16> %2, <16 x i16>* undef, align 4
+  ret void
+}
+
+define void @sext_v16i8_to_v16i16(<16 x i8>* %a) {
+; SSE2: sext_v16i8_to_v16i16
+; SSE2: cost of 4 {{.*}} sext
+;
+; SSE41: sext_v16i8_to_v16i16
+; SSE41: cost of 2 {{.*}} sext
+;
+  %1 = load <16 x i8>, <16 x i8>* %a
+  %2 = sext <16 x i8> %1 to <16 x i16>
+  store <16 x i16> %2, <16 x i16>* undef, align 4
+  ret void
+}
+
+define void @zext_v8i8_to_v8i16(<8 x i8>* %a) {
+; SSE2: zext_v8i8_to_v8i16
+; SSE2: cost of 1 {{.*}} zext
+;
+; SSE41: zext_v8i8_to_v8i16
+; SSE41: cost of 1 {{.*}} zext
+;
+  %1 = load <8 x i8>, <8 x i8>* %a
+  %2 = zext <8 x i8> %1 to <8 x i16>
+  store <8 x i16> %2, <8 x i16>* undef, align 4
+  ret void
+}
+
+define void @sext_v8i8_to_v8i16(<8 x i8>* %a) {
+; SSE2: sext_v8i8_to_v8i16
+; SSE2: cost of 2 {{.*}} sext
+;
+; SSE41: sext_v8i8_to_v8i16
+; SSE41: cost of 1 {{.*}} sext
+;
+  %1 = load <8 x i8>, <8 x i8>* %a
+  %2 = sext <8 x i8> %1 to <8 x i16>
+  store <8 x i16> %2, <8 x i16>* undef, align 4
+  ret void
+}
+
+define void @zext_v4i8_to_v4i16(<4 x i8>* %a) {
+; SSE2: zext_v4i8_to_v4i16
+; SSE2: cost of 1 {{.*}} zext
+;
+; SSE41: zext_v4i8_to_v4i16
+; SSE41: cost of 1 {{.*}} zext
+;
+  %1 = load <4 x i8>, <4 x i8>* %a
+  %2 = zext <4 x i8> %1 to <4 x i16>
+  store <4 x i16> %2, <4 x i16>* undef, align 4
+  ret void
+}
+
+define void @sext_v4i8_to_v4i16(<4 x i8>* %a) {
+; SSE2: sext_v4i8_to_v4i16
+; SSE2: cost of 6 {{.*}} sext
+;
+; SSE41: sext_v4i8_to_v4i16
+; SSE41: cost of 2 {{.*}} sext
+;
+  %1 = load <4 x i8>, <4 x i8>* %a
+  %2 = sext <4 x i8> %1 to <4 x i16>
+  store <4 x i16> %2, <4 x i16>* undef, align 4
+  ret void
+}
+
+define void @truncate_v16i32_to_v16i16(<16 x i32>* %a) {
+; SSE2: truncate_v16i32_to_v16i16
+; SSE2: cost of 14 {{.*}} trunc
+;
+; SSE41: truncate_v16i32_to_v16i16
+; SSE41: cost of 6 {{.*}} trunc
+;
+  %1 = load <16 x i32>, <16 x i32>* %a
+  %2 = trunc <16 x i32> %1 to <16 x i16>
+  store <16 x i16> %2, <16 x i16>* undef, align 4
+  ret void
+}
+
+define void @truncate_v8i32_to_v8i16(<8 x i32>* %a) {
+; SSE2: truncate_v8i32_to_v8i16
+; SSE2: cost of 7 {{.*}} trunc
+;
+; SSE41: truncate_v8i32_to_v8i16
+; SSE41: cost of 3 {{.*}} trunc
+;
+  %1 = load <8 x i32>, <8 x i32>* %a
+  %2 = trunc <8 x i32> %1 to <8 x i16>
+  store <8 x i16> %2, <8 x i16>* undef, align 4
+  ret void
+}
+
+define void @truncate_v4i32_to_v4i16(<4 x i32>* %a) {
+; SSE2: truncate_v4i32_to_v4i16
+; SSE2: cost of 3 {{.*}} trunc
+;
+; SSE41: truncate_v4i32_to_v4i16
+; SSE41: cost of 1 {{.*}} trunc
+;
+  %1 = load <4 x i32>, <4 x i32>* %a
+  %2 = trunc <4 x i32> %1 to <4 x i16>
+  store <4 x i16> %2, <4 x i16>* undef, align 4
+  ret void
+}
+
+define void @truncate_v16i32_to_v16i8(<16 x i32>* %a) {
+; SSE2: truncate_v16i32_to_v16i8
+; SSE2: cost of 31 {{.*}} trunc
+;
+; SSE41: truncate_v16i32_to_v16i8
+; SSE41: cost of 30 {{.*}} trunc
+;
+  %1 = load <16 x i32>, <16 x i32>* %a
+  %2 = trunc <16 x i32> %1 to <16 x i8>
+  store <16 x i8> %2, <16 x i8>* undef, align 4
+  ret void
+}
+
+define void @truncate_v8i32_to_v8i8(<8 x i32>* %a) {
+; SSE2: truncate_v8i32_to_v8i8
+; SSE2: cost of 4 {{.*}} trunc
+;
+; SSE41: truncate_v8i32_to_v8i8
+; SSE41: cost of 3 {{.*}} trunc
+;
+  %1 = load <8 x i32>, <8 x i32>* %a
+  %2 = trunc <8 x i32> %1 to <8 x i8>
+  store <8 x i8> %2, <8 x i8>* undef, align 4
+  ret void
+}
+
+define void @truncate_v4i32_to_v4i8(<4 x i32>* %a) {
+; SSE2: truncate_v4i32_to_v4i8
+; SSE2: cost of 3 {{.*}} trunc
+;
+; SSE41: truncate_v4i32_to_v4i8
+; SSE41: cost of 1 {{.*}} trunc
+;
+  %1 = load <4 x i32>, <4 x i32>* %a
+  %2 = trunc <4 x i32> %1 to <4 x i8>
+  store <4 x i8> %2, <4 x i8>* undef, align 4
+  ret void
+}
+
+define void @truncate_v16i16_to_v16i8(<16 x i16>* %a) {
+; SSE2: truncate_v16i16_to_v16i8
+; SSE2: cost of 3 {{.*}} trunc
+;
+; SSE41: truncate_v16i16_to_v16i8
+; SSE41: cost of 3 {{.*}} trunc
+;
+  %1 = load <16 x i16>, <16 x i16>* %a
+  %2 = trunc <16 x i16> %1 to <16 x i8>
+  store <16 x i8> %2, <16 x i8>* undef, align 4
+  ret void
+}
+
+define void @truncate_v8i16_to_v8i8(<8 x i16>* %a) {
+; SSE2: truncate_v8i16_to_v8i8
+; SSE2: cost of 2 {{.*}} trunc
+;
+; SSE41: truncate_v8i16_to_v8i8
+; SSE41: cost of 1 {{.*}} trunc
+;
+  %1 = load <8 x i16>, <8 x i16>* %a
+  %2 = trunc <8 x i16> %1 to <8 x i8>
+  store <8 x i8> %2, <8 x i8>* undef, align 4
+  ret void
+}
+
+define void @truncate_v4i16_to_v4i8(<4 x i16>* %a) {
+; SSE2: truncate_v4i16_to_v4i8
+; SSE2: cost of 4 {{.*}} trunc
+;
+; SSE41: truncate_v4i16_to_v4i8
+; SSE41: cost of 2 {{.*}} trunc
+;
+  %1 = load <4 x i16>, <4 x i16>* %a
+  %2 = trunc <4 x i16> %1 to <4 x i8>
+  store <4 x i8> %2, <4 x i8>* undef, align 4
+  ret void
+}
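
Note on the lookup pattern: every table added above is keyed on the (ISD opcode, destination MVT, source MVT) triple, scanned per ISA level, with a miss falling through to the next feature check and ultimately to BaseT::getCastInstrCost. The sketch below is a simplified, self-contained approximation of that mechanism; the struct layout, the linear scan, and the ISD_/MVT_ enumerators are illustrative stand-ins, not the real definitions from llvm/include/llvm/CodeGen/CostTable.h.

// Minimal sketch of the cost-table lookup pattern used by the patch.
// Assumption: TypeConversionCostTblEntry / ConvertCostTableLookup here are
// simplified stand-ins for LLVM's real definitions, and the ISD_/MVT_ enums
// below are hypothetical placeholders for the real enumerators.
#include <cstddef>

struct TypeConversionCostTblEntry {
  int ISD;   // conversion opcode, e.g. ISD::ZERO_EXTEND
  int Dst;   // simple destination type, e.g. MVT::v8i32
  int Src;   // simple source type, e.g. MVT::v8i16
  int Cost;  // estimated cost of the legalized conversion
};

// Linear scan for a matching (opcode, dst, src) triple; returns nullptr on a
// miss, which tells the caller to try the next table or the base estimate.
template <size_t N>
const TypeConversionCostTblEntry *
ConvertCostTableLookup(const TypeConversionCostTblEntry (&Tbl)[N], int ISD,
                       int Dst, int Src) {
  for (const auto &Entry : Tbl)
    if (Entry.ISD == ISD && Entry.Dst == Dst && Entry.Src == Src)
      return &Entry;
  return nullptr;
}

// Hypothetical enumerator values, for illustration only.
enum { ISD_ZERO_EXTEND = 1 };
enum { MVT_v8i16 = 10, MVT_v8i32 = 11 };

static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
    {ISD_ZERO_EXTEND, MVT_v8i32, MVT_v8i16, 2},  // zext v8i16 -> v8i32
};

int main() {
  const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl,
                                             ISD_ZERO_EXTEND, MVT_v8i32,
                                             MVT_v8i16);
  return Entry ? Entry->Cost : -1;  // 2, matching zext_v8i16_to_v8i32 above
}

The flat (opcode x dst x src) keying is what the FIXME in the patch calls out as non-scalable. The new sse-itoi.ll test observes the resulting costs from the outside: each RUN line invokes opt -cost-model -analyze with a different -mattr so FileCheck can compare the reported numbers against the SSE2 and SSE41 tables.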