diff options
| author | Mikhail Dvoretckii <mikhail.dvoretckii@intel.com> | 2018-06-19 10:49:12 +0000 |
|---|---|---|
| committer | Mikhail Dvoretckii <mikhail.dvoretckii@intel.com> | 2018-06-19 10:49:12 +0000 |
| commit | 8393f907176c6759de726189ed79b37dec24fc0e (patch) | |
| tree | 7adfaa5a6e52cebfdd46d695760652122c87a662 /llvm/test | |
| parent | 6780b5f97d28ff1f0266fc3780a35396db286e7a (diff) | |
| download | bcm5719-llvm-8393f907176c6759de726189ed79b37dec24fc0e.tar.gz bcm5719-llvm-8393f907176c6759de726189ed79b37dec24fc0e.zip | |
[InstCombine] Replacing X86-specific rounding intrinsics with generic floor-ceil
This patch replaces calls to X86-specific rounding intrinsics that have floor or
ceil semantics with calls to the target-independent @llvm.floor.* and
@llvm.ceil.* intrinsics. This does not affect the resulting machine code, as
those intrinsics are lowered to the same instructions, but it exposes these
specific rounding cases to generic optimizations.
Differential Revision: https://reviews.llvm.org/D48067
llvm-svn: 335039
Diffstat (limited to 'llvm/test')
| -rw-r--r-- | llvm/test/Transforms/InstCombine/X86/x86-avx.ll | 41 | ||||
| -rw-r--r-- | llvm/test/Transforms/InstCombine/X86/x86-avx512.ll | 207 | ||||
| -rw-r--r-- | llvm/test/Transforms/InstCombine/X86/x86-sse41.ll | 44 |
3 files changed, 292 insertions, 0 deletions
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-avx.ll b/llvm/test/Transforms/InstCombine/X86/x86-avx.ll new file mode 100644 index 00000000000..bad27d1e0c4 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/X86/x86-avx.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instcombine -S | FileCheck %s + +declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) +declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) + +define <8 x float> @test_round_ps_floor(<8 x float> %a) { +; CHECK-LABEL: @test_round_ps_floor( +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A:%.*]]) +; CHECK-NEXT: ret <8 x float> [[TMP1]] +; + %1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a, i32 1) + ret <8 x float> %1 +} + +define <8 x float> @test_round_ps_ceil(<8 x float> %a) { +; CHECK-LABEL: @test_round_ps_ceil( +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A:%.*]]) +; CHECK-NEXT: ret <8 x float> [[TMP1]] +; + %1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a, i32 2) + ret <8 x float> %1 +} + +define <4 x double> @test_round_pd_floor(<4 x double> %a) { +; CHECK-LABEL: @test_round_pd_floor( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[A:%.*]]) +; CHECK-NEXT: ret <4 x double> [[TMP1]] +; + %1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a, i32 1) + ret <4 x double> %1 +} + +define <4 x double> @test_round_pd_ceil(<4 x double> %a) { +; CHECK-LABEL: @test_round_pd_ceil( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[A:%.*]]) +; CHECK-NEXT: ret <4 x double> [[TMP1]] +; + %1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a, i32 2) + ret <4 x double> %1 +} diff --git a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll index 4a5ae932ca1..32612ba36f1 100644 --- 
a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll @@ -916,6 +916,213 @@ declare i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double>, i32) declare i32 @llvm.x86.avx512.cvttsd2usi(<2 x double>, i32) declare i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double>, i32) +declare <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32, i32) +declare <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32, i32) +declare <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float>, i32, <4 x float>, i8) +declare <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float>, i32, <8 x float>, i8) +declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <16 x float>, i16, i32) +declare <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double>, i32, <2 x double>, i8) +declare <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double>, i32, <4 x double>, i8) +declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32) + +define <4 x float> @test_rndscale_ss_floor(<4 x float> %src0, <4 x float> %src1, <4 x float> %dst, i8 %k) { +; CHECK-LABEL: @test_rndscale_ss_floor( +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[K:%.*]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[SRC1:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.floor.f32(float [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[DST:%.*]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP2]], float [[TMP5]], float [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[SRC0:%.*]], float [[TMP6]], i64 0 +; CHECK-NEXT: ret <4 x float> [[TMP7]] +; + %1 = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %src0, <4 x float> %src1, <4 x float> %dst, i8 %k, i32 1, i32 4) + ret <4 x float> %1 +} + +define <4 x float> 
@test_rndscale_ss_ceil(<4 x float> %src0, <4 x float> %src1, <4 x float> %dst, i8 %k) { +; CHECK-LABEL: @test_rndscale_ss_ceil( +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[K:%.*]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[SRC1:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.ceil.f32(float [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[DST:%.*]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP2]], float [[TMP5]], float [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[SRC0:%.*]], float [[TMP6]], i64 0 +; CHECK-NEXT: ret <4 x float> [[TMP7]] +; + %1 = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %src0, <4 x float> %src1, <4 x float> %dst, i8 %k, i32 2, i32 4) + ret <4 x float> %1 +} + +define <2 x double> @test_rndscale_sd_floor(<2 x double> %src0, <2 x double> %src1, <2 x double> %dst, i8 %k) { +; CHECK-LABEL: @test_rndscale_sd_floor( +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[K:%.*]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[SRC1:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.floor.f64(double [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[DST:%.*]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP2]], double [[TMP5]], double [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[SRC0:%.*]], double [[TMP6]], i64 0 +; CHECK-NEXT: ret <2 x double> [[TMP7]] +; + %1 = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %src0, <2 x double> %src1, <2 x double> %dst, i8 %k, i32 1, i32 4) + ret <2 x double> %1 +} + +define <2 x double> @test_rndscale_sd_ceil(<2 x double> %src0, <2 x double> %src1, <2 x double> %dst, i8 %k) { +; CHECK-LABEL: @test_rndscale_sd_ceil( +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[K:%.*]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = 
extractelement <2 x double> [[SRC1:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.ceil.f64(double [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[DST:%.*]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP2]], double [[TMP5]], double [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[SRC0:%.*]], double [[TMP6]], i64 0 +; CHECK-NEXT: ret <2 x double> [[TMP7]] +; + %1 = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %src0, <2 x double> %src1, <2 x double> %dst, i8 %k, i32 2, i32 4) + ret <2 x double> %1 +} + +define <4 x float> @test_rndscale_ps_128_floor(<4 x float> %src, <4 x float> %dst, i8 %k) { +; CHECK-LABEL: @test_rndscale_ps_128_floor( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[SRC:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP1]], <4 x float> [[DST:%.*]] +; CHECK-NEXT: ret <4 x float> [[TMP4]] +; + %1 = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %src, i32 1, <4 x float> %dst, i8 %k) + ret <4 x float> %1 +} + +define <4 x float> @test_rndscale_ps_128_ceil(<4 x float> %src, <4 x float> %dst, i8 %k) { +; CHECK-LABEL: @test_rndscale_ps_128_ceil( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[SRC:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP1]], <4 x float> [[DST:%.*]] +; CHECK-NEXT: ret <4 x float> [[TMP4]] +; + %1 = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %src, i32 2, <4 x float> %dst, i8 %k) + ret <4 x float> %1 +} + +define <8 x float> 
@test_rndscale_ps_256_floor(<8 x float> %src, <8 x float> %dst, i8 %k) { +; CHECK-LABEL: @test_rndscale_ps_256_floor( +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[SRC:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[DST:%.*]] +; CHECK-NEXT: ret <8 x float> [[TMP3]] +; + %1 = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %src, i32 1, <8 x float> %dst, i8 %k) + ret <8 x float> %1 +} + +define <8 x float> @test_rndscale_ps_256_ceil(<8 x float> %src, <8 x float> %dst, i8 %k) { +; CHECK-LABEL: @test_rndscale_ps_256_ceil( +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[SRC:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[DST:%.*]] +; CHECK-NEXT: ret <8 x float> [[TMP3]] +; + %1 = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %src, i32 2, <8 x float> %dst, i8 %k) + ret <8 x float> %1 +} + +define <16 x float> @test_rndscale_ps_512_floor(<16 x float> %src, <16 x float> %dst, i16 %k) { +; CHECK-LABEL: @test_rndscale_ps_512_floor( +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.floor.v16f32(<16 x float> [[SRC:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[K:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[DST:%.*]] +; CHECK-NEXT: ret <16 x float> [[TMP3]] +; + %1 = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %src, i32 1, <16 x float> %dst, i16 %k, i32 4) + ret <16 x float> %1 +} + +define <16 x float> @test_rndscale_ps_512_ceil(<16 x float> %src, <16 x float> %dst, i16 %k) { +; CHECK-LABEL: @test_rndscale_ps_512_ceil( +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.ceil.v16f32(<16 x float> [[SRC:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 
[[K:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[DST:%.*]] +; CHECK-NEXT: ret <16 x float> [[TMP3]] +; + %1 = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %src, i32 2, <16 x float> %dst, i16 %k, i32 4) + ret <16 x float> %1 +} + +define <2 x double> @test_rndscale_pd_128_floor(<2 x double> %src, <2 x double> %dst, i8 %k) { +; CHECK-LABEL: @test_rndscale_pd_128_floor( +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[SRC:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <2 x i32> <i32 0, i32 1> +; CHECK-NEXT: [[TMP4:%.*]] = select <2 x i1> [[TMP3]], <2 x double> [[TMP1]], <2 x double> [[DST:%.*]] +; CHECK-NEXT: ret <2 x double> [[TMP4]] +; + %1 = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %src, i32 1, <2 x double> %dst, i8 %k) + ret <2 x double> %1 +} + +define <2 x double> @test_rndscale_pd_128_ceil(<2 x double> %src, <2 x double> %dst, i8 %k) { +; CHECK-LABEL: @test_rndscale_pd_128_ceil( +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[SRC:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <2 x i32> <i32 0, i32 1> +; CHECK-NEXT: [[TMP4:%.*]] = select <2 x i1> [[TMP3]], <2 x double> [[TMP1]], <2 x double> [[DST:%.*]] +; CHECK-NEXT: ret <2 x double> [[TMP4]] +; + %1 = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %src, i32 2, <2 x double> %dst, i8 %k) + ret <2 x double> %1 +} + +define <4 x double> @test_rndscale_pd_256_floor(<4 x double> %src, <4 x double> %dst, i8 %k) { +; CHECK-LABEL: @test_rndscale_pd_256_floor( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[SRC:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1> +; 
CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x double> [[TMP1]], <4 x double> [[DST:%.*]] +; CHECK-NEXT: ret <4 x double> [[TMP4]] +; + %1 = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %src, i32 1, <4 x double> %dst, i8 %k) + ret <4 x double> %1 +} + +define <4 x double> @test_rndscale_pd_256_ceil(<4 x double> %src, <4 x double> %dst, i8 %k) { +; CHECK-LABEL: @test_rndscale_pd_256_ceil( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[SRC:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x double> [[TMP1]], <4 x double> [[DST:%.*]] +; CHECK-NEXT: ret <4 x double> [[TMP4]] +; + %1 = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %src, i32 2, <4 x double> %dst, i8 %k) + ret <4 x double> %1 +} + +define <8 x double> @test_rndscale_pd_512_floor(<8 x double> %src, <8 x double> %dst, i8 %k) { +; CHECK-LABEL: @test_rndscale_pd_512_floor( +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.floor.v8f64(<8 x double> [[SRC:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[DST:%.*]] +; CHECK-NEXT: ret <8 x double> [[TMP3]] +; + %1 = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %src, i32 1, <8 x double> %dst, i8 %k, i32 4) + ret <8 x double> %1 +} + +define <8 x double> @test_rndscale_pd_512_ceil(<8 x double> %src, <8 x double> %dst, i8 %k) { +; CHECK-LABEL: @test_rndscale_pd_512_ceil( +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.ceil.v8f64(<8 x double> [[SRC:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1> +; 
CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[DST:%.*]] +; CHECK-NEXT: ret <8 x double> [[TMP3]] +; + %1 = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %src, i32 2, <8 x double> %dst, i8 %k, i32 4) + ret <8 x double> %1 +} + declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) define <4 x float> @test_mask_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { diff --git a/llvm/test/Transforms/InstCombine/X86/x86-sse41.ll b/llvm/test/Transforms/InstCombine/X86/x86-sse41.ll index f95b1b4d552..ddc3b7372ea 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-sse41.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-sse41.ll @@ -13,6 +13,28 @@ define <2 x double> @test_round_sd(<2 x double> %a, <2 x double> %b) { ret <2 x double> %3 } +define <2 x double> @test_round_sd_floor(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: @test_round_sd_floor( +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = call double @llvm.floor.f64(double [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[A:%.*]], double [[TMP2]], i64 0 +; CHECK-NEXT: ret <2 x double> [[TMP3]] +; + %1 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a, <2 x double> %b, i32 1) + ret <2 x double> %1 +} + +define <2 x double> @test_round_sd_ceil(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: @test_round_sd_ceil( +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = call double @llvm.ceil.f64(double [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[A:%.*]], double [[TMP2]], i64 0 +; CHECK-NEXT: ret <2 x double> [[TMP3]] +; + %1 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a, <2 x double> %b, i32 2) + ret <2 x double> %1 +} + define double @test_round_sd_0(double %a, double %b) { ; CHECK-LABEL: 
@test_round_sd_0( ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[B:%.*]], i32 0 @@ -57,6 +79,28 @@ define <4 x float> @test_round_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %7 } +define <4 x float> @test_round_ss_floor(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: @test_round_ss_floor( +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.floor.f32(float [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[A:%.*]], float [[TMP2]], i64 0 +; CHECK-NEXT: ret <4 x float> [[TMP3]] +; + %1 = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a, <4 x float> %b, i32 1) + ret <4 x float> %1 +} + +define <4 x float> @test_round_ss_ceil(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: @test_round_ss_ceil( +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.ceil.f32(float [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[A:%.*]], float [[TMP2]], i64 0 +; CHECK-NEXT: ret <4 x float> [[TMP3]] +; + %1 = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a, <4 x float> %b, i32 2) + ret <4 x float> %1 +} + define float @test_round_ss_0(float %a, float %b) { ; CHECK-LABEL: @test_round_ss_0( ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0 |

