diff options
author | Roman Lebedev <lebedev.ri@gmail.com> | 2019-07-30 08:00:49 +0000 |
---|---|---|
committer | Roman Lebedev <lebedev.ri@gmail.com> | 2019-07-30 08:00:49 +0000 |
commit | c197732e39931e1da6ed3c16b216b3afeaffdc15 (patch) | |
tree | e6bd33e0dfe740c54065212e1c58b519d56630e7 | |
parent | 58aa6a87a61970a43102111d68890bb10cb80845 (diff) | |
download | bcm5719-llvm-c197732e39931e1da6ed3c16b216b3afeaffdc15.tar.gz bcm5719-llvm-c197732e39931e1da6ed3c16b216b3afeaffdc15.zip |
[NFC][X86][AArch64] Revisit test coverage for X s% C == 0 fold - add tests for negative divisors, INT_MIN divisors
As discussed in the review, this fold is only valid for positive
divisors. Negative divisors can be handled by negating them first,
but INT_MIN cannot be negated, so it has to be special-cased.
llvm-svn: 367294
-rw-r--r-- | llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll | 270 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll | 107 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/srem-seteq.ll | 81 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/urem-seteq-optsize.ll | 3 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll | 238 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll | 107 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/urem-seteq.ll | 79 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll | 431 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll | 357 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/srem-seteq.ll | 133 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll | 306 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll | 261 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/urem-seteq.ll | 117 |
13 files changed, 2126 insertions, 364 deletions
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll index 6a9a9225183..20c5efcf7f3 100644 --- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s -; At the moment, BuildSREMEqFold does not handle nonsplat vectors. - ; Odd+Even divisors define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_odd_even: @@ -425,115 +423,103 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind { ret <4 x i32> %ret } -;==============================================================================; +;------------------------------------------------------------------------------; -; One all-ones divisor and power-of-two divisor divisor in odd divisor -define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_srem_odd_allones_and_poweroftwo: +; One INT_MIN divisor in odd divisor +define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_INT_MIN: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI13_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_0] ; CHECK-NEXT: adrp x8, .LCPI13_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_1] ; CHECK-NEXT: adrp x8, .LCPI13_2 +; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI13_2] ; CHECK-NEXT: adrp x8, .LCPI13_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI13_3] -; CHECK-NEXT: adrp x8, .LCPI13_4 ; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_4] +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_3] ; CHECK-NEXT: 
neg v3.4s, v3.4s ; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: usra v3.4s, v1.4s, #31 +; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 5> + %srem = srem <4 x i32> %X, <i32 5, i32 5, i32 2147483648, i32 5> %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; One all-ones divisor and power-of-two divisor divisor in even divisor -define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_srem_even_allones_and_poweroftwo: +; One INT_MIN divisor in even divisor +define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_even_INT_MIN: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI14_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0] ; CHECK-NEXT: adrp x8, .LCPI14_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_1] ; CHECK-NEXT: adrp x8, .LCPI14_2 +; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI14_2] ; CHECK-NEXT: adrp x8, .LCPI14_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI14_3] -; CHECK-NEXT: adrp x8, .LCPI14_4 ; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_4] +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_3] ; CHECK-NEXT: neg v3.4s, v3.4s ; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; 
CHECK-NEXT: usra v3.4s, v1.4s, #31 +; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %srem = srem <4 x i32> %X, <i32 14, i32 4294967295, i32 16, i32 14> + %srem = srem <4 x i32> %X, <i32 14, i32 14, i32 2147483648, i32 14> %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; One all-ones divisor and power-of-two divisor divisor in odd+even divisor -define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_srem_odd_even_allones_and_poweroftwo: +; One INT_MIN divisor in odd+even divisor +define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_even_INT_MIN: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI15_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0] ; CHECK-NEXT: adrp x8, .LCPI15_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_1] ; CHECK-NEXT: adrp x8, .LCPI15_2 +; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI15_2] ; CHECK-NEXT: adrp x8, .LCPI15_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI15_3] -; CHECK-NEXT: adrp x8, .LCPI15_4 ; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_4] +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_3] ; CHECK-NEXT: neg v3.4s, v3.4s ; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: usra v3.4s, v1.4s, #31 +; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: 
ret - %srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 100> + %srem = srem <4 x i32> %X, <i32 5, i32 14, i32 2147483648, i32 100> %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -;------------------------------------------------------------------------------; +;==============================================================================; -; One all-ones divisor and one one divisor in odd divisor -define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_srem_odd_allones_and_one: +; One all-ones divisor and power-of-two divisor divisor in odd divisor +define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_allones_and_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI16_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] @@ -559,15 +545,15 @@ define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 1, i32 5> + %srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 5> %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; One all-ones divisor and one one divisor in even divisor -define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_srem_even_allones_and_one: +; One all-ones divisor and power-of-two divisor divisor in even divisor +define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_even_allones_and_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI17_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0] @@ -593,15 +579,15 @@ define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, 
v1.16b ; CHECK-NEXT: ret - %srem = srem <4 x i32> %X, <i32 14, i32 4294967295, i32 1, i32 14> + %srem = srem <4 x i32> %X, <i32 14, i32 4294967295, i32 16, i32 14> %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; One all-ones divisor and one one divisor in odd+even divisor -define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_srem_odd_even_allones_and_one: +; One all-ones divisor and power-of-two divisor divisor in odd+even divisor +define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_even_allones_and_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI18_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0] @@ -627,7 +613,7 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 1, i32 100> + %srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 100> %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret @@ -635,9 +621,9 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind { ;------------------------------------------------------------------------------; -; One power-of-two divisor divisor and one divisor in odd divisor -define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_srem_odd_poweroftwo_and_one: +; One all-ones divisor and one one divisor in odd divisor +define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_allones_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI19_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_0] @@ -663,15 +649,15 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { ; 
CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %srem = srem <4 x i32> %X, <i32 5, i32 16, i32 1, i32 5> + %srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 1, i32 5> %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; One power-of-two divisor divisor and one divisor in even divisor -define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_srem_even_poweroftwo_and_one: +; One all-ones divisor and one one divisor in even divisor +define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_even_allones_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI20_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0] @@ -679,31 +665,33 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_1] ; CHECK-NEXT: adrp x8, .LCPI20_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI20_2] +; CHECK-NEXT: adrp x8, .LCPI20_3 ; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s ; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: adrp x8, .LCPI20_3 ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI20_3] -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: add v1.4s, v1.4s, v0.4s -; CHECK-NEXT: sshl v2.4s, v1.4s, v2.4s +; CHECK-NEXT: adrp x8, .LCPI20_4 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s ; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v3.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s +; CHECK-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %srem = srem <4 x i32> %X, <i32 
14, i32 16, i32 1, i32 14> + %srem = srem <4 x i32> %X, <i32 14, i32 4294967295, i32 1, i32 14> %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; One power-of-two divisor divisor and one divisor in odd+even divisor -define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_srem_odd_even_poweroftwo_and_one: +; One all-ones divisor and one one divisor in odd+even divisor +define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_even_allones_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI21_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_0] @@ -729,7 +717,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %srem = srem <4 x i32> %X, <i32 5, i32 16, i32 1, i32 100> + %srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 1, i32 100> %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret @@ -737,21 +725,123 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ;------------------------------------------------------------------------------; -define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: +; One power-of-two divisor divisor and one divisor in odd divisor +define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_poweroftwo_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI22_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_0] ; CHECK-NEXT: adrp x8, .LCPI22_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_1] ; CHECK-NEXT: adrp x8, .LCPI22_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI22_2] +; CHECK-NEXT: adrp x8, .LCPI22_3 ; CHECK-NEXT: smull2 v4.2d, 
v0.4s, v1.4s ; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI22_2] -; CHECK-NEXT: adrp x8, .LCPI22_3 +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI22_3] +; CHECK-NEXT: adrp x8, .LCPI22_4 ; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_3] +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s +; CHECK-NEXT: ushr v1.4s, v1.4s, #31 +; CHECK-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, <i32 5, i32 16, i32 1, i32 5> + %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One power-of-two divisor divisor and one divisor in even divisor +define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_even_poweroftwo_and_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI23_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0] +; CHECK-NEXT: adrp x8, .LCPI23_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_1] +; CHECK-NEXT: adrp x8, .LCPI23_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI23_2] +; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: adrp x8, .LCPI23_3 +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI23_3] +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: add v1.4s, v1.4s, v0.4s +; CHECK-NEXT: sshl v2.4s, v1.4s, v2.4s +; CHECK-NEXT: ushr v1.4s, v1.4s, #31 +; CHECK-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x 
i32> %X, <i32 14, i32 16, i32 1, i32 14> + %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One power-of-two divisor divisor and one divisor in odd+even divisor +define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_even_poweroftwo_and_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI24_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_0] +; CHECK-NEXT: adrp x8, .LCPI24_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_1] +; CHECK-NEXT: adrp x8, .LCPI24_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI24_2] +; CHECK-NEXT: adrp x8, .LCPI24_3 +; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI24_3] +; CHECK-NEXT: adrp x8, .LCPI24_4 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s +; CHECK-NEXT: ushr v1.4s, v1.4s, #31 +; CHECK-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, <i32 5, i32 16, i32 1, i32 100> + %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;------------------------------------------------------------------------------; + +define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI25_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_0] +; CHECK-NEXT: adrp x8, .LCPI25_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI25_1] +; CHECK-NEXT: adrp x8, .LCPI25_2 +; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s +; 
CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI25_2] +; CHECK-NEXT: adrp x8, .LCPI25_3 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI25_3] ; CHECK-NEXT: neg v4.4s, v4.4s ; CHECK-NEXT: movi v3.2d, #0x000000ffffffff ; CHECK-NEXT: sshl v4.4s, v1.4s, v4.4s @@ -772,18 +862,18 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI23_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0] -; CHECK-NEXT: adrp x8, .LCPI23_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_1] -; CHECK-NEXT: adrp x8, .LCPI23_2 +; CHECK-NEXT: adrp x8, .LCPI26_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_0] +; CHECK-NEXT: adrp x8, .LCPI26_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_1] +; CHECK-NEXT: adrp x8, .LCPI26_2 ; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s ; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI23_2] -; CHECK-NEXT: adrp x8, .LCPI23_3 +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI26_2] +; CHECK-NEXT: adrp x8, .LCPI26_3 ; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_3] +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_3] ; CHECK-NEXT: neg v4.4s, v4.4s ; CHECK-NEXT: movi v3.2d, #0x000000ffffffff ; CHECK-NEXT: sshl v4.4s, v1.4s, v4.4s diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll index da260fd95b5..418175f168a 100644 --- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll @@ -49,6 +49,56 @@ define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind { ret <4 x i32> %ret } +; Negative divisors should be negated, and thus this is still splat vectors. 
+ +; Odd divisor +define <4 x i32> @test_srem_odd_neg25(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_odd_neg25: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: adrp x8, .LCPI2_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_1] +; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: sshr v3.4s, v1.4s, #3 +; CHECK-NEXT: usra v3.4s, v1.4s, #31 +; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, <i32 25, i32 -25, i32 -25, i32 25> + %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Even divisors +define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_even_neg100: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: adrp x8, .LCPI3_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1] +; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: sshr v3.4s, v1.4s, #5 +; CHECK-NEXT: usra v3.4s, v1.4s, #31 +; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, <i32 -100, i32 100, i32 -100, i32 100> + %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + ;------------------------------------------------------------------------------; ; Comparison constant has undef elements. 
;------------------------------------------------------------------------------; @@ -103,6 +153,27 @@ define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind { ; Negative tests ;------------------------------------------------------------------------------; +define <4 x i32> @test_srem_one_eq(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_one_eq: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.4s, #1 +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1> + %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} +define <4 x i32> @test_srem_one_ne(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_one_ne: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ret + %srem = srem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1> + %cmp = icmp ne <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + ; We can lower remainder of division by powers of two much better elsewhere. define <4 x i32> @test_srem_pow2(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_pow2: @@ -122,36 +193,34 @@ define <4 x i32> @test_srem_pow2(<4 x i32> %X) nounwind { ret <4 x i32> %ret } -; We could lower remainder of division by all-ones much better elsewhere. -define <4 x i32> @test_srem_allones(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_srem_allones: +; We could lower remainder of division by INT_MIN much better elsewhere. 
+define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_int_min: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v0.4s, #1 +; CHECK-NEXT: sshr v1.4s, v0.4s, #31 +; CHECK-NEXT: mov v2.16b, v0.16b +; CHECK-NEXT: movi v3.4s, #128, lsl #24 +; CHECK-NEXT: usra v2.4s, v1.4s, #1 +; CHECK-NEXT: and v1.16b, v2.16b, v3.16b +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %srem = srem <4 x i32> %X, <i32 4294967295, i32 4294967295, i32 4294967295, i32 4294967295> + %srem = srem <4 x i32> %X, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648> %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; If all divisors are ones, this is constant-folded. -define <4 x i32> @test_srem_one_eq(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_srem_one_eq: +; We could lower remainder of division by all-ones much better elsewhere. 
+define <4 x i32> @test_srem_allones(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_srem_allones: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v0.4s, #1 ; CHECK-NEXT: ret - %srem = srem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1> + %srem = srem <4 x i32> %X, <i32 4294967295, i32 4294967295, i32 4294967295, i32 4294967295> %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -define <4 x i32> @test_srem_one_ne(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_srem_one_ne: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: ret - %srem = srem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1> - %cmp = icmp ne <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> - %ret = zext <4 x i1> %cmp to <4 x i32> - ret <4 x i32> %ret -} diff --git a/llvm/test/CodeGen/AArch64/srem-seteq.ll b/llvm/test/CodeGen/AArch64/srem-seteq.ll index 45894da89fb..e19dc442854 100644 --- a/llvm/test/CodeGen/AArch64/srem-seteq.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq.ll @@ -204,11 +204,50 @@ define i32 @test_srem_odd_setne(i32 %X) nounwind { ret i32 %ret } +; The fold is only valid for positive divisors, negative-ones should be negated. 
+define i32 @test_srem_negative_odd(i32 %X) nounwind { +; CHECK-LABEL: test_srem_negative_odd: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-1717986919 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #33 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: add w8, w8, w8, lsl #2 +; CHECK-NEXT: cmn w0, w8 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %srem = srem i32 %X, -5 + %cmp = icmp ne i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} +define i32 @test_srem_negative_even(i32 %X) nounwind { +; CHECK-LABEL: test_srem_negative_even: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #56173 +; CHECK-NEXT: movk w8, #28086, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: sub w8, w8, w0 +; CHECK-NEXT: asr w9, w8, #3 +; CHECK-NEXT: add w8, w9, w8, lsr #31 +; CHECK-NEXT: mov w9, #-14 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #0 // =0 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %srem = srem i32 %X, -14 + %cmp = icmp ne i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + ;------------------------------------------------------------------------------; ; Negative tests ;------------------------------------------------------------------------------; -; The fold is invalid if divisor is 1. +; We can lower remainder of division by one much better elsewhere. define i32 @test_srem_one(i32 %X) nounwind { ; CHECK-LABEL: test_srem_one: ; CHECK: // %bb.0: @@ -220,33 +259,51 @@ define i32 @test_srem_one(i32 %X) nounwind { ret i32 %ret } -; We can lower remainder of division by all-ones much better elsewhere. -define i32 @test_srem_allones(i32 %X) nounwind { -; CHECK-LABEL: test_srem_allones: +; We can lower remainder of division by powers of two much better elsewhere. 
+define i32 @test_srem_pow2(i32 %X) nounwind { +; CHECK-LABEL: test_srem_pow2: ; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, #15 // =15 ; CHECK-NEXT: cmp w0, #0 // =0 -; CHECK-NEXT: csel w8, w0, w0, lt +; CHECK-NEXT: csel w8, w8, w0, lt +; CHECK-NEXT: and w8, w8, #0xfffffff0 ; CHECK-NEXT: cmp w0, w8 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret - %srem = srem i32 %X, 4294967295 + %srem = srem i32 %X, 16 %cmp = icmp eq i32 %srem, 0 %ret = zext i1 %cmp to i32 ret i32 %ret } -; We can lower remainder of division by powers of two much better elsewhere. -define i32 @test_srem_pow2(i32 %X) nounwind { -; CHECK-LABEL: test_srem_pow2: +; The fold is only valid for positive divisors, and we can't negate INT_MIN. +define i32 @test_srem_int_min(i32 %X) nounwind { +; CHECK-LABEL: test_srem_int_min: ; CHECK: // %bb.0: -; CHECK-NEXT: add w8, w0, #15 // =15 +; CHECK-NEXT: mov w8, #2147483647 +; CHECK-NEXT: add w8, w0, w8 ; CHECK-NEXT: cmp w0, #0 // =0 ; CHECK-NEXT: csel w8, w8, w0, lt -; CHECK-NEXT: and w8, w8, #0xfffffff0 +; CHECK-NEXT: and w8, w8, #0x80000000 +; CHECK-NEXT: cmn w0, w8 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %srem = srem i32 %X, 2147483648 + %cmp = icmp eq i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +; We can lower remainder of division by all-ones much better elsewhere. 
+define i32 @test_srem_allones(i32 %X) nounwind { +; CHECK-LABEL: test_srem_allones: +; CHECK: // %bb.0: +; CHECK-NEXT: cmp w0, #0 // =0 +; CHECK-NEXT: csel w8, w0, w0, lt ; CHECK-NEXT: cmp w0, w8 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret - %srem = srem i32 %X, 16 + %srem = srem i32 %X, 4294967295 %cmp = icmp eq i32 %srem, 0 %ret = zext i1 %cmp to i32 ret i32 %ret diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-optsize.ll b/llvm/test/CodeGen/AArch64/urem-seteq-optsize.ll index ea098ec4526..344a077798a 100644 --- a/llvm/test/CodeGen/AArch64/urem-seteq-optsize.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-optsize.ll @@ -1,9 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s -; On AArch64, division in expensive. BuildRemEqFold should therefore run even -; when optimizing for size. Only optimizing for minimum size retains a plain div. - define i32 @test_minsize(i32 %X) optsize minsize nounwind readnone { ; CHECK-LABEL: test_minsize: ; CHECK: // %bb.0: diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll index c5b0148b99a..de3c1fafb6d 100644 --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s -; At the moment, BuildUREMEqFold does not handle nonsplat vectors. 
- ; Odd+Even divisors define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_odd_even: @@ -365,11 +363,11 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind { ret <4 x i32> %ret } -;==============================================================================; +;------------------------------------------------------------------------------; -; One all-ones divisor and power-of-two divisor divisor in odd divisor -define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_urem_odd_allones_and_poweroftwo: +; One INT_MIN divisor in odd divisor +define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_INT_MIN: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI13_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_0] @@ -387,15 +385,15 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 5> + %urem = urem <4 x i32> %X, <i32 5, i32 5, i32 2147483648, i32 5> %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; One all-ones divisor and power-of-two divisor divisor in even divisor -define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_urem_even_allones_and_poweroftwo: +; One INT_MIN divisor in even divisor +define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_even_INT_MIN: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI14_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0] @@ -417,15 +415,15 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, <i32 14, i32 4294967295, i32 16, i32 14> + 
%urem = urem <4 x i32> %X, <i32 14, i32 14, i32 2147483648, i32 14> %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; One all-ones divisor and power-of-two divisor divisor in odd+even divisor -define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_urem_odd_even_allones_and_poweroftwo: +; One INT_MIN divisor in odd+even divisor +define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_even_INT_MIN: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI15_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0] @@ -433,46 +431,57 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_1] ; CHECK-NEXT: adrp x8, .LCPI15_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI15_2] -; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: neg v1.4s, v1.4s +; CHECK-NEXT: adrp x8, .LCPI15_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_3] ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 100> + %urem = urem <4 x i32> %X, <i32 5, i32 14, i32 2147483648, i32 100> %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -;------------------------------------------------------------------------------; 
+;==============================================================================; -; One all-ones divisor and one one divisor in odd divisor -define <4 x i32> @test_urem_odd_allones_and_one(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_urem_odd_allones_and_one: +; One all-ones divisor and power-of-two divisor divisor in odd divisor +define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI16_0 -; CHECK-NEXT: adrp x9, .LCPI16_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI16_1] -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: adrp x8, .LCPI16_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_1] +; CHECK-NEXT: adrp x8, .LCPI16_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI16_2] +; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 1, i32 5> + %urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 5> %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; One all-ones divisor and one one divisor in even divisor -define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_urem_even_allones_and_one: +; One all-ones divisor and power-of-two divisor divisor in even divisor +define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI17_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0] @@ -486,26 
+495,23 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s ; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_3] -; CHECK-NEXT: adrp x8, .LCPI17_4 ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI17_4] ; CHECK-NEXT: neg v3.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, <i32 14, i32 4294967295, i32 1, i32 14> + %urem = urem <4 x i32> %X, <i32 14, i32 4294967295, i32 16, i32 14> %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; One all-ones divisor and one one divisor in odd+even divisor -define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_urem_odd_even_allones_and_one: +; One all-ones divisor and power-of-two divisor divisor in odd+even divisor +define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI18_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0] @@ -513,20 +519,17 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_1] ; CHECK-NEXT: adrp x8, .LCPI18_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI18_2] -; CHECK-NEXT: adrp x8, .LCPI18_3 ; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s ; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI18_3] ; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; 
CHECK-NEXT: mls v0.4s, v1.4s, v3.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 1, i32 100> + %urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 100> %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret @@ -534,38 +537,28 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind { ;------------------------------------------------------------------------------; -; One power-of-two divisor divisor and one divisor in odd divisor -define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_urem_odd_poweroftwo_and_one: +; One all-ones divisor and one one divisor in odd divisor +define <4 x i32> @test_urem_odd_allones_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_allones_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI19_0 +; CHECK-NEXT: adrp x9, .LCPI19_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_0] -; CHECK-NEXT: adrp x8, .LCPI19_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI19_1] -; CHECK-NEXT: adrp x8, .LCPI19_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI19_2] -; CHECK-NEXT: adrp x8, .LCPI19_3 -; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI19_3] -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI19_1] +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, <i32 5, i32 16, i32 1, i32 5> + %urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 1, i32 5> %cmp = 
icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; One power-of-two divisor divisor and one divisor in even divisor -define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_urem_even_poweroftwo_and_one: +; One all-ones divisor and one one divisor in even divisor +define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_even_allones_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI20_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0] @@ -590,15 +583,15 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, <i32 14, i32 16, i32 1, i32 14> + %urem = urem <4 x i32> %X, <i32 14, i32 4294967295, i32 1, i32 14> %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; One power-of-two divisor divisor and one divisor in odd+even divisor -define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_urem_odd_even_poweroftwo_and_one: +; One all-ones divisor and one one divisor in odd+even divisor +define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_even_allones_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI21_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_0] @@ -619,7 +612,7 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, <i32 5, i32 16, i32 1, i32 100> + %urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 1, i32 100> %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret @@ -627,8 +620,9 @@ define <4 x 
i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ;------------------------------------------------------------------------------; -define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: +; One power-of-two divisor divisor and one divisor in odd divisor +define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_poweroftwo_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI22_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_0] @@ -649,14 +643,15 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 1> + %urem = urem <4 x i32> %X, <i32 5, i32 16, i32 1, i32 5> %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_urem_even_allones_and_poweroftwo_and_one: +; One power-of-two divisor divisor and one divisor in even divisor +define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_even_poweroftwo_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI23_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0] @@ -681,6 +676,97 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret + %urem = urem <4 x i32> %X, <i32 14, i32 16, i32 1, i32 14> + %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One power-of-two divisor divisor and one divisor in odd+even divisor +define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) 
nounwind { +; CHECK-LABEL: test_urem_odd_even_poweroftwo_and_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI24_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_0] +; CHECK-NEXT: adrp x8, .LCPI24_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_1] +; CHECK-NEXT: adrp x8, .LCPI24_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI24_2] +; CHECK-NEXT: adrp x8, .LCPI24_3 +; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI24_3] +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b +; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %urem = urem <4 x i32> %X, <i32 5, i32 16, i32 1, i32 100> + %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;------------------------------------------------------------------------------; + +define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI25_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_0] +; CHECK-NEXT: adrp x8, .LCPI25_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI25_1] +; CHECK-NEXT: adrp x8, .LCPI25_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI25_2] +; CHECK-NEXT: adrp x8, .LCPI25_3 +; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI25_3] +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b +; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %urem = urem <4 x i32> 
%X, <i32 5, i32 4294967295, i32 16, i32 1> + %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_even_allones_and_poweroftwo_and_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI26_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_0] +; CHECK-NEXT: adrp x8, .LCPI26_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_1] +; CHECK-NEXT: adrp x8, .LCPI26_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI26_2] +; CHECK-NEXT: neg v1.4s, v1.4s +; CHECK-NEXT: adrp x8, .LCPI26_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_3] +; CHECK-NEXT: adrp x8, .LCPI26_4 +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI26_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b +; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret %urem = urem <4 x i32> %X, <i32 14, i32 4294967295, i32 16, i32 1> %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll index d544e5f9cd5..ae51708e02a 100644 --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll @@ -45,6 +45,57 @@ define <4 x i32> @test_urem_even_100(<4 x i32> %X) nounwind { ret <4 x i32> %ret } +; Negative divisors should be negated, and thus this is still splat vectors. 
+ +; Odd divisor +define <4 x i32> @test_urem_odd_neg25(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_neg25: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: adrp x9, .LCPI2_1 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI2_1] +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %urem = urem <4 x i32> %X, <i32 25, i32 -25, i32 -25, i32 25> + %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Even divisors +define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_even_neg100: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: adrp x8, .LCPI3_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1] +; CHECK-NEXT: adrp x8, .LCPI3_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_2] +; CHECK-NEXT: neg v1.4s, v1.4s +; CHECK-NEXT: adrp x8, .LCPI3_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_3] +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %urem = urem <4 x i32> %X, <i32 -100, i32 100, i32 -100, i32 100> + %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + ;------------------------------------------------------------------------------; ; Comparison constant has undef elements. 
;------------------------------------------------------------------------------; @@ -97,6 +148,27 @@ define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind { ; Negative tests ;------------------------------------------------------------------------------; +define <4 x i32> @test_urem_one_eq(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_one_eq: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.4s, #1 +; CHECK-NEXT: ret + %urem = urem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1> + %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} +define <4 x i32> @test_urem_one_ne(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_one_ne: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ret + %urem = urem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1> + %cmp = icmp ne <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + ; We can lower remainder of division by powers of two much better elsewhere. define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_pow2: @@ -113,39 +185,32 @@ define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind { ret <4 x i32> %ret } -; We could lower remainder of division by all-ones much better elsewhere. -define <4 x i32> @test_urem_allones(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_urem_allones: +; We could lower remainder of division by INT_MIN much better elsewhere. 
+define <4 x i32> @test_urem_int_min(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_int_min: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v0.4s, v0.4s +; CHECK-NEXT: bic v0.4s, #128, lsl #24 ; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, <i32 4294967295, i32 4294967295, i32 4294967295, i32 4294967295> + %urem = urem <4 x i32> %X, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648> %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; If all divisors are ones, this is constant-folded. -define <4 x i32> @test_urem_one_eq(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_urem_one_eq: +; We could lower remainder of division by all-ones much better elsewhere. +define <4 x i32> @test_urem_allones(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_allones: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v0.4s, #1 +; CHECK-NEXT: neg v0.4s, v0.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1> + %urem = urem <4 x i32> %X, <i32 4294967295, i32 4294967295, i32 4294967295, i32 4294967295> %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -define <4 x i32> @test_urem_one_ne(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_urem_one_ne: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1> - %cmp = icmp ne <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> - %ret = zext <4 x i1> %cmp to <4 x i32> - ret <4 x i32> %ret -} diff --git a/llvm/test/CodeGen/AArch64/urem-seteq.ll b/llvm/test/CodeGen/AArch64/urem-seteq.ll index 164c0a5f1ec..69e8e825727 100644 --- 
a/llvm/test/CodeGen/AArch64/urem-seteq.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq.ll @@ -5,10 +5,6 @@ ; Odd divisors ;------------------------------------------------------------------------------; -; This tests the BuildREMEqFold optimization with UREM, i32, odd divisor, SETEQ. -; The corresponding pseudocode is: -; Q <- [N * multInv(5, 2^32)] <=> [N * 0xCCCCCCCD] <=> [N * (-858993459)] -; res <- [Q <= (2^32 - 1) / 5] <=> [Q <= 858993459] <=> [Q < 858993460] define i32 @test_urem_odd(i32 %X) nounwind { ; CHECK-LABEL: test_urem_odd: ; CHECK: // %bb.0: @@ -79,12 +75,6 @@ define i32 @test_urem_odd_bit31(i32 %X) nounwind { ; Even divisors ;------------------------------------------------------------------------------; -; This tests the BuildREMEqFold optimization with UREM, i16, even divisor, SETNE. -; In this case, D <=> 14 <=> 7 * 2^1, so D0 = 7 and K = 1. -; The corresponding pseudocode is: -; Q <- [N * multInv(D0, 2^16)] <=> [N * multInv(7, 2^16)] <=> [N * 28087] -; Q <- [Q >>rot K] <=> [Q >>rot 1] -; res <- ![Q <= (2^16 - 1) / 7] <=> ![Q <= 9362] <=> [Q > 9362] define i16 @test_urem_even(i16 %X) nounwind { ; CHECK-LABEL: test_urem_even: ; CHECK: // %bb.0: @@ -177,11 +167,41 @@ define i32 @test_urem_odd_setne(i32 %X) nounwind { ret i32 %ret } +; The fold is only valid for positive divisors, negative-ones should be negated. 
+define i32 @test_urem_negative_odd(i32 %X) nounwind { +; CHECK-LABEL: test_urem_negative_odd: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #858993459 +; CHECK-NEXT: mul w8, w0, w8 +; CHECK-NEXT: cmp w8, #1 // =1 +; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: ret + %urem = urem i32 %X, -5 + %cmp = icmp ne i32 %urem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} +define i32 @test_urem_negative_even(i32 %X) nounwind { +; CHECK-LABEL: test_urem_negative_even: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #37449 +; CHECK-NEXT: movk w8, #51492, lsl #16 +; CHECK-NEXT: mul w8, w0, w8 +; CHECK-NEXT: ror w8, w8, #1 +; CHECK-NEXT: cmp w8, #1 // =1 +; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: ret + %urem = urem i32 %X, -14 + %cmp = icmp ne i32 %urem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + ;------------------------------------------------------------------------------; ; Negative tests ;------------------------------------------------------------------------------; -; The fold is invalid if divisor is 1. +; We can lower remainder of division by one much better elsewhere. define i32 @test_urem_one(i32 %X) nounwind { ; CHECK-LABEL: test_urem_one: ; CHECK: // %bb.0: @@ -193,28 +213,41 @@ define i32 @test_urem_one(i32 %X) nounwind { ret i32 %ret } -; We can lower remainder of division by all-ones much better elsewhere. -define i32 @test_urem_allones(i32 %X) nounwind { -; CHECK-LABEL: test_urem_allones: +; We can lower remainder of division by powers of two much better elsewhere. +define i32 @test_urem_pow2(i32 %X) nounwind { +; CHECK-LABEL: test_urem_pow2: ; CHECK: // %bb.0: -; CHECK-NEXT: neg w8, w0 -; CHECK-NEXT: cmp w8, #2 // =2 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: tst w0, #0xf +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret - %urem = urem i32 %X, 4294967295 + %urem = urem i32 %X, 16 %cmp = icmp eq i32 %urem, 0 %ret = zext i1 %cmp to i32 ret i32 %ret } -; We can lower remainder of division by powers of two much better elsewhere. 
-define i32 @test_urem_pow2(i32 %X) nounwind { -; CHECK-LABEL: test_urem_pow2: +; The fold is only valid for positive divisors, and we can't negate INT_MIN. +define i32 @test_urem_int_min(i32 %X) nounwind { +; CHECK-LABEL: test_urem_int_min: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0xf +; CHECK-NEXT: tst w0, #0x7fffffff ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret - %urem = urem i32 %X, 16 + %urem = urem i32 %X, 2147483648 + %cmp = icmp eq i32 %urem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +; We can lower remainder of division by all-ones much better elsewhere. +define i32 @test_urem_allones(i32 %X) nounwind { +; CHECK-LABEL: test_urem_allones: +; CHECK: // %bb.0: +; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: cmp w8, #2 // =2 +; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: ret + %urem = urem i32 %X, 4294967295 %cmp = icmp eq i32 %urem, 0 %ret = zext i1 %cmp to i32 ret i32 %ret diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll index 6e51b4fcacc..03ef755425d 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -1899,6 +1899,437 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind { ret <4 x i32> %ret } +;------------------------------------------------------------------------------; + +; One INT_MIN divisor in odd divisor +define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_odd_INT_MIN: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,1717986919,2147483647,1717986919] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd 
{{.*#+}} xmm4 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = <0,u,4294967295,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $1, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: psrad $30, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_odd_INT_MIN: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <1717986919,u,2147483647,u> +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,0,4294967295,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $30, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrad $1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] +; 
CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm3, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_odd_INT_MIN: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $30, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_odd_INT_MIN: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; 
CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_odd_INT_MIN: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, <i32 5, i32 5, i32 2147483648, i32 5> + %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One INT_MIN divisor in even divisor +define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_even_INT_MIN: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2147483647,2454267027] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; 
CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295,0,4294967295] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = <1,u,4294967295,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE2-NEXT: psrad $30, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_even_INT_MIN: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq 
{{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,u,2147483647,u> +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,4294967295,1] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $30, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm3, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_even_INT_MIN: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $30, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; 
CHECK-AVX2-LABEL: test_srem_even_INT_MIN: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_even_INT_MIN: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 
+; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, <i32 14, i32 14, i32 2147483648, i32 14> + %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One INT_MIN divisor in odd+even divisor +define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_odd_even_INT_MIN: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4294967295,0] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,2147483647,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 +; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $5, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: psrad $30, %xmm4 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; 
CHECK-SSE2-NEXT: psrad $3, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,2147483648,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_odd_even_INT_MIN: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483647,1374389535] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4294967295,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $5, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $30, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE41-NEXT: psrad $1, %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = 
xmm4[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm4, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_odd_even_INT_MIN: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483647,1374389535] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $30, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_odd_even_INT_MIN: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483647,1374389535] +; CHECK-AVX2-NEXT: 
vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_odd_even_INT_MIN: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483647,1374389535] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, <i32 5, i32 14, 
i32 2147483648, i32 100> + %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + ;==============================================================================; ; One all-ones divisor and power-of-two divisor divisor in odd divisor diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll index 843a26998ad..2817d9e6229 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll @@ -229,6 +229,241 @@ define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind { ret <4 x i32> %ret } +; Negative divisors should be negated, and thus this is still splat vectors. + +; Odd divisor +define <4 x i32> @test_srem_odd_neg25(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_odd_neg25: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,2920577761,2920577761,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm1, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm1 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm1 +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: psrad $3, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [25,4294967271,4294967271,25] +; CHECK-SSE2-NEXT: pshufd 
{{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_odd_neg25: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,2920577761,2920577761,1374389535] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: psrad $3, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_odd_neg25: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1374389535,2920577761,2920577761,1374389535] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm1 +; 
CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_odd_neg25: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1374389535,2920577761,2920577761,1374389535] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_odd_neg25: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1374389535,2920577761,2920577761,1374389535] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd 
%xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, <i32 25, i32 -25, i32 -25, i32 25> + %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Even divisors +define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_srem_even_neg100: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2920577761,1374389535,2920577761,1374389535] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,0,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: 
test_srem_even_neg100: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2920577761,u,2920577761,u> +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_even_neg100: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_even_neg100: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2920577761,2920577761,2920577761,2920577761] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsrad $5, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_even_neg100: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2920577761,2920577761,2920577761,2920577761] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrad $5, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, <i32 -100, i32 100, i32 -100, i32 100> + %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + ;------------------------------------------------------------------------------; ; Comparison constant has undef elements. 
;------------------------------------------------------------------------------; @@ -459,6 +694,47 @@ define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind { ; Negative tests ;------------------------------------------------------------------------------; +define <4 x i32> @test_srem_one_eq(<4 x i32> %X) nounwind { +; CHECK-SSE-LABEL: test_srem_one_eq: +; CHECK-SSE: # %bb.0: +; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-SSE-NEXT: retq +; +; CHECK-AVX1-LABEL: test_srem_one_eq: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_one_eq: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_one_eq: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX512VL-NEXT: retq + %srem = srem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1> + %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} +define <4 x i32> @test_srem_one_ne(<4 x i32> %X) nounwind { +; CHECK-SSE-LABEL: test_srem_one_ne: +; CHECK-SSE: # %bb.0: +; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 +; CHECK-SSE-NEXT: retq +; +; CHECK-AVX-LABEL: test_srem_one_ne: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-AVX-NEXT: retq + %srem = srem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1> + %cmp = icmp ne <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + ; We can lower remainder of division by powers of two much better elsewhere. define <4 x i32> @test_srem_pow2(<4 x i32> %X) nounwind { ; CHECK-SSE-LABEL: test_srem_pow2: @@ -516,71 +792,86 @@ define <4 x i32> @test_srem_pow2(<4 x i32> %X) nounwind { ret <4 x i32> %ret } -; We could lower remainder of division by all-ones much better elsewhere. 
-define <4 x i32> @test_srem_allones(<4 x i32> %X) nounwind { -; CHECK-SSE-LABEL: test_srem_allones: +; We could lower remainder of division by INT_MIN much better elsewhere. +define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind { +; CHECK-SSE-LABEL: test_srem_int_min: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-SSE-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE-NEXT: psrad $31, %xmm1 +; CHECK-SSE-NEXT: psrld $1, %xmm1 +; CHECK-SSE-NEXT: paddd %xmm0, %xmm1 +; CHECK-SSE-NEXT: pand {{.*}}(%rip), %xmm1 +; CHECK-SSE-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE-NEXT: psrld $31, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX1-LABEL: test_srem_allones: +; CHECK-AVX1-LABEL: test_srem_int_min: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_srem_allones: +; CHECK-AVX2-LABEL: test_srem_int_min: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_srem_allones: +; CHECK-AVX512VL-LABEL: 
test_srem_int_min: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX512VL-NEXT: vpsrad $31, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq - %srem = srem <4 x i32> %X, <i32 4294967295, i32 4294967295, i32 4294967295, i32 4294967295> + %srem = srem <4 x i32> %X, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648> %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; If all divisors are ones, this is constant-folded. -define <4 x i32> @test_srem_one_eq(<4 x i32> %X) nounwind { -; CHECK-SSE-LABEL: test_srem_one_eq: +; We could lower remainder of division by all-ones much better elsewhere. 
+define <4 x i32> @test_srem_allones(<4 x i32> %X) nounwind { +; CHECK-SSE-LABEL: test_srem_allones: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX1-LABEL: test_srem_one_eq: +; CHECK-AVX1-LABEL: test_srem_allones: ; CHECK-AVX1: # %bb.0: ; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_srem_one_eq: +; CHECK-AVX2-LABEL: test_srem_allones: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_srem_one_eq: +; CHECK-AVX512VL-LABEL: test_srem_allones: ; CHECK-AVX512VL: # %bb.0: ; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-AVX512VL-NEXT: retq - %srem = srem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1> + %srem = srem <4 x i32> %X, <i32 4294967295, i32 4294967295, i32 4294967295, i32 4294967295> %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -define <4 x i32> @test_srem_one_ne(<4 x i32> %X) nounwind { -; CHECK-SSE-LABEL: test_srem_one_ne: -; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 -; CHECK-SSE-NEXT: retq -; -; CHECK-AVX-LABEL: test_srem_one_ne: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq - %srem = srem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1> - %cmp = icmp ne <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0> - %ret = zext <4 x i1> %cmp to <4 x i32> - ret <4 x i32> %ret -} diff --git a/llvm/test/CodeGen/X86/srem-seteq.ll b/llvm/test/CodeGen/X86/srem-seteq.ll index 0ae07a51586..b3224114e5c 100644 --- a/llvm/test/CodeGen/X86/srem-seteq.ll +++ b/llvm/test/CodeGen/X86/srem-seteq.ll @@ -359,11 +359,86 @@ define i32 @test_srem_odd_setne(i32 %X) nounwind { ret i32 %ret } +; The fold is only valid for positive divisors, negative-ones should be negated. 
+define i32 @test_srem_negative_odd(i32 %X) nounwind { +; X86-LABEL: test_srem_negative_odd: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-1717986919, %edx # imm = 0x99999999 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: leal (%edx,%edx,4), %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: test_srem_negative_odd: +; X64: # %bb.0: +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq $-1717986919, %rcx, %rax # imm = 0x99999999 +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $33, %rax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: leal (%rax,%rax,4), %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: addl %edx, %ecx +; X64-NEXT: setne %al +; X64-NEXT: retq + %srem = srem i32 %X, -5 + %cmp = icmp ne i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} +define i32 @test_srem_negative_even(i32 %X) nounwind { +; X86-LABEL: test_srem_negative_even: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1840700269, %edx # imm = 0x6DB6DB6D +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl $3, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: imull $-14, %edx, %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: test_srem_negative_even: +; X64: # %bb.0: +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq $1840700269, %rcx, %rax # imm = 0x6DB6DB6D +; X64-NEXT: shrq $32, %rax +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: movl %eax, %edx +; X64-NEXT: shrl $31, %edx +; X64-NEXT: sarl $3, %eax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: imull $-14, %eax, %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: setne %al +; 
X64-NEXT: retq + %srem = srem i32 %X, -14 + %cmp = icmp ne i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + ;------------------------------------------------------------------------------; ; Negative tests ;------------------------------------------------------------------------------; -; The fold is invalid if divisor is 1. +; We can lower remainder of division by one much better elsewhere. define i32 @test_srem_one(i32 %X) nounwind { ; CHECK-LABEL: test_srem_one: ; CHECK: # %bb.0: @@ -375,18 +450,6 @@ define i32 @test_srem_one(i32 %X) nounwind { ret i32 %ret } -; We can lower remainder of division by all-ones much better elsewhere. -define i32 @test_srem_allones(i32 %X) nounwind { -; CHECK-LABEL: test_srem_allones: -; CHECK: # %bb.0: -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: ret{{[l|q]}} - %srem = srem i32 %X, 4294967295 - %cmp = icmp eq i32 %srem, 0 - %ret = zext i1 %cmp to i32 - ret i32 %ret -} - ; We can lower remainder of division by powers of two much better elsewhere. define i32 @test_srem_pow2(i32 %X) nounwind { ; X86-LABEL: test_srem_pow2: @@ -418,3 +481,47 @@ define i32 @test_srem_pow2(i32 %X) nounwind { %ret = zext i1 %cmp to i32 ret i32 %ret } + +; The fold is only valid for positive divisors, and we can't negate INT_MIN. 
+define i32 @test_srem_int_min(i32 %X) nounwind { +; X86-LABEL: test_srem_int_min: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: shrl %edx +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: andl $-2147483648, %edx # imm = 0x80000000 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: test_srem_int_min: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %ecx +; X64-NEXT: sarl $31, %ecx +; X64-NEXT: shrl %ecx +; X64-NEXT: addl %edi, %ecx +; X64-NEXT: andl $-2147483648, %ecx # imm = 0x80000000 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: addl %edi, %ecx +; X64-NEXT: sete %al +; X64-NEXT: retq + %srem = srem i32 %X, 2147483648 + %cmp = icmp eq i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +; We can lower remainder of division by all-ones much better elsewhere. +define i32 @test_srem_allones(i32 %X) nounwind { +; CHECK-LABEL: test_srem_allones: +; CHECK: # %bb.0: +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: ret{{[l|q]}} + %srem = srem i32 %X, 4294967295 + %cmp = icmp eq i32 %srem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll index 8717944e83e..f094eb6918f 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll @@ -5,8 +5,6 @@ ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl < %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX512VL -; At the moment, BuildUREMEqFold does not handle nonsplat vectors. 
- ; Odd+Even divisors define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even: @@ -1215,6 +1213,310 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind { ret <4 x i32> %ret } +;------------------------------------------------------------------------------; + +; One INT_MIN divisor in odd divisor +define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_INT_MIN: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,2,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_urem_odd_INT_MIN: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,2,u> +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} 
xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_urem_odd_INT_MIN: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_odd_INT_MIN: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: 
test_urem_odd_INT_MIN: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %urem = urem <4 x i32> %X, <i32 5, i32 5, i32 2147483648, i32 5> + %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One INT_MIN divisor in even divisor +define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_even_INT_MIN: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; 
CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_urem_even_INT_MIN: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_urem_even_INT_MIN: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_even_INT_MIN: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} 
xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_even_INT_MIN: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %urem = urem <4 x i32> %X, <i32 14, i32 14, i32 2147483648, i32 14> + %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One INT_MIN divisor in odd+even divisor +define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_even_INT_MIN: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: psrld $1, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq 
{{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $5, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrld $2, %xmm3 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,14,2147483648,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_urem_odd_even_INT_MIN: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,2454267027,2,1374389535] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrld $2, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 +; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; 
CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_urem_odd_even_INT_MIN: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,2,1374389535] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_odd_even_INT_MIN: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2,1374389535] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor 
%xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_odd_even_INT_MIN: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %urem = urem <4 x i32> %X, <i32 5, i32 14, i32 2147483648, i32 100> + %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + ;==============================================================================; ; One all-ones divisor and power-of-two divisor divisor in odd divisor diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll index face44a7ce4..409f0bca24c 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll @@ -150,6 +150,144 @@ define <4 x i32> @test_urem_even_100(<4 x i32> %X) nounwind { ret <4 x i32> %ret } +; Negative divisors should be negated, and thus this is still splat vectors. 
+ +; Odd divisor +define <4 x i32> @test_urem_odd_neg25(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_neg25: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,1030792151,1030792151,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_urem_odd_neg25: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [171798691,1,1,171798691] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX-LABEL: test_urem_odd_neg25: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX-NEXT: retq + %urem = urem <4 x i32> %X, <i32 25, i32 -25, i32 -25, i32 25> + %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Even divisors +define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_even_neg100: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: psrld $2, %xmm2 
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $27, %xmm1 +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: psrld $5, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_urem_even_neg100: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrld $5, %xmm1 +; CHECK-SSE41-NEXT: psrld $27, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_urem_even_neg100: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 +; 
CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $27, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_even_neg100: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [536870925,536870925,536870925,536870925] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_even_neg100: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprord $2, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %urem = urem <4 x i32> %X, <i32 -100, i32 100, i32 -100, i32 100> + %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, 
i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + ;------------------------------------------------------------------------------; ; Comparison constant has undef elements. ;------------------------------------------------------------------------------; @@ -348,6 +486,47 @@ define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind { ; Negative tests ;------------------------------------------------------------------------------; +define <4 x i32> @test_urem_one_eq(<4 x i32> %X) nounwind { +; CHECK-SSE-LABEL: test_urem_one_eq: +; CHECK-SSE: # %bb.0: +; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-SSE-NEXT: retq +; +; CHECK-AVX1-LABEL: test_urem_one_eq: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_one_eq: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_one_eq: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX512VL-NEXT: retq + %urem = urem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1> + %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} +define <4 x i32> @test_urem_one_ne(<4 x i32> %X) nounwind { +; CHECK-SSE-LABEL: test_urem_one_ne: +; CHECK-SSE: # %bb.0: +; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 +; CHECK-SSE-NEXT: retq +; +; CHECK-AVX-LABEL: test_urem_one_ne: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-AVX-NEXT: retq + %urem = urem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1> + %cmp = icmp ne <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + ; We can lower remainder of division by powers of two much better elsewhere. 
define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind { ; CHECK-SSE-LABEL: test_urem_pow2: @@ -388,6 +567,46 @@ define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind { ret <4 x i32> %ret } +; We could lower remainder of division by INT_MIN much better elsewhere. +define <4 x i32> @test_urem_int_min(<4 x i32> %X) nounwind { +; CHECK-SSE-LABEL: test_urem_int_min: +; CHECK-SSE: # %bb.0: +; CHECK-SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK-SSE-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE-NEXT: psrld $31, %xmm0 +; CHECK-SSE-NEXT: retq +; +; CHECK-AVX1-LABEL: test_urem_int_min: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_int_min: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483647,2147483647,2147483647,2147483647] +; CHECK-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_int_min: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %urem = urem <4 x i32> %X, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648> + %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + ; We could lower remainder of division by all-ones much better elsewhere. 
define <4 x i32> @test_urem_allones(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_allones: @@ -442,45 +661,3 @@ define <4 x i32> @test_urem_allones(<4 x i32> %X) nounwind { %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } - -; If all divisors are ones, this is constant-folded. -define <4 x i32> @test_urem_one_eq(<4 x i32> %X) nounwind { -; CHECK-SSE-LABEL: test_urem_one_eq: -; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1] -; CHECK-SSE-NEXT: retq -; -; CHECK-AVX1-LABEL: test_urem_one_eq: -; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] -; CHECK-AVX1-NEXT: retq -; -; CHECK-AVX2-LABEL: test_urem_one_eq: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] -; CHECK-AVX2-NEXT: retq -; -; CHECK-AVX512VL-LABEL: test_urem_one_eq: -; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] -; CHECK-AVX512VL-NEXT: retq - %urem = urem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1> - %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> - %ret = zext <4 x i1> %cmp to <4 x i32> - ret <4 x i32> %ret -} -define <4 x i32> @test_urem_one_ne(<4 x i32> %X) nounwind { -; CHECK-SSE-LABEL: test_urem_one_ne: -; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 -; CHECK-SSE-NEXT: retq -; -; CHECK-AVX-LABEL: test_urem_one_ne: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq - %urem = urem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1> - %cmp = icmp ne <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0> - %ret = zext <4 x i1> %cmp to <4 x i32> - ret <4 x i32> %ret -} diff --git a/llvm/test/CodeGen/X86/urem-seteq.ll b/llvm/test/CodeGen/X86/urem-seteq.ll index 26b9e85feb2..1a16cf99b31 100644 --- a/llvm/test/CodeGen/X86/urem-seteq.ll +++ b/llvm/test/CodeGen/X86/urem-seteq.ll @@ -6,10 +6,6 @@ ; Odd divisors ;------------------------------------------------------------------------------; -; This tests the 
BuildREMEqFold optimization with UREM, i32, odd divisor, SETEQ. -; The corresponding pseudocode is: -; Q <- [N * multInv(5, 2^32)] <=> [N * 0xCCCCCCCD] <=> [N * (-858993459)] -; res <- [Q <= (2^32 - 1) / 5] <=> [Q <= 858993459] <=> [Q < 858993460] define i32 @test_urem_odd(i32 %X) nounwind { ; X86-LABEL: test_urem_odd: ; X86: # %bb.0: @@ -104,12 +100,6 @@ define i32 @test_urem_odd_bit31(i32 %X) nounwind { ; Even divisors ;------------------------------------------------------------------------------; -; This tests the BuildREMEqFold optimization with UREM, i16, even divisor, SETNE. -; In this case, D <=> 14 <=> 7 * 2^1, so D0 = 7 and K = 1. -; The corresponding pseudocode is: -; Q <- [N * multInv(D0, 2^16)] <=> [N * multInv(7, 2^16)] <=> [N * 28087] -; Q <- [Q >>rot K] <=> [Q >>rot 1] -; res <- ![Q <= (2^16 - 1) / 7] <=> ![Q <= 9362] <=> [Q > 9362] define i16 @test_urem_even(i16 %X) nounwind { ; X86-LABEL: test_urem_even: ; X86: # %bb.0: @@ -239,11 +229,57 @@ define i32 @test_urem_odd_setne(i32 %X) nounwind { ret i32 %ret } +; The fold is only valid for positive divisors, negative-ones should be negated. 
+define i32 @test_urem_negative_odd(i32 %X) nounwind { +; X86-LABEL: test_urem_negative_odd: +; X86: # %bb.0: +; X86-NEXT: imull $858993459, {{[0-9]+}}(%esp), %ecx # imm = 0x33333333 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl $1, %ecx +; X86-NEXT: seta %al +; X86-NEXT: retl +; +; X64-LABEL: test_urem_negative_odd: +; X64: # %bb.0: +; X64-NEXT: imull $858993459, %edi, %ecx # imm = 0x33333333 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl $1, %ecx +; X64-NEXT: seta %al +; X64-NEXT: retq + %urem = urem i32 %X, -5 + %cmp = icmp ne i32 %urem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} +define i32 @test_urem_negative_even(i32 %X) nounwind { +; X86-LABEL: test_urem_negative_even: +; X86: # %bb.0: +; X86-NEXT: imull $-920350135, {{[0-9]+}}(%esp), %ecx # imm = 0xC9249249 +; X86-NEXT: rorl %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl $1, %ecx +; X86-NEXT: seta %al +; X86-NEXT: retl +; +; X64-LABEL: test_urem_negative_even: +; X64: # %bb.0: +; X64-NEXT: imull $-920350135, %edi, %ecx # imm = 0xC9249249 +; X64-NEXT: rorl %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl $1, %ecx +; X64-NEXT: seta %al +; X64-NEXT: retq + %urem = urem i32 %X, -14 + %cmp = icmp ne i32 %urem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + ;------------------------------------------------------------------------------; ; Negative tests ;------------------------------------------------------------------------------; -; The fold is invalid if divisor is 1. +; We can lower remainder of division by one much better elsewhere. define i32 @test_urem_one(i32 %X) nounwind { ; CHECK-LABEL: test_urem_one: ; CHECK: # %bb.0: @@ -255,46 +291,67 @@ define i32 @test_urem_one(i32 %X) nounwind { ret i32 %ret } -; We can lower remainder of division by all-ones much better elsewhere. -define i32 @test_urem_allones(i32 %X) nounwind { -; X86-LABEL: test_urem_allones: +; We can lower remainder of division by powers of two much better elsewhere. 
+define i32 @test_urem_pow2(i32 %X) nounwind { +; X86-LABEL: test_urem_pow2: ; X86: # %bb.0: -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $2, %ecx -; X86-NEXT: setb %al +; X86-NEXT: testb $15, {{[0-9]+}}(%esp) +; X86-NEXT: sete %al ; X86-NEXT: retl ; -; X64-LABEL: test_urem_allones: +; X64-LABEL: test_urem_pow2: ; X64: # %bb.0: -; X64-NEXT: negl %edi ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $2, %edi -; X64-NEXT: setb %al +; X64-NEXT: testb $15, %dil +; X64-NEXT: sete %al ; X64-NEXT: retq - %urem = urem i32 %X, 4294967295 + %urem = urem i32 %X, 16 %cmp = icmp eq i32 %urem, 0 %ret = zext i1 %cmp to i32 ret i32 %ret } -; We can lower remainder of division by powers of two much better elsewhere. -define i32 @test_urem_pow2(i32 %X) nounwind { -; X86-LABEL: test_urem_pow2: +; The fold is only valid for positive divisors, and we can't negate INT_MIN. +define i32 @test_urem_int_min(i32 %X) nounwind { +; X86-LABEL: test_urem_int_min: ; X86: # %bb.0: ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testb $15, {{[0-9]+}}(%esp) +; X86-NEXT: testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF ; X86-NEXT: sete %al ; X86-NEXT: retl ; -; X64-LABEL: test_urem_pow2: +; X64-LABEL: test_urem_int_min: ; X64: # %bb.0: ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: testb $15, %dil +; X64-NEXT: testl $2147483647, %edi # imm = 0x7FFFFFFF ; X64-NEXT: sete %al ; X64-NEXT: retq - %urem = urem i32 %X, 16 + %urem = urem i32 %X, 2147483648 + %cmp = icmp eq i32 %urem, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret +} + +; We can lower remainder of division by all-ones much better elsewhere. 
+define i32 @test_urem_allones(i32 %X) nounwind { +; X86-LABEL: test_urem_allones: +; X86: # %bb.0: +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl $2, %ecx +; X86-NEXT: setb %al +; X86-NEXT: retl +; +; X64-LABEL: test_urem_allones: +; X64: # %bb.0: +; X64-NEXT: negl %edi +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl $2, %edi +; X64-NEXT: setb %al +; X64-NEXT: retq + %urem = urem i32 %X, 4294967295 %cmp = icmp eq i32 %urem, 0 %ret = zext i1 %cmp to i32 ret i32 %ret |