-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp    |  71
-rw-r--r-- | llvm/test/CodeGen/X86/vector-bitreverse.ll | 186
2 files changed, 256 insertions, 1 deletion
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 53483922257..f58b4dcff55 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1068,6 +1068,21 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
     setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
     setOperationAction(ISD::ROTL, MVT::v4i64, Custom);
+
+    // XOP can efficiently perform BITREVERSE with VPPERM.
+    setOperationAction(ISD::BITREVERSE, MVT::i8, Custom);
+    setOperationAction(ISD::BITREVERSE, MVT::i16, Custom);
+    setOperationAction(ISD::BITREVERSE, MVT::i32, Custom);
+    setOperationAction(ISD::BITREVERSE, MVT::i64, Custom);
+
+    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
+    setOperationAction(ISD::BITREVERSE, MVT::v8i16, Custom);
+    setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom);
+    setOperationAction(ISD::BITREVERSE, MVT::v2i64, Custom);
+    setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
+    setOperationAction(ISD::BITREVERSE, MVT::v16i16, Custom);
+    setOperationAction(ISD::BITREVERSE, MVT::v8i32, Custom);
+    setOperationAction(ISD::BITREVERSE, MVT::v4i64, Custom);
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
@@ -20699,6 +20714,59 @@ static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
   return LowerVectorCTPOP(Op, Subtarget, DAG);
 }
 
+static SDValue LowerBITREVERSE(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  SDValue In = Op.getOperand(0);
+  SDLoc DL(Op);
+
+  // For scalars, it's still beneficial to transfer to/from the SIMD unit to
+  // perform the BITREVERSE.
+  if (!VT.isVector()) {
+    MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
+    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
+    Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
+                       DAG.getIntPtrConstant(0, DL));
+  }
+
+  MVT SVT = VT.getVectorElementType();
+  int NumElts = VT.getVectorNumElements();
+  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
+
+  // Decompose 256-bit ops into smaller 128-bit ops.
+  if (VT.is256BitVector()) {
+    SDValue Lo = extract128BitVector(In, 0, DAG, DL);
+    SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
+
+    MVT HalfVT = MVT::getVectorVT(SVT, NumElts / 2);
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+                       DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo),
+                       DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi));
+  }
+
+  assert(VT.is128BitVector() &&
+         "Only 128-bit vector bitreverse lowering supported.");
+
+  // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
+  // perform the BSWAP in the shuffle.
+  // It's best to shuffle using the second operand as this will implicitly
+  // allow memory folding for multiple vectors.
+  SmallVector<SDValue, 16> MaskElts;
+  for (int i = 0; i != NumElts; ++i) {
+    for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
+      int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
+      int PermuteByte = SourceByte | (2 << 5);
+      MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
+    }
+  }
+
+  SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, MaskElts);
+  SDValue Res = DAG.getBitcast(MVT::v16i8, In);
+  Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
+                    Res, Mask);
+  return DAG.getBitcast(VT, Res);
+}
+
 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
   unsigned NewOpc = 0;
   switch (N->getOpcode()) {
@@ -21196,7 +21264,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::ATOMIC_LOAD_OR:
   case ISD::ATOMIC_LOAD_XOR:
   case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
-  case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
+  case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
+  case ISD::BITREVERSE: return LowerBITREVERSE(Op, DAG);
   case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
   case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll
new file mode 100644
index 00000000000..c5b830001a0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll
@@ -0,0 +1,186 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
+
+define i8 @test_bitreverse_i8(i8 %a) {
+; ALL-LABEL: test_bitreverse_i8:
+; ALL: # BB#0:
+; ALL-NEXT: vmovd %edi, %xmm0
+; ALL-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vpextrb $0, %xmm0, %eax
+; ALL-NEXT: retq
+  %b = call i8 @llvm.bitreverse.i8(i8 %a)
+  ret i8 %b
+}
+
+define i16 @test_bitreverse_i16(i16 %a) {
+; ALL-LABEL: test_bitreverse_i16:
+; ALL: # BB#0:
+; ALL-NEXT: vmovd %edi, %xmm0
+; ALL-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %eax
+; ALL-NEXT: retq
+  %b = call i16 @llvm.bitreverse.i16(i16 %a)
+  ret i16 %b
+}
+
+define i32 @test_bitreverse_i32(i32 %a) {
+; ALL-LABEL: test_bitreverse_i32:
+; ALL: # BB#0:
+; ALL-NEXT: vmovd %edi, %xmm0
+; ALL-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %eax
+; ALL-NEXT: retq
+  %b = call i32 @llvm.bitreverse.i32(i32 %a)
+  ret i32 %b
+}
+
+define i64 @test_bitreverse_i64(i64 %a) {
+; ALL-LABEL: test_bitreverse_i64:
+; ALL: # BB#0:
+; ALL-NEXT: vmovq %rdi, %xmm0
+; ALL-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vmovq %xmm0, %rax
+; ALL-NEXT: retq
+  %b = call i64 @llvm.bitreverse.i64(i64 %a)
+  ret i64 %b
+}
+
+define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) {
+; ALL-LABEL: test_bitreverse_v16i8:
+; ALL: # BB#0:
+; ALL-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; ALL-NEXT: retq
+  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
+  ret <16 x i8> %b
+}
+
+define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) {
+; ALL-LABEL: test_bitreverse_v8i16:
+; ALL: # BB#0:
+; ALL-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; ALL-NEXT: retq
+  %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
+  ret <8 x i16> %b
+}
+
+define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) {
+; ALL-LABEL: test_bitreverse_v4i32:
+; ALL: # BB#0:
+; ALL-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; ALL-NEXT: retq
+  %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
+  ret <4 x i32> %b
+}
+
+define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) {
+; ALL-LABEL: test_bitreverse_v2i64:
+; ALL: # BB#0:
+; ALL-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; ALL-NEXT: retq
+  %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
+  ret <2 x i64> %b
+}
+
+define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) {
+; XOPAVX1-LABEL: test_bitreverse_v32i8:
+; XOPAVX1: # BB#0:
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
+; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: test_bitreverse_v32i8:
+; XOPAVX2: # BB#0:
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
+; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: retq
+  %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
+  ret <32 x i8> %b
+}
+
+define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) {
+; XOPAVX1-LABEL: test_bitreverse_v16i16:
+; XOPAVX1: # BB#0:
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
+; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: test_bitreverse_v16i16:
+; XOPAVX2: # BB#0:
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
+; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: retq
+  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
+  ret <16 x i16> %b
+}
+
+define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) {
+; XOPAVX1-LABEL: test_bitreverse_v8i32:
+; XOPAVX1: # BB#0:
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
+; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: test_bitreverse_v8i32:
+; XOPAVX2: # BB#0:
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
+; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: retq
+  %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
+  ret <8 x i32> %b
+}
+
+define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) {
+; XOPAVX1-LABEL: test_bitreverse_v4i64:
+; XOPAVX1: # BB#0:
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
+; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: test_bitreverse_v4i64:
+; XOPAVX2: # BB#0:
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
+; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: retq
+  %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
+  ret <4 x i64> %b
+}
+
+declare i8 @llvm.bitreverse.i8(i8) readnone
+declare i16 @llvm.bitreverse.i16(i16) readnone
+declare i32 @llvm.bitreverse.i32(i32) readnone
+declare i64 @llvm.bitreverse.i64(i64) readnone
+
+declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
+declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
+declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
+declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone
+
+declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>) readnone
+declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
+declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>) readnone
+declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) readnone
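
For reference, the permute control bytes that appear in the vmovdqa masks above (80..95 for bytes, 81,80,83,82,... for words, 83,82,81,80,... for dwords, 87..80,95..88 for qwords) fall directly out of the PermuteByte computation in LowerBITREVERSE: bits 4:0 select a source byte, values 16-31 pick from VPPERM's second source operand, and operation 2 in bits 7:5 (the "2 << 5" term) makes VPPERM emit that byte with its bits reversed. The standalone C++ sketch below recomputes those masks for each 128-bit element width; it is only an illustration of the encoding, not part of the patch, and the helper name bitreverseMask is invented here.

    // Hypothetical helper (not part of the patch): rebuilds the 16-byte VPPERM
    // control mask that LowerBITREVERSE emits for a 128-bit vector with the
    // given element size, and prints it in the same order as the vmovdqa
    // constants in the CHECK lines above.
    #include <cstdio>
    #include <vector>

    static std::vector<int> bitreverseMask(int ScalarSizeInBytes) {
      const int NumElts = 16 / ScalarSizeInBytes;
      std::vector<int> Mask;
      for (int i = 0; i != NumElts; ++i) {
        // Walk each element's bytes high-to-low so the shuffle also performs
        // the per-element BSWAP.
        for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
          int SourceByte = 16 + (i * ScalarSizeInBytes) + j; // byte of operand 2
          int PermuteByte = SourceByte | (2 << 5);           // op 2 = reverse bits
          Mask.push_back(PermuteByte);
        }
      }
      return Mask;
    }

    int main() {
      for (int Bytes : {1, 2, 4, 8}) {
        std::printf("%d-bit elements:", Bytes * 8);
        for (int B : bitreverseMask(Bytes))
          std::printf(" %d", B);
        std::printf("\n");
      }
      // Prints 80..95 for 8-bit elements and 87,86,...,88 for 64-bit elements,
      // matching the test masks above.
    }

Walking each element's bytes high-to-low is what folds the per-element byte swap into the same shuffle, which is why a single VPPERM with a per-type mask covers every element width.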