| author | Artur Pilipenko <apilipenko@azulsystems.com> | 2016-12-13 14:21:14 +0000 |
|---|---|---|
| committer | Artur Pilipenko <apilipenko@azulsystems.com> | 2016-12-13 14:21:14 +0000 |
| commit | c93cc5955f5b93e12f52f50f3ed47ef48cc13726 | |
| tree | 73fa88187e34ffd17a694ce2125b15d11cdb06a4 /llvm/test/CodeGen/X86/load-combine.ll | |
| parent | 01e86444a0e5241b8be4b90dd041b2292035b75e | |
[DAGCombiner] Match load by bytes idiom and fold it into a single load
Match a pattern where a scalar value of a wide type is loaded by several narrow loads and combined by shifts and ors. Fold it into a single load, or into a load and a bswap, if the target supports it.
Assuming a little-endian target:
i8 *a = ...
i32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
=>
i32 val = *((i32*)a)

i8 *a = ...
i32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]
=>
i32 val = BSWAP(*((i32*)a))
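For reference, a minimal C++ sketch of how this idiom typically appears in source code (the helper name read_le32 is illustrative, not part of this patch); compiled for a little-endian target, it produces essentially the narrow-load-and-or chains exercised by the tests below:

```cpp
#include <cstdint>

// Read a 32-bit little-endian value one byte at a time. The shifts and
// ors below are exactly the "load by bytes" idiom: after this patch the
// DAG combiner can fold the whole chain into a single 32-bit load (or a
// load plus a byte swap when the byte order is reversed).
uint32_t read_le32(const uint8_t *p) {
  return static_cast<uint32_t>(p[0]) |
         (static_cast<uint32_t>(p[1]) << 8) |
         (static_cast<uint32_t>(p[2]) << 16) |
         (static_cast<uint32_t>(p[3]) << 24);
}
```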
This optimization was discussed on llvm-dev some time ago in the "Load combine pass" thread. We came to the conclusion that we want to do this transformation late in the pipeline because, in the presence of atomic loads, load widening is an irreversible transformation, and it might hinder other optimizations.
Eventually we'd like to support folding patterns like this one, where the offset has a variable part and a constant part:
i32 val = a[i] | (a[i + 1] << 8) | (a[i + 2] << 16) | (a[i + 3] << 24)
Matching the pattern above is easier at the SelectionDAG level, since address reassociation has already happened and it is clear that the loads are adjacent. Recognizing that the loads are adjacent at the IR level would require looking through GEPs/zexts/adds while analyzing the addresses.
The general scheme is to match OR expressions by recursively calculating the origin of the individual bits which make up the resulting OR value. If all of the OR bits come from memory, verify that the loads are adjacent and that they match either the little- or the big-endian encoding of a wider value. If so, and if the target supports a load of the wider type (and a bswap, if one is needed), generate the load and the bswap.
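As a rough illustration of that scheme, here is a simplified, self-contained C++ sketch (not the actual DAGCombiner code: the Node and ByteProvider types are stand-ins for SelectionDAG nodes, and details such as chains, volatility, alignment, and one-use checks are omitted):

```cpp
#include <cstdint>
#include <optional>

// Illustrative stand-ins for SelectionDAG nodes.
enum class Op { Or, Shl, ZExt, Load };

struct Node {
  Op op;
  const Node *lhs = nullptr;
  const Node *rhs = nullptr; // second operand of Or
  uint64_t shiftAmt = 0;     // constant shift amount of Shl, in bits
  unsigned loadBytes = 0;    // width of a Load in bytes
};

// Origin of one byte of a value: which load it comes from and which byte
// of that load it is (little-endian byte numbering).
struct ByteProvider {
  const Node *load;
  unsigned byteOffset;
};

// Recursively compute where byte `index` of the value of `n` comes from.
// Returns nullopt when the byte is not a plain byte of some load. (This
// sketch conflates "constant zero" with "unknown"; a full implementation
// tracks zero bytes separately.)
std::optional<ByteProvider> calculateByteProvider(const Node *n,
                                                  unsigned index) {
  switch (n->op) {
  case Op::Or: {
    // A byte of an OR must be supplied entirely by one operand.
    auto l = calculateByteProvider(n->lhs, index);
    auto r = calculateByteProvider(n->rhs, index);
    if (l && r)
      return std::nullopt; // both sides contribute: not a simple byte copy
    return l ? l : r;
  }
  case Op::Shl: {
    // Only byte-aligned shifts move whole bytes around.
    if (n->shiftAmt % 8 != 0)
      return std::nullopt;
    unsigned byteShift = static_cast<unsigned>(n->shiftAmt / 8);
    if (index < byteShift)
      return std::nullopt; // these low bytes are zero
    return calculateByteProvider(n->lhs, index - byteShift);
  }
  case Op::ZExt:
    // Low bytes come from the narrow value; the Load case below rejects
    // indices past the loaded width.
    return calculateByteProvider(n->lhs, index);
  case Op::Load:
    if (index >= n->loadBytes)
      return std::nullopt;
    return ByteProvider{n, index};
  }
  return std::nullopt;
}
```

The caller would compute a provider for every byte of the OR value and then check that the (load, byteOffset) pairs cover one contiguous block of memory in either little- or big-endian order; that choice decides between a plain wide load and a load followed by a byte swap.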
Reviewed By: hfinkel, RKSimon, filcab
Differential Revision: https://reviews.llvm.org/D26149
llvm-svn: 289538
Diffstat (limited to 'llvm/test/CodeGen/X86/load-combine.ll')
| -rw-r--r-- | llvm/test/CodeGen/X86/load-combine.ll | 733 |
1 file changed, 733 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/X86/load-combine.ll b/llvm/test/CodeGen/X86/load-combine.ll
new file mode 100644
index 00000000000..ffcc0a9bb8d
--- /dev/null
+++ b/llvm/test/CodeGen/X86/load-combine.ll
@@ -0,0 +1,733 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=CHECK64
+
+; i8* p;
+; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24)
+define i32 @load_i32_by_i8(i32*) {
+; CHECK-LABEL: load_i32_by_i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl (%eax), %eax
+; CHECK-NEXT: retl
+;
+; CHECK64-LABEL: load_i32_by_i8:
+; CHECK64: # BB#0:
+; CHECK64-NEXT: movl (%rdi), %eax
+; CHECK64-NEXT: retq
+
+  %2 = bitcast i32* %0 to i8*
+  %3 = load i8, i8* %2, align 1
+  %4 = zext i8 %3 to i32
+  %5 = getelementptr inbounds i8, i8* %2, i32 1
+  %6 = load i8, i8* %5, align 1
+  %7 = zext i8 %6 to i32
+  %8 = shl nuw nsw i32 %7, 8
+  %9 = or i32 %8, %4
+  %10 = getelementptr inbounds i8, i8* %2, i32 2
+  %11 = load i8, i8* %10, align 1
+  %12 = zext i8 %11 to i32
+  %13 = shl nuw nsw i32 %12, 16
+  %14 = or i32 %9, %13
+  %15 = getelementptr inbounds i8, i8* %2, i32 3
+  %16 = load i8, i8* %15, align 1
+  %17 = zext i8 %16 to i32
+  %18 = shl nuw nsw i32 %17, 24
+  %19 = or i32 %14, %18
+  ret i32 %19
+}
+
+; i8* p;
+; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3]
+define i32 @load_i32_by_i8_bswap(i32*) {
+; CHECK-LABEL: load_i32_by_i8_bswap:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl (%eax), %eax
+; CHECK-NEXT: bswapl %eax
+; CHECK-NEXT: retl
+;
+; CHECK64-LABEL: load_i32_by_i8_bswap:
+; CHECK64: # BB#0:
+; CHECK64-NEXT: movl (%rdi), %eax
+; CHECK64-NEXT: bswapl %eax
+; CHECK64-NEXT: retq
+
+  %2 = bitcast i32* %0 to i8*
+  %3 = load i8, i8* %2, align 1
+  %4 = zext i8 %3 to i32
+  %5 = shl nuw nsw i32 %4, 24
+  %6 = getelementptr inbounds i8, i8* %2, i32 1
+  %7 = load i8, i8* %6, align 1
+  %8 = zext i8 %7 to i32
+  %9 = shl nuw nsw i32 %8, 16
+  %10 = or i32 %9, %5
+  %11 = getelementptr inbounds i8, i8* %2, i32 2
+  %12 = load i8, i8* %11, align 1
+  %13 = zext i8 %12 to i32
+  %14 = shl nuw nsw i32 %13, 8
+  %15 = or i32 %10, %14
+  %16 = getelementptr inbounds i8, i8* %2, i32 3
+  %17 = load i8, i8* %16, align 1
+  %18 = zext i8 %17 to i32
+  %19 = or i32 %15, %18
+  ret i32 %19
+}
+
+; i16* p;
+; (i32) p[0] | ((i32) p[1] << 16)
+define i32 @load_i32_by_i16(i32*) {
+; CHECK-LABEL: load_i32_by_i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl (%eax), %eax
+; CHECK-NEXT: retl
+;
+; CHECK64-LABEL: load_i32_by_i16:
+; CHECK64: # BB#0:
+; CHECK64-NEXT: movl (%rdi), %eax
+; CHECK64-NEXT: retq
+
+  %2 = bitcast i32* %0 to i16*
+  %3 = load i16, i16* %2, align 1
+  %4 = zext i16 %3 to i32
+  %5 = getelementptr inbounds i16, i16* %2, i32 1
+  %6 = load i16, i16* %5, align 1
+  %7 = zext i16 %6 to i32
+  %8 = shl nuw nsw i32 %7, 16
+  %9 = or i32 %8, %4
+  ret i32 %9
+}
+
+; i16* p_16;
+; i8* p_8 = (i8*) p_16;
+; (i32) p_16[0] | ((i32) p[2] << 16) | ((i32) p[3] << 24)
+define i32 @load_i32_by_i16_i8(i32*) {
+; CHECK-LABEL: load_i32_by_i16_i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl (%eax), %eax
+; CHECK-NEXT: retl
+;
+; CHECK64-LABEL: load_i32_by_i16_i8:
+; CHECK64: # BB#0:
+; CHECK64-NEXT: movl (%rdi), %eax
+; CHECK64-NEXT: retq
+
+  %2 = bitcast i32* %0 to i16*
+  %3 = bitcast i32* %0 to i8*
+  %4 = load i16, i16* %2, align 1
+  %5 = zext i16 %4 to i32
+  %6 = getelementptr inbounds i8, i8* %3, i32 2
+  %7 = load i8, i8* %6, align 1
+  %8 = zext i8 %7 to i32
+  %9 = shl nuw nsw i32 %8, 16
+  %10 = getelementptr inbounds i8, i8* %3, i32 3
+  %11 = load i8, i8* %10, align 1
+  %12 = zext i8 %11 to i32
+  %13 = shl nuw nsw i32 %12, 24
+  %14 = or i32 %9, %13
+  %15 = or i32 %14, %5
+  ret i32 %15
+}
+
+
+; i8* p;
+; (i32) ((i16) p[0] | ((i16) p[1] << 8)) | (((i32) ((i16) p[3] | ((i16) p[4] << 8)) << 16)
+define i32 @load_i32_by_i16_by_i8(i32*) {
+; CHECK-LABEL: load_i32_by_i16_by_i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl (%eax), %eax
+; CHECK-NEXT: retl
+;
+; CHECK64-LABEL: load_i32_by_i16_by_i8:
+; CHECK64: # BB#0:
+; CHECK64-NEXT: movl (%rdi), %eax
+; CHECK64-NEXT: retq
+
+  %2 = bitcast i32* %0 to i8*
+  %3 = load i8, i8* %2, align 1
+  %4 = zext i8 %3 to i16
+  %5 = getelementptr inbounds i8, i8* %2, i32 1
+  %6 = load i8, i8* %5, align 1
+  %7 = zext i8 %6 to i16
+  %8 = shl nuw nsw i16 %7, 8
+  %9 = or i16 %8, %4
+  %10 = getelementptr inbounds i8, i8* %2, i32 2
+  %11 = load i8, i8* %10, align 1
+  %12 = zext i8 %11 to i16
+  %13 = getelementptr inbounds i8, i8* %2, i32 3
+  %14 = load i8, i8* %13, align 1
+  %15 = zext i8 %14 to i16
+  %16 = shl nuw nsw i16 %15, 8
+  %17 = or i16 %16, %12
+  %18 = zext i16 %9 to i32
+  %19 = zext i16 %17 to i32
+  %20 = shl nuw nsw i32 %19, 16
+  %21 = or i32 %20, %18
+  ret i32 %21
+}
+
+; i8* p;
+; ((i32) (((i16) p[0] << 8) | (i16) p[1]) << 16) | (i32) (((i16) p[3] << 8) | (i16) p[4])
+define i32 @load_i32_by_i16_by_i8_bswap(i32*) {
+; CHECK-LABEL: load_i32_by_i16_by_i8_bswap:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl (%eax), %eax
+; CHECK-NEXT: bswapl %eax
+; CHECK-NEXT: retl
+;
+; CHECK64-LABEL: load_i32_by_i16_by_i8_bswap:
+; CHECK64: # BB#0:
+; CHECK64-NEXT: movl (%rdi), %eax
+; CHECK64-NEXT: bswapl %eax
+; CHECK64-NEXT: retq
+
+  %2 = bitcast i32* %0 to i8*
+  %3 = load i8, i8* %2, align 1
+  %4 = zext i8 %3 to i16
+  %5 = getelementptr inbounds i8, i8* %2, i32 1
+  %6 = load i8, i8* %5, align 1
+  %7 = zext i8 %6 to i16
+  %8 = shl nuw nsw i16 %4, 8
+  %9 = or i16 %8, %7
+  %10 = getelementptr inbounds i8, i8* %2, i32 2
+  %11 = load i8, i8* %10, align 1
+  %12 = zext i8 %11 to i16
+  %13 = getelementptr inbounds i8, i8* %2, i32 3
+  %14 = load i8, i8* %13, align 1
+  %15 = zext i8 %14 to i16
+  %16 = shl nuw nsw i16 %12, 8
+  %17 = or i16 %16, %15
+  %18 = zext i16 %9 to i32
+  %19 = zext i16 %17 to i32
+  %20 = shl nuw nsw i32 %18, 16
+  %21 = or i32 %20, %19
+  ret i32 %21
+}
+
+; i8* p;
+; (i64) p[0] | ((i64) p[1] << 8) | ((i64) p[2] << 16) | ((i64) p[3] << 24) | ((i64) p[4] << 32) | ((i64) p[5] << 40) | ((i64) p[6] << 48) | ((i64) p[7] << 56)
+define i64 @load_i64_by_i8(i64*) {
+; CHECK-LABEL: load_i64_by_i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushl %edi
+; CHECK-NEXT: .Lcfi0:
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: .Lcfi1:
+; CHECK-NEXT: .cfi_def_cfa_offset 12
+; CHECK-NEXT: .Lcfi2:
+; CHECK-NEXT: .cfi_offset %esi, -12
+; CHECK-NEXT: .Lcfi3:
+; CHECK-NEXT: .cfi_offset %edi, -8
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movzbl (%ecx), %eax
+; CHECK-NEXT: movzbl 1(%ecx), %edx
+; CHECK-NEXT: shll $8, %edx
+; CHECK-NEXT: orl %eax, %edx
+; CHECK-NEXT: movzbl 2(%ecx), %esi
+; CHECK-NEXT: shll $16, %esi
+; CHECK-NEXT: orl %edx, %esi
+; CHECK-NEXT: movzbl 3(%ecx), %eax
+; CHECK-NEXT: shll $24, %eax
+; CHECK-NEXT: orl %esi, %eax
+; CHECK-NEXT: movzbl 4(%ecx), %edx
+; CHECK-NEXT: movzbl 5(%ecx), %esi
+; CHECK-NEXT: shll $8, %esi
+; CHECK-NEXT: orl %edx, %esi
+; CHECK-NEXT: movzbl 6(%ecx), %edi
+; CHECK-NEXT: shll $16, %edi
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movzbl 7(%ecx), %edx
+; CHECK-NEXT: shll $24, %edx
+; CHECK-NEXT: orl %edi, %edx
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: popl %edi
+; CHECK-NEXT: retl
+;
+; CHECK64-LABEL: load_i64_by_i8:
+; CHECK64: # BB#0:
+; CHECK64-NEXT: movq (%rdi), %rax
+; CHECK64-NEXT: retq
+
+  %2 = bitcast i64* %0 to i8*
+  %3 = load i8, i8* %2, align 1
+  %4 = zext i8 %3 to i64
+  %5 = getelementptr inbounds i8, i8* %2, i64 1
+  %6 = load i8, i8* %5, align 1
+  %7 = zext i8 %6 to i64
+  %8 = shl nuw nsw i64 %7, 8
+  %9 = or i64 %8, %4
+  %10 = getelementptr inbounds i8, i8* %2, i64 2
+  %11 = load i8, i8* %10, align 1
+  %12 = zext i8 %11 to i64
+  %13 = shl nuw nsw i64 %12, 16
+  %14 = or i64 %9, %13
+  %15 = getelementptr inbounds i8, i8* %2, i64 3
+  %16 = load i8, i8* %15, align 1
+  %17 = zext i8 %16 to i64
+  %18 = shl nuw nsw i64 %17, 24
+  %19 = or i64 %14, %18
+  %20 = getelementptr inbounds i8, i8* %2, i64 4
+  %21 = load i8, i8* %20, align 1
+  %22 = zext i8 %21 to i64
+  %23 = shl nuw nsw i64 %22, 32
+  %24 = or i64 %19, %23
+  %25 = getelementptr inbounds i8, i8* %2, i64 5
+  %26 = load i8, i8* %25, align 1
+  %27 = zext i8 %26 to i64
+  %28 = shl nuw nsw i64 %27, 40
+  %29 = or i64 %24, %28
+  %30 = getelementptr inbounds i8, i8* %2, i64 6
+  %31 = load i8, i8* %30, align 1
+  %32 = zext i8 %31 to i64
+  %33 = shl nuw nsw i64 %32, 48
+  %34 = or i64 %29, %33
+  %35 = getelementptr inbounds i8, i8* %2, i64 7
+  %36 = load i8, i8* %35, align 1
+  %37 = zext i8 %36 to i64
+  %38 = shl nuw i64 %37, 56
+  %39 = or i64 %34, %38
+  ret i64 %39
+}
+
+; i8* p;
+; ((i64) p[0] << 56) | ((i64) p[1] << 48) | ((i64) p[2] << 40) | ((i64) p[3] << 32) | ((i64) p[4] << 24) | ((i64) p[5] << 16) | ((i64) p[6] << 8) | (i64) p[7]
+define i64 @load_i64_by_i8_bswap(i64*) {
+; CHECK-LABEL: load_i64_by_i8_bswap:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: .Lcfi4:
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: .Lcfi5:
+; CHECK-NEXT: .cfi_offset %esi, -8
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movzbl (%eax), %ecx
+; CHECK-NEXT: shll $24, %ecx
+; CHECK-NEXT: movzbl 1(%eax), %edx
+; CHECK-NEXT: shll $16, %edx
+; CHECK-NEXT: orl %ecx, %edx
+; CHECK-NEXT: movzbl 2(%eax), %ecx
+; CHECK-NEXT: shll $8, %ecx
+; CHECK-NEXT: orl %edx, %ecx
+; CHECK-NEXT: movzbl 3(%eax), %edx
+; CHECK-NEXT: orl %ecx, %edx
+; CHECK-NEXT: movzbl 4(%eax), %ecx
+; CHECK-NEXT: shll $24, %ecx
+; CHECK-NEXT: movzbl 5(%eax), %esi
+; CHECK-NEXT: shll $16, %esi
+; CHECK-NEXT: orl %ecx, %esi
+; CHECK-NEXT: movzbl 6(%eax), %ecx
+; CHECK-NEXT: shll $8, %ecx
+; CHECK-NEXT: orl %esi, %ecx
+; CHECK-NEXT: movzbl 7(%eax), %eax
+; CHECK-NEXT: orl %ecx, %eax
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: retl
+;
+; CHECK64-LABEL: load_i64_by_i8_bswap:
+; CHECK64: # BB#0:
+; CHECK64-NEXT: movq (%rdi), %rax
+; CHECK64-NEXT: bswapq %rax
+; CHECK64-NEXT: retq
+
+  %2 = bitcast i64* %0 to i8*
+  %3 = load i8, i8* %2, align 1
+  %4 = zext i8 %3 to i64
+  %5 = shl nuw i64 %4, 56
+  %6 = getelementptr inbounds i8, i8* %2, i64 1
+  %7 = load i8, i8* %6, align 1
+  %8 = zext i8 %7 to i64
+  %9 = shl nuw nsw i64 %8, 48
+  %10 = or i64 %9, %5
+  %11 = getelementptr inbounds i8, i8* %2, i64 2
+  %12 = load i8, i8* %11, align 1
+  %13 = zext i8 %12 to i64
+  %14 = shl nuw nsw i64 %13, 40
+  %15 = or i64 %10, %14
+  %16 = getelementptr inbounds i8, i8* %2, i64 3
+  %17 = load i8, i8* %16, align 1
+  %18 = zext i8 %17 to i64
+  %19 = shl nuw nsw i64 %18, 32
+  %20 = or i64 %15, %19
+  %21 = getelementptr inbounds i8, i8* %2, i64 4
+  %22 = load i8, i8* %21, align 1
+  %23 = zext i8 %22 to i64
+  %24 = shl nuw nsw i64 %23, 24
+  %25 = or i64 %20, %24
+  %26 = getelementptr inbounds i8, i8* %2, i64 5
+  %27 = load i8, i8* %26, align 1
+  %28 = zext i8 %27 to i64
+  %29 = shl nuw nsw i64 %28, 16
+  %30 = or i64 %25, %29
+  %31 = getelementptr inbounds i8, i8* %2, i64 6
+  %32 = load i8, i8* %31, align 1
+  %33 = zext i8 %32 to i64
+  %34 = shl nuw nsw i64 %33, 8
+  %35 = or i64 %30, %34
+  %36 = getelementptr inbounds i8, i8* %2, i64 7
+  %37 = load i8, i8* %36, align 1
+  %38 = zext i8 %37 to i64
+  %39 = or i64 %35, %38
+  ret i64 %39
+}
+
+; Part of the load by bytes pattern is used outside of the pattern
+; i8* p;
+; i32 x = (i32) p[1]
+; res = ((i32) p[0] << 24) | (x << 16) | ((i32) p[2] << 8) | (i32) p[3]
+; x | res
+define i32 @load_i32_by_i8_bswap_uses(i32*) {
+; CHECK-LABEL: load_i32_by_i8_bswap_uses:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: .Lcfi6:
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: .Lcfi7:
+; CHECK-NEXT: .cfi_offset %esi, -8
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movzbl (%eax), %ecx
+; CHECK-NEXT: shll $24, %ecx
+; CHECK-NEXT: movzbl 1(%eax), %edx
+; CHECK-NEXT: movl %edx, %esi
+; CHECK-NEXT: shll $16, %esi
+; CHECK-NEXT: orl %ecx, %esi
+; CHECK-NEXT: movzbl 2(%eax), %ecx
+; CHECK-NEXT: shll $8, %ecx
+; CHECK-NEXT: orl %esi, %ecx
+; CHECK-NEXT: movzbl 3(%eax), %eax
+; CHECK-NEXT: orl %ecx, %eax
+; CHECK-NEXT: orl %edx, %eax
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: retl
+;
+; CHECK64-LABEL: load_i32_by_i8_bswap_uses:
+; CHECK64: # BB#0:
+; CHECK64-NEXT: movzbl (%rdi), %eax
+; CHECK64-NEXT: shll $24, %eax
+; CHECK64-NEXT: movzbl 1(%rdi), %ecx
+; CHECK64-NEXT: movl %ecx, %edx
+; CHECK64-NEXT: shll $16, %edx
+; CHECK64-NEXT: orl %eax, %edx
+; CHECK64-NEXT: movzbl 2(%rdi), %esi
+; CHECK64-NEXT: shll $8, %esi
+; CHECK64-NEXT: orl %edx, %esi
+; CHECK64-NEXT: movzbl 3(%rdi), %eax
+; CHECK64-NEXT: orl %esi, %eax
+; CHECK64-NEXT: orl %ecx, %eax
+; CHECK64-NEXT: retq
+
+  %2 = bitcast i32* %0 to i8*
+  %3 = load i8, i8* %2, align 1
+  %4 = zext i8 %3 to i32
+  %5 = shl nuw nsw i32 %4, 24
+  %6 = getelementptr inbounds i8, i8* %2, i32 1
+  %7 = load i8, i8* %6, align 1
+  %8 = zext i8 %7 to i32
+  %9 = shl nuw nsw i32 %8, 16
+  %10 = or i32 %9, %5
+  %11 = getelementptr inbounds i8, i8* %2, i32 2
+  %12 = load i8, i8* %11, align 1
+  %13 = zext i8 %12 to i32
+  %14 = shl nuw nsw i32 %13, 8
+  %15 = or i32 %10, %14
+  %16 = getelementptr inbounds i8, i8* %2, i32 3
+  %17 = load i8, i8* %16, align 1
+  %18 = zext i8 %17 to i32
+  %19 = or i32 %15, %18
+  ; Use individual part of the pattern outside of the pattern
+  %20 = or i32 %8, %19
+  ret i32 %20
+}
+
+; One of the loads is volatile
+; i8* p;
+; p0 = volatile *p;
+; ((i32) p0 << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3]
+define i32 @load_i32_by_i8_bswap_volatile(i32*) {
+; CHECK-LABEL: load_i32_by_i8_bswap_volatile:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movzbl (%eax), %ecx
+; CHECK-NEXT: shll $24, %ecx
+; CHECK-NEXT: movzbl 1(%eax), %edx
+; CHECK-NEXT: shll $16, %edx
+; CHECK-NEXT: orl %ecx, %edx
+; CHECK-NEXT: movzbl 2(%eax), %ecx
+; CHECK-NEXT: shll $8, %ecx
+; CHECK-NEXT: orl %edx, %ecx
+; CHECK-NEXT: movzbl 3(%eax), %eax
+; CHECK-NEXT: orl %ecx, %eax
+; CHECK-NEXT: retl
+;
+; CHECK64-LABEL: load_i32_by_i8_bswap_volatile:
+; CHECK64: # BB#0:
+; CHECK64-NEXT: movzbl (%rdi), %eax
+; CHECK64-NEXT: shll $24, %eax
+; CHECK64-NEXT: movzbl 1(%rdi), %ecx
+; CHECK64-NEXT: shll $16, %ecx
+; CHECK64-NEXT: orl %eax, %ecx
+; CHECK64-NEXT: movzbl 2(%rdi), %edx
+; CHECK64-NEXT: shll $8, %edx
+; CHECK64-NEXT: orl %ecx, %edx
+; CHECK64-NEXT: movzbl 3(%rdi), %eax
+; CHECK64-NEXT: orl %edx, %eax
+; CHECK64-NEXT: retq
+
+  %2 = bitcast i32* %0 to i8*
+  %3 = load volatile i8, i8* %2, align 1
+  %4 = zext i8 %3 to i32
+  %5 = shl nuw nsw i32 %4, 24
+  %6 = getelementptr inbounds i8, i8* %2, i32 1
+  %7 = load i8, i8* %6, align 1
+  %8 = zext i8 %7 to i32
+  %9 = shl nuw nsw i32 %8, 16
+  %10 = or i32 %9, %5
+  %11 = getelementptr inbounds i8, i8* %2, i32 2
+  %12 = load i8, i8* %11, align 1
+  %13 = zext i8 %12 to i32
+  %14 = shl nuw nsw i32 %13, 8
+  %15 = or i32 %10, %14
+  %16 = getelementptr inbounds i8, i8* %2, i32 3
+  %17 = load i8, i8* %16, align 1
+  %18 = zext i8 %17 to i32
+  %19 = or i32 %15, %18
+  ret i32 %19
+}
+
+; There is a store in between individual loads
+; i8* p, q;
+; res1 = ((i32) p[0] << 24) | ((i32) p[1] << 16)
+; *q = 0;
+; res2 = ((i32) p[2] << 8) | (i32) p[3]
+; res1 | res2
+define i32 @load_i32_by_i8_bswap_store_in_between(i32*, i32*) {
+; CHECK-LABEL: load_i32_by_i8_bswap_store_in_between:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: .Lcfi8:
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: .Lcfi9:
+; CHECK-NEXT: .cfi_offset %esi, -8
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movzbl (%ecx), %edx
+; CHECK-NEXT: shll $24, %edx
+; CHECK-NEXT: movzbl 1(%ecx), %esi
+; CHECK-NEXT: movl $0, (%eax)
+; CHECK-NEXT: shll $16, %esi
+; CHECK-NEXT: orl %edx, %esi
+; CHECK-NEXT: movzbl 2(%ecx), %edx
+; CHECK-NEXT: shll $8, %edx
+; CHECK-NEXT: orl %esi, %edx
+; CHECK-NEXT: movzbl 3(%ecx), %eax
+; CHECK-NEXT: orl %edx, %eax
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: retl
+;
+; CHECK64-LABEL: load_i32_by_i8_bswap_store_in_between:
+; CHECK64: # BB#0:
+; CHECK64-NEXT: movzbl (%rdi), %eax
+; CHECK64-NEXT: shll $24, %eax
+; CHECK64-NEXT: movzbl 1(%rdi), %ecx
+; CHECK64-NEXT: movl $0, (%rsi)
+; CHECK64-NEXT: shll $16, %ecx
+; CHECK64-NEXT: orl %eax, %ecx
+; CHECK64-NEXT: movzbl 2(%rdi), %edx
+; CHECK64-NEXT: shll $8, %edx
+; CHECK64-NEXT: orl %ecx, %edx
+; CHECK64-NEXT: movzbl 3(%rdi), %eax
+; CHECK64-NEXT: orl %edx, %eax
+; CHECK64-NEXT: retq
+
+  %3 = bitcast i32* %0 to i8*
+  %4 = load i8, i8* %3, align 1
+  %5 = zext i8 %4 to i32
+  %6 = shl nuw nsw i32 %5, 24
+  %7 = getelementptr inbounds i8, i8* %3, i32 1
+  %8 = load i8, i8* %7, align 1
+  ; This store will prevent folding of the pattern
+  store i32 0, i32* %1
+  %9 = zext i8 %8 to i32
+  %10 = shl nuw nsw i32 %9, 16
+  %11 = or i32 %10, %6
+  %12 = getelementptr inbounds i8, i8* %3, i32 2
+  %13 = load i8, i8* %12, align 1
+  %14 = zext i8 %13 to i32
+  %15 = shl nuw nsw i32 %14, 8
+  %16 = or i32 %11, %15
+  %17 = getelementptr inbounds i8, i8* %3, i32 3
+  %18 = load i8, i8* %17, align 1
+  %19 = zext i8 %18 to i32
+  %20 = or i32 %16, %19
+  ret i32 %20
+}
+
+; One of the loads is from an unrelated location
+; i8* p, q;
+; ((i32) p[0] << 24) | ((i32) q[1] << 16) | ((i32) p[2] << 8) | (i32) p[3]
+define i32 @load_i32_by_i8_bswap_unrelated_load(i32*, i32*) {
+; CHECK-LABEL: load_i32_by_i8_bswap_unrelated_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movzbl (%ecx), %edx
+; CHECK-NEXT: shll $24, %edx
+; CHECK-NEXT: movzbl 1(%eax), %eax
+; CHECK-NEXT: shll $16, %eax
+; CHECK-NEXT: orl %edx, %eax
+; CHECK-NEXT: movzbl 2(%ecx), %edx
+; CHECK-NEXT: shll $8, %edx
+; CHECK-NEXT: orl %eax, %edx
+; CHECK-NEXT: movzbl 3(%ecx), %eax
+; CHECK-NEXT: orl %edx, %eax
+; CHECK-NEXT: retl
+;
+; CHECK64-LABEL: load_i32_by_i8_bswap_unrelated_load:
+; CHECK64: # BB#0:
+; CHECK64-NEXT: movzbl (%rdi), %eax
+; CHECK64-NEXT: shll $24, %eax
+; CHECK64-NEXT: movzbl 1(%rsi), %ecx
+; CHECK64-NEXT: shll $16, %ecx
+; CHECK64-NEXT: orl %eax, %ecx
+; CHECK64-NEXT: movzbl 2(%rdi), %edx
+; CHECK64-NEXT: shll $8, %edx
+; CHECK64-NEXT: orl %ecx, %edx
+; CHECK64-NEXT: movzbl 3(%rdi), %eax
+; CHECK64-NEXT: orl %edx, %eax
+; CHECK64-NEXT: retq
+
+  %3 = bitcast i32* %0 to i8*
+  %4 = bitcast i32* %1 to i8*
+  %5 = load i8, i8* %3, align 1
+  %6 = zext i8 %5 to i32
+  %7 = shl nuw nsw i32 %6, 24
+  ; Load from an unrelated address
+  %8 = getelementptr inbounds i8, i8* %4, i32 1
+  %9 = load i8, i8* %8, align 1
+  %10 = zext i8 %9 to i32
+  %11 = shl nuw nsw i32 %10, 16
+  %12 = or i32 %11, %7
+  %13 = getelementptr inbounds i8, i8* %3, i32 2
+  %14 = load i8, i8* %13, align 1
+  %15 = zext i8 %14 to i32
+  %16 = shl nuw nsw i32 %15, 8
+  %17 = or i32 %12, %16
+  %18 = getelementptr inbounds i8, i8* %3, i32 3
+  %19 = load i8, i8* %18, align 1
+  %20 = zext i8 %19 to i32
+  %21 = or i32 %17, %20
+  ret i32 %21
+}
+
+; Non-zero offsets are not supported for now
+; i8* p;
+; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24)
+define i32 @load_i32_by_i8_unsupported_offset(i32*) {
+; CHECK-LABEL: load_i32_by_i8_unsupported_offset:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movzbl (%eax), %ecx
+; CHECK-NEXT: movzbl 2(%eax), %edx
+; CHECK-NEXT: shll $8, %edx
+; CHECK-NEXT: orl %ecx, %edx
+; CHECK-NEXT: movzbl 3(%eax), %ecx
+; CHECK-NEXT: shll $16, %ecx
+; CHECK-NEXT: orl %edx, %ecx
+; CHECK-NEXT: movzbl 4(%eax), %eax
+; CHECK-NEXT: shll $24, %eax
+; CHECK-NEXT: orl %ecx, %eax
+; CHECK-NEXT: retl
+;
+; CHECK64-LABEL: load_i32_by_i8_unsupported_offset:
+; CHECK64: # BB#0:
+; CHECK64-NEXT: movzbl (%rdi), %eax
+; CHECK64-NEXT: movzbl 2(%rdi), %ecx
+; CHECK64-NEXT: shll $8, %ecx
+; CHECK64-NEXT: orl %eax, %ecx
+; CHECK64-NEXT: movzbl 3(%rdi), %edx
+; CHECK64-NEXT: shll $16, %edx
+; CHECK64-NEXT: orl %ecx, %edx
+; CHECK64-NEXT: movzbl 4(%rdi), %eax
+; CHECK64-NEXT: shll $24, %eax
+; CHECK64-NEXT: orl %edx, %eax
+; CHECK64-NEXT: retq
+
+  %2 = bitcast i32* %0 to i8*
+  %3 = getelementptr inbounds i8, i8* %2, i32 1
+  %4 = load i8, i8* %2, align 1
+  %5 = zext i8 %4 to i32
+  %6 = getelementptr inbounds i8, i8* %2, i32 2
+  %7 = load i8, i8* %6, align 1
+  %8 = zext i8 %7 to i32
+  %9 = shl nuw nsw i32 %8, 8
+  %10 = or i32 %9, %5
+  %11 = getelementptr inbounds i8, i8* %2, i32 3
+  %12 = load i8, i8* %11, align 1
+  %13 = zext i8 %12 to i32
+  %14 = shl nuw nsw i32 %13, 16
+  %15 = or i32 %10, %14
+  %16 = getelementptr inbounds i8, i8* %2, i32 4
+  %17 = load i8, i8* %16, align 1
+  %18 = zext i8 %17 to i32
+  %19 = shl nuw nsw i32 %18, 24
+  %20 = or i32 %15, %19
+  ret i32 %20
+}
+
+; i8* p; i32 i;
+; ((i32) p[i] << 24) | ((i32) p[i + 1] << 16) | ((i32) p[i + 2] << 8) | (i32) p[i + 3]
+define i32 @load_i32_by_i8_bswap_base_index_offset(i32*, i32) {
+; CHECK-LABEL: load_i32_by_i8_bswap_base_index_offset:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl (%ecx,%eax), %eax
+; CHECK-NEXT: bswapl %eax
+; CHECK-NEXT: retl
+;
+; CHECK64-LABEL: load_i32_by_i8_bswap_base_index_offset:
+; CHECK64: # BB#0:
+; CHECK64-NEXT: movslq %esi, %rax
+; CHECK64-NEXT: movzbl (%rdi,%rax), %ecx
+; CHECK64-NEXT: shll $24, %ecx
+; CHECK64-NEXT: movzbl 1(%rdi,%rax), %edx
+; CHECK64-NEXT: shll $16, %edx
+; CHECK64-NEXT: orl %ecx, %edx
+; CHECK64-NEXT: movzbl 2(%rdi,%rax), %ecx
+; CHECK64-NEXT: shll $8, %ecx
+; CHECK64-NEXT: orl %edx, %ecx
+; CHECK64-NEXT: movzbl 3(%rdi,%rax), %eax
+; CHECK64-NEXT: orl %ecx, %eax
+; CHECK64-NEXT: retq
+; Currently we don't fold the pattern for x86-64 target because we don't see
+; that the loads are adjacent. It happens because BaseIndexOffset doesn't look
+; through zexts.
+
+  %3 = bitcast i32* %0 to i8*
+  %4 = getelementptr inbounds i8, i8* %3, i32 %1
+  %5 = load i8, i8* %4, align 1
+  %6 = zext i8 %5 to i32
+  %7 = shl nuw nsw i32 %6, 24
+  %8 = add nuw nsw i32 %1, 1
+  %9 = getelementptr inbounds i8, i8* %3, i32 %8
+  %10 = load i8, i8* %9, align 1
+  %11 = zext i8 %10 to i32
+  %12 = shl nuw nsw i32 %11, 16
+  %13 = or i32 %12, %7
+  %14 = add nuw nsw i32 %1, 2
+  %15 = getelementptr inbounds i8, i8* %3, i32 %14
+  %16 = load i8, i8* %15, align 1
+  %17 = zext i8 %16 to i32
+  %18 = shl nuw nsw i32 %17, 8
+  %19 = or i32 %13, %18
+  %20 = add nuw nsw i32 %1, 3
+  %21 = getelementptr inbounds i8, i8* %3, i32 %20
+  %22 = load i8, i8* %21, align 1
+  %23 = zext i8 %22 to i32
+  %24 = or i32 %19, %23
+  ret i32 %24
+}

