| author | Craig Topper <craig.topper@gmail.com> | 2016-07-31 20:20:05 +0000 |
|---|---|---|
| committer | Craig Topper <craig.topper@gmail.com> | 2016-07-31 20:20:05 +0000 |
| commit | 7afdc0fb25e04f3d824fd81c8872cd57daab0b7b | |
| tree | 850042a35f07bafa03fb4ff3630842c3dccb9975 /llvm | |
| parent | 4c53e603604fc77c74efe8a2eb4c8e0b3f3323c4 | |
[AVX512] Always use EVEX encodings for 128/256-bit move instructions in getLoadStoreRegOpcode if VLX is supported.
llvm-svn: 277305
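The heart of the change is a feature-ordered fallback chain: prefer the EVEX-encoded `Z128`/`Z256` opcodes whenever the subtarget has VLX, otherwise the VEX forms, otherwise legacy SSE. Below is a minimal, self-contained C++ sketch of that pattern; the enum values and the standalone function are illustrative stand-ins, not LLVM's actual enums or API.

```cpp
// Illustrative sketch of the opcode-selection pattern this commit installs
// inside getLoadStoreRegOpcode(); names here are hypothetical stand-ins.
#include <cstdio>

enum Opc {
  MOVAPSrm, MOVUPSrm,            // legacy SSE encodings
  VMOVAPSrm, VMOVUPSrm,          // VEX encodings (AVX)
  VMOVAPSZ128rm, VMOVUPSZ128rm   // EVEX encodings (AVX-512 + VLX)
};

// 128-bit vector load: prefer EVEX whenever VLX is available (EVEX is the
// only encoding that can reach XMM16-XMM31), then fall back to VEX, then
// SSE. Aligned forms are only usable when the stack slot is known aligned.
static Opc getLoad128(bool HasVLX, bool HasAVX, bool isStackAligned) {
  if (isStackAligned)
    return HasVLX ? VMOVAPSZ128rm : HasAVX ? VMOVAPSrm : MOVAPSrm;
  return HasVLX ? VMOVUPSZ128rm : HasAVX ? VMOVUPSrm : MOVUPSrm;
}

int main() {
  // On an SKX-like subtarget (VLX present) the EVEX form is chosen even for
  // xmm0-xmm15, which is exactly what the regenerated tests below check.
  std::printf("%d\n", getLoad128(true, true, true));   // VMOVAPSZ128rm
  std::printf("%d\n", getLoad128(false, true, false)); // VMOVUPSrm
}
```

The same two-axis choice (encoding x alignment) appears in the patch for both the 16-byte and 32-byte cases.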
Diffstat (limited to 'llvm')
| file | changed lines |
|---|---|
| llvm/lib/Target/X86/X86InstrInfo.cpp | 39 |
| llvm/test/CodeGen/X86/avx512vbmivl-intrinsics.ll | 1 |
| llvm/test/CodeGen/X86/vector-half-conversions.ll | 278 |
3 files changed, 155 insertions, 163 deletions
```diff
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index f8240f5e939..dda11809208 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -4840,6 +4840,7 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
                                       bool load) {
   bool HasAVX = STI.hasAVX();
   bool HasAVX512 = STI.hasAVX512();
+  bool HasVLX = STI.hasVLX();
 
   if (HasAVX512 && isMaskRegClass(RC))
     return getLoadStoreMaskRegOpcode(RC, load);
@@ -4884,38 +4885,28 @@
     assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
     return load ? X86::LD_Fp80m : X86::ST_FpP80m;
   case 16: {
-    assert((X86::VR128RegClass.hasSubClassEq(RC) ||
-            X86::VR128XRegClass.hasSubClassEq(RC))&& "Unknown 16-byte regclass");
+    assert(X86::VR128XRegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass");
     // If stack is realigned we can use aligned stores.
-    if (X86::VR128RegClass.hasSubClassEq(RC)) {
-      if (isStackAligned)
-        return load ? (HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm)
-                    : (HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr);
-      else
-        return load ? (HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm)
-                    : (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
-    }
-    assert(STI.hasVLX() && "Using extended register requires VLX");
     if (isStackAligned)
-      return load ? X86::VMOVAPSZ128rm : X86::VMOVAPSZ128mr;
+      return load ?
+        (HasVLX ? X86::VMOVAPSZ128rm : HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm):
+        (HasVLX ? X86::VMOVAPSZ128mr : HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr);
     else
-      return load ? X86::VMOVUPSZ128rm : X86::VMOVUPSZ128mr;
+      return load ?
+        (HasVLX ? X86::VMOVUPSZ128rm : HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm):
+        (HasVLX ? X86::VMOVUPSZ128mr : HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
   }
   case 32:
-    assert((X86::VR256RegClass.hasSubClassEq(RC) ||
-            X86::VR256XRegClass.hasSubClassEq(RC)) && "Unknown 32-byte regclass");
+    assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
     // If stack is realigned we can use aligned stores.
-    if (X86::VR256RegClass.hasSubClassEq(RC)) {
-      if (isStackAligned)
-        return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr;
-      else
-        return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr;
-    }
-    assert(STI.hasVLX() && "Using extended register requires VLX");
     if (isStackAligned)
-      return load ? X86::VMOVAPSZ256rm : X86::VMOVAPSZ256mr;
+      return load ?
+        (HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm) :
+        (HasVLX ? X86::VMOVAPSZ256mr : X86::VMOVAPSYmr);
     else
-      return load ? X86::VMOVUPSZ256rm : X86::VMOVUPSZ256mr;
+      return load ?
+        (HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm) :
+        (HasVLX ? X86::VMOVUPSZ256mr : X86::VMOVUPSYmr);
   case 64:
     assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
     assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
diff --git a/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics.ll
index 150057e42f4..016946edfb2 100644
--- a/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -mattr=+avx512vl -mattr=+avx512vbmi --show-mc-encoding| FileCheck %s
 declare <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index b1a2121fab2..aeb93a217d1 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -137,16 +137,16 @@ define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
 ; AVX512VL-NEXT:    shrq $48, %rdi
 ; AVX512VL-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<kill>
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl %bx, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl %bp, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movl %r14d, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
 ; AVX512VL-NEXT:    vinsertps $32, {{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
@@ -258,16 +258,16 @@ define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
 ; AVX512VL-NEXT:    shrq $48, %rdi
 ; AVX512VL-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<kill>
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl %bx, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl %bp, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movl %r14d, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
 ; AVX512VL-NEXT:    vinsertps $32, {{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
@@ -458,34 +458,34 @@ define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind {
 ; AVX512VL-NEXT:    shrq $48, %rdi
 ; AVX512VL-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<kill>
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl %bx, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl %r14w, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movl %ebp, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movl %r13d, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl %r12w, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl %r15w, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movl {{[0-9]+}}(%rsp), %edi # 4-byte Reload
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
 ; AVX512VL-NEXT:    vinsertps $32, {{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; AVX512VL-NEXT:    vinsertps $48, {{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm0 = xmm0[0,1,2],mem[0]
-; AVX512VL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
 ; AVX512VL-NEXT:    vinsertps $16, {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[2,3]
 ; AVX512VL-NEXT:    vinsertps $32, {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -840,64 +840,64 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
 ; AVX512VL-NEXT:    shrq $48, %rdi
 ; AVX512VL-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<kill>
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl %r12w, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl %bp, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movl %r14d, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movl %ebx, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl %r13w, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl %r15w, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movl {{[0-9]+}}(%rsp), %edi # 4-byte Reload
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
 ; AVX512VL-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<kill>
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
 ; AVX512VL-NEXT:    movzwl %ax, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
 ; AVX512VL-NEXT:    movzwl %ax, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movl {{[0-9]+}}(%rsp), %edi # 4-byte Reload
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
 ; AVX512VL-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<kill>
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
 ; AVX512VL-NEXT:    movzwl %ax, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
 ; AVX512VL-NEXT:    movzwl %ax, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movl {{[0-9]+}}(%rsp), %edi # 4-byte Reload
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
 ; AVX512VL-NEXT:    vinsertps $32, {{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; AVX512VL-NEXT:    vinsertps $48, {{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm0 = xmm0[0,1,2],mem[0]
-; AVX512VL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
 ; AVX512VL-NEXT:    vinsertps $16, {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[2,3]
 ; AVX512VL-NEXT:    vinsertps $32, {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -905,14 +905,14 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
 ; AVX512VL-NEXT:    vinsertps $48, {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm1 = xmm1[0,1,2],mem[0]
 ; AVX512VL-NEXT:    vinsertf32x4 $1, %xmm0, %ymm1, %ymm0
-; AVX512VL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
 ; AVX512VL-NEXT:    vinsertps $16, {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[2,3]
 ; AVX512VL-NEXT:    vinsertps $32, {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm1 = xmm1[0,1],mem[0],xmm1[3]
 ; AVX512VL-NEXT:    vinsertps $48, {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm1 = xmm1[0,1,2],mem[0]
-; AVX512VL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
 ; AVX512VL-NEXT:    vinsertps $16, {{[0-9]+}}(%rsp), %xmm2, %xmm2 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[2,3]
 ; AVX512VL-NEXT:    vinsertps $32, {{[0-9]+}}(%rsp), %xmm2, %xmm2 # 16-byte Folded Reload
@@ -1035,16 +1035,16 @@ define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) nounwind {
 ; AVX512VL-NEXT:    movq %rdi, %rbx
 ; AVX512VL-NEXT:    movzwl 6(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl 4(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl (%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, (%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl 2(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm1 # 16-byte Reload
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
 ; AVX512VL-NEXT:    vinsertps $32, {{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
@@ -1156,16 +1156,16 @@ define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind {
 ; AVX512VL-NEXT:    shrq $48, %rdi
 ; AVX512VL-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<kill>
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl %bx, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl %bp, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movl %r14d, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
 ; AVX512VL-NEXT:    vinsertps $32, {{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
@@ -1296,34 +1296,34 @@ define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) nounwind {
 ; AVX512VL-NEXT:    movq %rdi, %rbx
 ; AVX512VL-NEXT:    movzwl 6(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl 4(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl (%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl 2(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl 14(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl 12(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl 8(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, (%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl 10(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm1 # 16-byte Reload
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
 ; AVX512VL-NEXT:    vinsertps $32, {{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; AVX512VL-NEXT:    vinsertps $48, {{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm0 = xmm0[0,1,2],mem[0]
-; AVX512VL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
 ; AVX512VL-NEXT:    vinsertps $16, {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[2,3]
 ; AVX512VL-NEXT:    vinsertps $32, {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -1547,58 +1547,58 @@ define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind {
 ; AVX512VL-NEXT:    movq %rdi, %rbx
 ; AVX512VL-NEXT:    movzwl 6(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl 4(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl (%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl 2(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl 14(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl 12(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl 8(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl 10(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl 22(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl 20(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl 16(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl 18(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl 30(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl 28(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl 24(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, (%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl 26(%rbx), %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm1 # 16-byte Reload
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
 ; AVX512VL-NEXT:    vinsertps $32, {{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; AVX512VL-NEXT:    vinsertps $48, {{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm0 = xmm0[0,1,2],mem[0]
-; AVX512VL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
 ; AVX512VL-NEXT:    vinsertps $16, {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[2,3]
 ; AVX512VL-NEXT:    vinsertps $32, {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -1606,14 +1606,14 @@ define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind {
 ; AVX512VL-NEXT:    vinsertps $48, {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm1 = xmm1[0,1,2],mem[0]
 ; AVX512VL-NEXT:    vinsertf32x4 $1, %xmm0, %ymm1, %ymm0
-; AVX512VL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
 ; AVX512VL-NEXT:    vinsertps $16, {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[2,3]
 ; AVX512VL-NEXT:    vinsertps $32, {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm1 = xmm1[0,1],mem[0],xmm1[3]
 ; AVX512VL-NEXT:    vinsertps $48, {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm1 = xmm1[0,1,2],mem[0]
-; AVX512VL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
 ; AVX512VL-NEXT:    vinsertps $16, {{[0-9]+}}(%rsp), %xmm2, %xmm2 # 16-byte Folded Reload
 ; AVX512VL-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[2,3]
 ; AVX512VL-NEXT:    vinsertps $32, {{[0-9]+}}(%rsp), %xmm2, %xmm2 # 16-byte Folded Reload
@@ -1735,7 +1735,7 @@ define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
 ; AVX512VL-NEXT:    shrl $16, %ebx
 ; AVX512VL-NEXT:    movzwl %ax, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movl %ebx, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
 ; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
@@ -1857,13 +1857,13 @@ define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
 ; AVX512VL-NEXT:    shrl $16, %ebp
 ; AVX512VL-NEXT:    movzwl %ax, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movl %ebp, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl %bx, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movl %r14d, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
 ; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
@@ -1943,7 +1943,7 @@ define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind {
 ; AVX512VL-NEXT:    shrl $16, %ebx
 ; AVX512VL-NEXT:    movzwl %ax, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movl %ebx, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
 ; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
@@ -2064,13 +2064,13 @@ define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind {
 ; AVX512VL-NEXT:    shrl $16, %ebp
 ; AVX512VL-NEXT:    movzwl %ax, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movl %ebp, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl %bx, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movl %r14d, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
 ; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
@@ -2284,25 +2284,25 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
 ; AVX512VL-NEXT:    shrl $16, %ebp
 ; AVX512VL-NEXT:    movzwl %ax, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movl %ebp, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl %r14w, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movl %ebx, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl %r13w, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movl %r15d, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl %r12w, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
 ; AVX512VL-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<kill>
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
@@ -2658,13 +2658,13 @@ define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind {
 ; AVX512VL-NEXT:    shrl $16, %ebp
 ; AVX512VL-NEXT:    movzwl %ax, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movl %ebp, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movzwl %bx, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
-; AVX512VL-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    movl %r14d, %edi
 ; AVX512VL-NEXT:    callq __gnu_h2f_ieee
 ; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
@@ -2988,7 +2988,7 @@ define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    pushq %r14
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    subq $24, %rsp
-; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, (%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %bx
@@ -2997,12 +2997,12 @@ define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %r14d
 ; AVX512VL-NEXT:    orl %ebx, %r14d
-; AVX512VL-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %bx
 ; AVX512VL-NEXT:    shll $16, %ebx
-; AVX512VL-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %eax
@@ -3100,7 +3100,7 @@ define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    pushq %r14
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    subq $24, %rsp
-; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, (%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %bx
@@ -3109,12 +3109,12 @@ define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %r14d
 ; AVX512VL-NEXT:    orl %ebx, %r14d
-; AVX512VL-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %bx
 ; AVX512VL-NEXT:    shll $16, %ebx
-; AVX512VL-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %eax
@@ -3216,7 +3216,7 @@ define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    pushq %r14
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    subq $24, %rsp
-; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, (%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %bx
@@ -3225,12 +3225,12 @@ define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %r14d
 ; AVX512VL-NEXT:    orl %ebx, %r14d
-; AVX512VL-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %bx
 ; AVX512VL-NEXT:    shll $16, %ebx
-; AVX512VL-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %eax
@@ -3400,7 +3400,7 @@ define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    pushq %r14
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    subq $32, %rsp
-; AVX512VL-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX512VL-NEXT:    vmovdqu64 %ymm0, (%rsp) # 32-byte Spill
 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %bx
@@ -3410,12 +3410,12 @@ define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %r15d
 ; AVX512VL-NEXT:    orl %ebx, %r15d
-; AVX512VL-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
+; AVX512VL-NEXT:    vmovdqu64 (%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %bx
 ; AVX512VL-NEXT:    shll $16, %ebx
-; AVX512VL-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
+; AVX512VL-NEXT:    vmovdqu64 (%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %r14d
@@ -3433,12 +3433,12 @@ define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %r15d
 ; AVX512VL-NEXT:    orl %ebx, %r15d
-; AVX512VL-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %bx
 ; AVX512VL-NEXT:    shll $16, %ebx
-; AVX512VL-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %eax
@@ -3677,12 +3677,12 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    # kill: %AX<def> %AX<kill> %EAX<def>
 ; AVX512VL-NEXT:    movl %eax, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX512VL-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    # kill: %AX<def> %AX<kill> %EAX<def>
 ; AVX512VL-NEXT:    movl %eax, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX512VL-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    # kill: %AX<def> %AX<kill> %EAX<def>
@@ -3720,22 +3720,22 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    # kill: %AX<def> %AX<kill> %EAX<def>
 ; AVX512VL-NEXT:    movl %eax, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX512VL-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %r13w
-; AVX512VL-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %bx
 ; AVX512VL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %bp
-; AVX512VL-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512VL-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %r14w
-; AVX512VL-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512VL-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %r15w
@@ -3743,7 +3743,7 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %r12w
-; AVX512VL-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512VL-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    # kill: %AX<def> %AX<kill> %EAX<def>
@@ -3884,15 +3884,15 @@ define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) nounwind {
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    subq $24, %rsp
 ; AVX512VL-NEXT:    movq %rdi, %rbx
-; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, (%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movl %eax, %r14d
-; AVX512VL-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movl %eax, %r15d
-; AVX512VL-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movl %eax, %ebp
@@ -4000,7 +4000,7 @@ define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounw
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    subq $16, %rsp
 ; AVX512VL-NEXT:    movq %rdi, %r14
-; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, (%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %bp
@@ -4009,12 +4009,12 @@ define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounw
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %ebx
 ; AVX512VL-NEXT:    orl %ebp, %ebx
-; AVX512VL-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %bp
 ; AVX512VL-NEXT:    shll $16, %ebp
-; AVX512VL-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %eax
@@ -4124,7 +4124,7 @@ define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwi
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    subq $16, %rsp
 ; AVX512VL-NEXT:    movq %rdi, %r14
-; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, (%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %bp
@@ -4133,12 +4133,12 @@ define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwi
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %ebx
 ; AVX512VL-NEXT:    orl %ebp, %ebx
-; AVX512VL-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, %bp
 ; AVX512VL-NEXT:    shll $16, %ebp
-; AVX512VL-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movzwl %ax, %eax
@@ -4282,15 +4282,15 @@ define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) nounwind {
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    subq $72, %rsp
 ; AVX512VL-NEXT:    movq %rdi, %rbx
-; AVX512VL-NEXT:    vmovdqu %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX512VL-NEXT:    vmovdqu64 %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512VL-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512VL-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512VL-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512VL-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movl %eax, %r12d
@@ -4300,11 +4300,11 @@ define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) nounwind {
 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movl %eax, %r13d
-; AVX512VL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movl %eax, %ebp
-; AVX512VL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movl %eax, %r14d
@@ -4568,11 +4568,11 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwin
 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512VL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512VL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
@@ -4582,11 +4582,11 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwin
 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512VL-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512VL-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512VL-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512VL-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
@@ -4596,11 +4596,11 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwin
 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512VL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movl %eax, %r14d
-; AVX512VL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX512VL-NEXT:    vmovdqa64 {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512VL-NEXT:    callq __gnu_f2h_ieee
 ; AVX512VL-NEXT:    movl %eax, %r15d
@@ -4729,7 +4729,7 @@ define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    subq $16, %rsp
-; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, (%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bx
@@ -4861,7 +4861,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
 ; AVX512VL-NEXT:    pushq %r14
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    subq $40, %rsp
-; AVX512VL-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX512VL-NEXT:    vmovdqu64 %ymm0, (%rsp) # 32-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bx
@@ -5011,7 +5011,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
 ; AVX512VL-NEXT:    pushq %r14
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    subq $40, %rsp
-; AVX512VL-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX512VL-NEXT:    vmovdqu64 %ymm0, (%rsp) # 32-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bx
@@ -5165,7 +5165,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
 ; AVX512VL-NEXT:    pushq %r14
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    subq $40, %rsp
-; AVX512VL-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX512VL-NEXT:    vmovdqu64 %ymm0, (%rsp) # 32-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bx
@@ -5554,7 +5554,7 @@ define void @store_cvt_2f64_to_2i16(<2 x double> %a0, <2 x i16>* %a1) nounwind {
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    subq $24, %rsp
 ; AVX512VL-NEXT:    movq %rdi, %rbx
-; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT:    vmovdqa64 %xmm0, (%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movl %eax, %ebp
@@ -5692,7 +5692,7 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind {
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    subq $88, %rsp
 ; AVX512VL-NEXT:    movq %rdi, %rbx
-; AVX512VL-NEXT:    vmovdqu %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX512VL-NEXT:    vmovdqu64 %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movl %eax, %r14d
@@ -5855,7 +5855,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    subq $32, %rsp
 ; AVX512VL-NEXT:    movq %rdi, %r14
-; AVX512VL-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX512VL-NEXT:    vmovdqu64 %ymm0, (%rsp) # 32-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bp
@@ -6026,7 +6026,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    subq $32, %rsp
 ; AVX512VL-NEXT:    movq %rdi, %r14
-; AVX512VL-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX512VL-NEXT:    vmovdqu64 %ymm0, (%rsp) # 32-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bp
```
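The large amount of test churn above is mechanical: once VLX implies EVEX, 16- and 32-byte spills and reloads print as `vmovdqa64`/`vmovdqu64` instead of `vmovdqa`/`vmovdqu`. The `64` suffix only reflects the EVEX form's qword masking granularity; with no mask applied the copy is bit-for-bit identical to the VEX move. For the 32-byte case the fallback chain has just two steps, since any 256-bit vector operation already implies AVX. A companion sketch under the same illustrative naming as above:

```cpp
// Companion sketch for 32-byte loads: 256-bit vectors already require AVX,
// so the only choice is EVEX (VLX) versus VEX. Names are illustrative
// stand-ins, not LLVM's actual opcode enums.
enum Opc256 { VMOVAPSYrm, VMOVUPSYrm, VMOVAPSZ256rm, VMOVUPSZ256rm };

static Opc256 getLoad256(bool HasVLX, bool isStackAligned) {
  if (isStackAligned)
    return HasVLX ? VMOVAPSZ256rm : VMOVAPSYrm;  // aligned move
  return HasVLX ? VMOVUPSZ256rm : VMOVUPSYrm;    // unaligned move
}
```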

