SIMD是(Single Instruction, Multiple Data,单指令多数据),MMX实现了SIMD;SSE是(Streaming SIMD Extensions,流式SIMD扩展),它取代了MMX;后来AVX(Advanced Vector Extensions,高级向量扩展)对SSE进行了扩展。如下代码展示了SSE处理未对齐内存的情况:
; sse_unaligned.asm
extern printf
section .data
; Single-precision (32-bit) input vectors, four lanes each.
; No ALIGN directive is requested here; the code reads these with the
; unaligned packed moves (movups / movupd).
spvector1       dd      1.1
                dd      2.2
                dd      3.3
                dd      4.4
; NOTE(review): lanes 2 and 3 are 2.2 and 3.3 (not 3.3 and 4.4 as in
; spvector1) -- confirm this is intentional.
spvector2       dd      1.1
                dd      2.2
                dd      2.2
                dd      3.3

; Double-precision (64-bit) input vectors, two lanes each.
dpvector1       dq      1.1
                dq      2.2
dpvector2       dq      3.3
                dq      4.4

; printf format strings: newline (10) terminated, NUL (0) ended.
fmt1            db      "Single Precision Vector 1: %f, %f, %f, %f", 10, 0
fmt2            db      "Single Precision Vector 2: %f, %f, %f, %f", 10, 0
fmt3            db      "Sum of Single Precision Vector 1 and Vector 2: %f, %f, %f, %f", 10, 0
fmt4            db      "Double Precision Vector 1: %f, %f", 10, 0
fmt5            db      "Double Precision Vector 2: %f, %f", 10, 0
fmt6            db      "Sum of Double Precision Vector 1 and Vector 2: %f, %f", 10, 0

section .bss
; Result buffers, each sized for exactly one 128-bit XMM store.
spvector_res    resd    4               ; 4 x 32-bit floats  = 16 bytes
dpvector_res    resq    2               ; 2 x 64-bit doubles = 16 bytes
section .text
global main

;-----------------------------------------------------------------------
; int main(void)
; ABI:   System V AMD64 (arguments in rdi/rsi, FP arguments in xmm0..)
; Does:  prints two single-precision vectors and their packed sum, then
;        two double-precision vectors and their packed sum.
; Out:   eax = 0
; Notes: data is addressed RIP-relative and printf is reached through
;        the PLT (ELF-specific "wrt ..plt"), so the object also links
;        as a position-independent executable.
;-----------------------------------------------------------------------
main:
        push    rbp                     ; also brings rsp to a 16-byte boundary
        mov     rbp, rsp

        ; --- single precision: show both inputs ---
        lea     rsi, [rel spvector1]
        lea     rdi, [rel fmt1]
        call    printspfp

        lea     rsi, [rel spvector2]
        lea     rdi, [rel fmt2]
        call    printspfp

        ; --- single precision: four lanes added at once ---
        movups  xmm0, [rel spvector1]
        movups  xmm1, [rel spvector2]
        addps   xmm0, xmm1
        movups  [rel spvector_res], xmm0

        lea     rsi, [rel spvector_res]
        lea     rdi, [rel fmt3]
        call    printspfp

        ; --- double precision: show both inputs ---
        lea     rsi, [rel dpvector1]
        lea     rdi, [rel fmt4]
        call    printdpfp

        lea     rsi, [rel dpvector2]
        lea     rdi, [rel fmt5]
        call    printdpfp

        ; --- double precision: two lanes added at once ---
        movupd  xmm0, [rel dpvector1]
        movupd  xmm1, [rel dpvector2]
        addpd   xmm0, xmm1
        movupd  [rel dpvector_res], xmm0

        lea     rsi, [rel dpvector_res]
        lea     rdi, [rel fmt6]
        call    printdpfp

        xor     eax, eax                ; exit status 0 instead of printf's leftover rax
        leave
        ret

;-----------------------------------------------------------------------
; printspfp -- print four packed single-precision floats
; In:    rdi = printf format string (callers pass one with four %f)
;        rsi = address of four consecutive 32-bit floats
; Clobb: rax, xmm0-xmm3, plus anything printf clobbers
;-----------------------------------------------------------------------
printspfp:
        push    rbp                     ; re-aligns rsp to 16 bytes for the call below
        mov     rbp, rsp

        ; printf's %f consumes doubles, so widen every float lane first.
        movss       xmm0, [rsi]
        cvtss2sd    xmm0, xmm0
        movss       xmm1, [rsi + 4]
        cvtss2sd    xmm1, xmm1
        movss       xmm2, [rsi + 8]
        cvtss2sd    xmm2, xmm2
        movss       xmm3, [rsi + 12]
        cvtss2sd    xmm3, xmm3

        mov     eax, 4                  ; al = number of vector registers used (variadic call)
        call    printf wrt ..plt

        leave
        ret

;-----------------------------------------------------------------------
; printdpfp -- print two packed double-precision floats
; In:    rdi = printf format string (callers pass one with two %f)
;        rsi = address of two consecutive 64-bit doubles
; Clobb: rax, xmm0-xmm1, plus anything printf clobbers
;-----------------------------------------------------------------------
printdpfp:
        push    rbp                     ; re-aligns rsp to 16 bytes for the call below
        mov     rbp, rsp

        movsd   xmm0, [rsi]
        movsd   xmm1, [rsi + 8]

        mov     eax, 2                  ; al = number of vector registers used (variadic call)
        call    printf wrt ..plt

        leave
        ret
需要注意的几个指令如下:
 movups: 移动未对齐的打包单精度;(u:未对齐unaligned;p:打包的packed;s:单精度single;)
 addps: 打包单精度相加;
 movss: 移动标量单精度;(第一个s:标量scalar;第二个s:单精度single)
 cvtss2sd: 将标量单精度转换为标量双精度;(d:双精度double)