padd // wrap-around and saturating variants
psub
pcmpeq
pcmpgt
pmul
pmadd // multiply, then add adjacent pairs of products
psra
psrl
punpck // unpack: interleave (merge) elements of two operands
packss // pack with signed saturation
pand
pandn
por
pxor
mov
A3 A2 A1 A0
x
B3 B2 B1 B0
A3*B3 A2*B2 A1*B1 A0*B0
A3*B3+A2*B2 A1*B1+A0*B0
pcmpgt
51 03 05 23
>
73 02 05 06
0 1 0 1
punpck
. . . . B3 B2 B1 B0
. . . . A3 A2 A1 A0
B3 A3 B2 A2 B1 A1 B0 A0
vec[.]
mat[.][.]
res[.]
for i=0; i<COL; i++
a = 0
for j=0; j<ROW; j++
a = a + vec[j] * mat[j][i]
res[i] = a
mul_vec_mat 4 x 2 in MMX
|
|
punpck mm6, mm0 | EFGH ABCD
CGDH |
punpck mm1, mm7 | EFGH ABCD
AEBF |
pmadd mm1, mm7 | AEBF V0V1V0V1 A*V0+E*V1 B*V0+F*V1 |
pmadd mm6, mm7 | CGDH V0V1V0V1 C*V0+G*V1 D*V0+H*V1 |
padd mm2, mm1 | mm2 (a) = A*V0+E*V1 B*V0+F*V1 |
padd mm3 ,mm6 | mm3 (a) = C*V0+G*V1 D*V0+H*V1 |
|
|
|
vector dot product
input vector: 16 elements, signed 16-bit accum 32 bits |
32 loads
16 muls
15 adds
loop control: 12 inst. (4 MACs per iteration; 3 inst. per iteration: inc, cmp, branch)
1 inst. store result
--------------
total 76 inst. (opt. 200 cycles) |
8 loads
4 pmadds
3 padds
loop unrolled four times (no branch)
---------
total 19 inst. (opt. 12 cycles) |
matrix mul 16 x 16 | 4236 cycles | 367 cycles |
for i=0; i<size; i++
if x[i] == Blue then z[i] = y[i]
else z[i] = x[i]
x is the image with a blue background
y is the background image
|
|
mov mm3, x | |
mov mm4, y | |
pcmpeq mm1, mm3 | mm1 B B B B B B B B
mm3 G R B B G R B B mm1 0 0 1 1 0 0 1 1 |
pand mm4, mm1 | mm4 Y Y Y Y Y Y Y Y
mm1 0 0 1 1 0 0 1 1 mm4 0 0 Y Y 0 0 Y Y |
pandn mm1, mm3 | mm1 0 0 1 1 0 0 1 1
mm3 X X X X X X X X mm1 X X 0 0 X X 0 0 |
por mm4, mm1 | mm4 X X Y Y X X Y Y |