Am 28.12.2016 um 11:55 schrieb Rudy Velthuis (TeamB):

Michael Rabatscher wrote:

the compiler wouldn't allow [eax + 3*esi]... so... is there

a neat trick to do that...

kind regards

Mike

Did you try:

[eax + 2*esi + esi]

That is what I would do. I'm not sure if it works, though.

Normally, in an unrolled loop, you have a fixed size (ideally, 1, 2, 4

or 8), so you do

[eax + size*esi]

...

[eax + size*esi + size] // next element

...

[eax + size*esi + 2*size] // next element

...

[eax + size*esi + 3*size] // next element

...

add esi,4 // unroll number.

that doesn't work:

movsd xmm5, [eax + 2*esi + esi];

sorry...

So.. basically the memory layout is the following:

input matrix with row major memory layout and an input vector with

arbitrary element displacement

The result is a matrix vector operation where eax points to the first

Row, eax + esi to the next row and so forth...

it's actually a smaller optimization to the following loops:

for i := 0 to Height div 4 - 1 do

begin

res1 := 0;

res2 := 0;

res3 := 0;

res4 := 0;

pMt1 := PConstDoubleArr(mt1);

inc(PByte(Mt1), LineWidthMT);

pMt11 := PConstDoubleArr(mt1);

inc(pByte(MT1), LineWidthMT);

pMt12 := PConstDoubleArr(mt1);

inc(pByte(MT1), LineWidthMT);

pMt13 := PConstDoubleArr(mt1);

inc(pByte(MT1), LineWidthMT);

pV := v;

for j := 0 to Width - 1 do

begin

res1 := res1 + pV^*pMt1^[j];

res2 := res2 + pV^*pMt11^[j];

res3 := res3 + pV^*pMt12^[j];

res4 := res4 + pV^*pMt13^[j];

inc(PByte(pV), LineWidthV);

end;

dest^ := beta*dest^ + alpha*res1;

inc(PByte(dest), destLineWidth);

dest^ := beta*dest^ + alpha*res2;

inc(PByte(dest), destLineWidth);

dest^ := beta*dest^ + alpha*res3;

inc(PByte(dest), destLineWidth);

dest^ := beta*dest^ + alpha*res4;

inc(PByte(dest), destLineWidth);

end;

so actually the idea is to load only one vector element and

use it for at least 4 rows that is indeed way faster

than the original non unrolled loop.

It is actually way faster than doing a loop like this.

Idea: load two vector elements and multiply it with two elements in one

row only:

procedure ASMMatrixVectMultAlignedEvenW(dest : PDouble; destLineWidth :

TASMNativeInt; mt1, v : PDouble; LineWidthMT, LineWidthV :

TASMNativeInt; width, height : TASMNativeInt; alpha, beta : double);

var iter : TASMNativeInt;

begin

assert( (width and $01) = 0, 'Even width expected');

assert((Cardinal(mt1) and $0000000F) = 0, 'Error aligned

operations cannot be performed');

assert(((Cardinal(LineWidthMT) and $0000000F) = 0), 'Error cannot

perform aligned operations');

assert(Width > 1, 'Error at least a vector with two elements

expected');

iter := -width*sizeof(double);

asm

push eax;

push edx;

push ebx;

push esi;

push edi;

xorpd xmm7, xmm7;

mov edi, dest;

// for the final multiplication

movhpd xmm6, beta;

movlpd xmm6, alpha;

// prepare for "Reverse loop indexing"

mov eax, mt1;

sub eax, iter;

mov edx, LineWidthV;

// init for y := 0 to height - 1:

mov esi, height;

@@foryloop:

// init values:

xorpd xmm0, xmm0; // dest^ := 0;

mov ebx, v; // ebx = first vector element

mov ecx, iter;

@@forxloop:

movlpd xmm1, [ebx]

movhpd xmm1, [ebx + edx];

mulpd xmm1, [eax + ecx];

addpd xmm0, xmm1;

add ebx, edx;

add ebx, edx;

add ecx, 16;

jnz @@forxloop;

// result building

// write back result (final addition and compactation)

haddpd xmm0, xmm7;

// calculate dest = beta*dest + alpha*xmm0

movhpd xmm0, [edi];

mulpd xmm0, xmm6;

haddpd xmm0, xmm7;

movlpd [edi], xmm0;

// next results:

add edi, destLineWidth;

add eax, LineWidthMT;

dec esi;

jnz @@foryloop;

// epilog

pop edi;

pop esi;

pop ebx;

pop edx;

pop eax;

end;

end;

The movhpd and movlpd commands are waaaay slower than single

movs.

kind regards

Mike

Or you do add esi*size*4 at the end of the loop to eax and only use

eax, eax + size, eax + 2*size, eax + 3*size.

Your loop looks a little suspicious. Say, esi = 17, then you are

accessing eax + 17, eax + 34 and eax + 51. Is that really what you want?

Is this a graphics thingie? If so, then work line by line, not column

by column.