/* Copyright (C) 2009 - Jean-Michel Richer - 
   LERIA - UFR Sciences - 2 bd Lavoisier - 49045 Angers Cedex 1

nasm -f elf prod_mat_vec_nasm.asm
gcc -Wall -O2 -o prod_mat_vec.exe prod_mat_vec.c prod_mat_vec_nasm.o -msse2

attention: avec gcc 4.4.1
gcc -Wall -O3 -o prod_mat_vec.exe prod_mat_vec.c prod_mat_vec_nasm.o -msse3
donne de meilleurs résultats:

q6600 @ 2.4 Ghz
           1024 2048   3092  4096
------------------------------------------------
test1 (-O2) 25s   51s 1m17s 1m42s
test1 (-O3) 13    27s   41s   54s
test2       20s   42s 1m01s 1m23s
test3       10s   20s   30s   40s

q9300 @ 2.5 Ghz
           1024 2048   3092  4096
------------------------------------------------
test1 (-O2) 23s  53s  1m19s 1m43s
test1 (-O3) 13s  30s    38s   53s
test2       13s  32s    47s 1m06s
test3       9s   18s    29s   35s

*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <xmmintrin.h>

#define SIZE  1024
#define LOOPS 1000000

extern void prod_sse(float *m, float *v, float *w);
extern void prod_sse_transpose(float *m, float *v, float *w);

float matrix[] __attribute__((aligned(16))) = {
  1.0, 2.0, 3.0, 4.0,
  -1.0, 2.0, -3.0, 4.0,
  1.0, -2.0, 3.0, -4.0,
  -1.0, -2.0, -3.0, -4.0
};

float vector[]  __attribute__((aligned(16)))= {
  1.0, 2.0, 3.0, 4.0
};

float result[4]  __attribute__((aligned(16)));

float *matrices;

void init_matrices() {
  int i;

  matrices=(float *)_mm_malloc(SIZE*sizeof(float)*16,16);
  for (i=0;i<SIZE;++i) {
    memcpy((void *)&matrices[16*i], (void *)&matrix, 16*sizeof(float));
  }
}

void transpose(float *m) {
  int i,j;

  for (i=0;i<4;++i) {
    for (j=i+1;j<4;++j) {
      float tmp=m[i*4+j];
      m[i*4+j]=m[j*4+i];
      m[j*4+i]=tmp;
    }
  }
}

void display_matrix(float *m) {
  int i, j;

  for (i=0;i<4;++i) {
    for (j=0;j<4;++j) {
      printf("%5.2f ", m[i*4+j]);
    }
    printf("\n");
  }
}

void display_vector(float *m) {
  int i;

  for (i=0;i<4;++i) {
    printf("%5.2f ", m[i]);
  }
  printf("\n");
}



void prod(float *m, float *v, float *w) {
  int i,j;

  for (i=0;i<4;++i) {
    float s=0.0;
    for (j=0;j<4;++j) {
      s+=m[i*4+j]*v[j];
    }
    w[i]=s;
  }
}

void test1() {
  int i;

  init_matrices();
  for (i=0;i<LOOPS;++i) {
    int j;
    for (j=0;j<SIZE;++j) {
      prod(&matrices[16*j],vector,result);
    }
  }
}

void test2() {
  int i;

  init_matrices();
  for (i=0;i<LOOPS;++i) {
    int j;
    for (j=0;j<SIZE;++j) {
      prod_sse(&matrices[16*j],vector,result);
    }
  }  
}

void test3() {
  int i;

  transpose(&matrix[0]);
  init_matrices();
  for (i=0;i<LOOPS;++i) {
    int j;
    for (j=0;j<SIZE;++j) {
      prod_sse_transpose(&matrices[16*j],vector,result);
    }
  }  
}


void test10() {

  printf("prod\n");
  prod(matrix,vector,result);
  display_matrix(&matrix[0]);
  display_vector(&result[0]);

  printf("prod_sse\n");
  prod_sse(matrix,vector,result);
  display_matrix(&matrix[0]);
  display_vector(&result[0]);

  transpose(&matrix[0]);
  prod_sse_transpose(matrix,vector,result);
  display_matrix(&matrix[0]);
  display_vector(&result[0]);
}



int main(int argc, char *argv[]) {

  switch(atoi(argv[1])) {
  case 1: test1(); break;
  case 2: test2(); break;
  case 3: test3(); break;
  case 10: test10(); break;
  };

  return 0;
}
