並列化よりも、まず最適化オプション

円を塗りつぶす処理を並列化してみたけど、ぜんぜん速度向上につながらない。よく考えてみると、並列化したループは、せいぜい50〜60回のものでしかないから、スレッド生成の準備とかオーバーヘッドで相殺されてるのかも。

それより、gccの最適化オプションを変えたら劇的に高速化して驚いた。今まで何やってたんだ……。

「-O3」は効果の割に、あまりテストされてないケースが多いので、一般に使わないとか。

「-O2」と「-O2 -march=native」でも、これほど違うってことは、色々と自分でビルドしたほうが快適になるってことかしら? と思ってあれこれ調べたら、どうもバイナリサイズが大きなアプリケーションではディスクI/Oのオーバーヘッドが大きいので、サイズ最適化の-Osオプションのほうが速くなるものらしい。Mozillaのページには、経験的にモジュール単位で最適化オプションを選んでいると書いてあった。

もう少しgccのオプション、特にSIMD系の命令を活用したループのベクトル化あたりが気になって、

-ftree-vectorize -ftree-vectorizer-verbose=5 

とかやってみたけど、今の例では

paint.c:93: note: not vectorized: number of iterations cannot be computed.
paint.c:61: note: not vectorized: unhandled data-ref 
paint.c:44: note: not vectorized: unhandled data-ref 
:
:

とかで何ひとつ最適化されなかった。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <pthread.h>
#include <unistd.h>

# define NUMBER_OF_CIRCLES 200
# define GEN1 10000
# define GEN2 10000
# define GEN3 10000
# define BGCOLOR 0x00ffffff

typedef struct {
  int height;
  int width;
  int **raw;
} buf;

typedef struct {
  int x;
  int y;
  int rad;
  int col;
  int alpha;
} circle;

typedef struct {
  buf *buf1;
  buf *buf2;
  double res;
} diff_args;

typedef struct {
  buf *buf;
  circle *cir;
  int begin;
} cir_args;

int clear_buf (buf *b) {
  int i, j;
  for (i = 0; i < b->height; i++) {
    for (j = 0; j < b->width; j++) {
      b->raw[i][j] = BGCOLOR;
    }
  }
  return 0;
}

buf *init_buf (int width, int height) {
  buf *b;
  int i;

  b = malloc(sizeof(buf));
  b->height = height;
  b->width =width;

  b->raw = malloc(sizeof(int*) * height); 
  b->raw[0] = malloc(sizeof(int) * width * height); 
  for (i = 1; i < height; i ++) {
    b->raw[i] = b->raw[0] + (width * i);
  }

  clear_buf(b);
  return b;
}

buf *load_buf (char *filename) {
  FILE *fp;
  char s[255], size[255];
  int i, j;
  int width, height;
  buf *b;

  if ((fp = fopen(filename, "r")) == NULL) {
      printf("Can't open the file\n");
      exit(1);
  }

  // ppm header

  fgets(s,      255, fp); // P6
  fgets(size,   255, fp); // 320 240
  fgets(s,      255, fp); // 255

  width = atoi(size);
  height = atoi(strstr(size, " "));

  b = init_buf(width, height);

  for (i = 0; i < b->height; i++) {
    for (j = 0; j < b->width; j++) {
      fread(&b->raw[i][j], sizeof(char), 3, fp);
    }
  }
  fclose(fp);
  return b;
}

int save_buf (buf *b, char *filename) {
  FILE *fp;
  int i, j;

  if ((fp = fopen(filename, "w")) == NULL) {
      printf("Can't open the file\n");
      exit(1);
  }

  fprintf(fp, "P6\n%d %d\n255\n", b->width, b->height);

  for (i = 0; i < b->height; i++) {
    for (j = 0; j < b->width; j++) {
      fwrite(&b->raw[i][j], sizeof(char), 3, fp);
    }
  }
  fclose(fp);
  return 0;
}

int p_fill_circle (void *p) {
  cir_args *args;
  buf *buf;
  circle *cir;

  int i, j;
  int x, y, rad;
  int upper, lower, left, right;
  float back, fore;
  unsigned char r, g, b;
  unsigned char r2, g2, b2;

  args = (cir_args *)p;
  buf = args->buf;
  cir = args->cir;

  if ((cir->alpha < 0) || (cir->alpha >100)) {
      return 1;
  }

  r2 = cir->col & 0x000000ff;
  g2 = (cir->col & 0x0000ff00) >>8;
  b2 = (cir->col & 0x00ff0000) >> 16;

  fore = (float)cir->alpha / 100;
  back = (100 - (float)cir->alpha) / 100;

  x = cir->x; y = cir->y; rad = cir->rad;

  upper = (y - rad) > 0 ? (y - rad) : 0;
  lower = (y + rad) > buf->height ? buf->height : (y + rad);
  left  = (x - rad) > 0 ? (x - rad) : 0;
  right = (x + rad) > buf->width ? buf->width : (x + rad);

  for (i = upper + args->begin; i < lower; i+=2) {
    for (j = left; j < right; j++) {
      if ((rad*rad - (y-i)*(y-i) - (x-j)*(x-j)) > 0) { 
	r = buf->raw[i][j] & 0x000000ff;
	g = (buf->raw[i][j] & 0x0000ff00) >> 8;
	b = (buf->raw[i][j] & 0x00ff0000) >> 16;
	r = (r2 * fore) + (r * back);
	g = (g2 * fore) + (g * back);
	b = (b2 * fore) + (b * back);
	buf->raw[i][j] = (b << 16) + (g << 8) + r;
      }
    }
  }
  pthread_exit(NULL);
}

int fill_circle (buf *buf, circle *cir) {
  pthread_t thread1, thread2;
  cir_args *args;
  args = malloc(sizeof(cir_args));

  args->buf = buf;
  args->cir = cir;
  args->begin = 0;
  pthread_create(&thread1, NULL, p_fill_circle, (void *)args);
  args->begin = 1;
  pthread_create(&thread2, NULL, p_fill_circle, (void *)args);
  pthread_join(thread1, NULL);
  pthread_join(thread2, NULL);

  free(args);
  return 0;
}

int full_mutate_circle (circle *cir, int width, int height) {
  cir->col = rand() & 0x00ffffff;
  cir->alpha = rand() % 100;
  cir->rad = rand() % 30 + 5;
  cir->x = rand() % width;
  cir->y = rand() % height;
  return 0;
}

int item_mutate_circle (circle *cir, int width, int height) {
  int t;
  t = rand() % 5;

  switch (t) {
  case 0: cir->col = rand() & 0x00ffffff;
    break;
  case 1: cir->alpha = rand() % 100;
    break;
  case 2: cir->x = rand() % width;
    break;
  case 3: cir->y = rand() % height;
    break;
  case 4: cir->rad = rand() % 30 + 5;
    break;
  default: printf("something's wrong\n");
  }
  return 0;
}

int delta_mutate_circle (circle *cir, int width, int height) {
  int t, d, tmp;
  t = rand() % 5;
  d = (rand() % 5) - 2;

  switch (t) {
  case 0: 
    tmp = cir->col + d;
    if ((tmp > 0) & (tmp < 0x00ffffff))
      cir->col = tmp;
    break;
  case 1:
    tmp = cir->alpha + d;
    if ((tmp > 0) & (tmp < 100))
      cir->alpha = tmp;
    break;
  case 2: 
    tmp = cir->x + d;
    if ((tmp > 0) & (tmp < width))
      cir->x = tmp;
    break;
  case 3:
    tmp = cir->y + d;
    if ((tmp > 0) & (tmp < height))
      cir->y = tmp;
    break;
  case 4:
    tmp = cir->rad + d;
    if ((tmp >= 5) & (tmp < 35))
      cir->rad = tmp;
    break;
  default: printf("something's wrong\n");
  }
  return 0;
}

double p_diff_buffers (void *p) {
  diff_args *args;
  buf *buf1;
  buf *buf2;
  double res = 0;
  int i, j;
  unsigned char r1, g1, b1, r2, g2, b2;

  args = (diff_args *)p;
  buf1 = args->buf1;
  buf2 = args->buf2;

  if ((buf1->width != buf2->width) ||
      (buf1->height != buf2->height)) {
    printf("buffer sizes don't match\n");
    exit(1);
  }

  for (i = 0; i < buf1->height; i++) {
    for (j = 0; j < buf1->width; j++) {
	r1 = buf1->raw[i][j] & 0x000000ff;
	g1 = (buf1->raw[i][j] & 0x0000ff00) >> 8;
	b1 = (buf1->raw[i][j] & 0x00ff0000) >> 16;

	r2 = buf2->raw[i][j] & 0x000000ff;
	g2 = (buf2->raw[i][j] & 0x0000ff00) >> 8;
	b2 = (buf2->raw[i][j] & 0x00ff0000) >> 16;

	res += sqrt((r1 - r2)*(r1 - r2) + (g1 - g2)*(g1 - g2) + (b1 - b2)*(b1 - b2));
    }
  }
  args->res = res;
  pthread_exit(NULL);
}

int main (int argc,  char *argv[]) {
  buf *b1, *b2, *target;
  int i, j, m, e=0;

  char *filename = argv[1];
  char save_to[255];
  circle *cir1[NUMBER_OF_CIRCLES], *cir2[NUMBER_OF_CIRCLES];
  int (*mutate)(circle *cir, int w, int h);

  pthread_t thread1, thread2;
  diff_args *args1, *args2;
  cir_args *args3;

  args1 = malloc(sizeof(diff_args));
  args2 = malloc(sizeof(diff_args));
  args3 = malloc(sizeof(cir_args));
  
  srand(time(NULL));

  target = load_buf(filename);

  b1 = init_buf(target->width, target->height);
  b2 = init_buf(target->width, target->height);

  for (i = 0; i < NUMBER_OF_CIRCLES; i++) {
    cir1[i] = malloc(sizeof(circle));
    cir2[i] = malloc(sizeof(circle));
    cir1[i]->x = rand() % b1->width;
    cir1[i]->y = rand() % b1->height;
    cir1[i]->rad = rand() % 40 + 5;
    cir1[i]->col = rand() & 0x00ffffff;
    cir1[i]->alpha = 80;
    memcpy(cir2[i], cir1[i], sizeof(circle));
  }

  // initial pic

  for (i = 0; i < NUMBER_OF_CIRCLES; i++) {
    fill_circle(b1, cir1[i]);
    fill_circle(b2, cir2[i]);
  }
  save_buf(b1, "original.ppm");
  save_buf(target, "target.ppm");

  // evolve pic

  for (j = 0; j <= GEN1+GEN2+GEN3; j++) {
    m = rand() % NUMBER_OF_CIRCLES;

    switch(j) {
    case 0:
      mutate = full_mutate_circle;
      break;
    case GEN1:
      mutate = item_mutate_circle;
      break;
    case GEN1+GEN2:
      mutate = delta_mutate_circle;
	break;
    }
    mutate(cir1[m], b1->width, b1->height);

    clear_buf(b1);
    for (i = 0; i < NUMBER_OF_CIRCLES; i++) {
      fill_circle(b1, cir1[i]);
    }
    
    args1->buf1 = b1; args1->buf2 = target;
    args2->buf1 = b2; args2->buf2 = target;
    pthread_create(&thread1, NULL, p_diff_buffers, (void *)args1);
    pthread_create(&thread2, NULL, p_diff_buffers, (void *)args2);
    pthread_join(thread1, NULL);
    pthread_join(thread2, NULL);
    //    printf("%f,%f\n", args1->res, args2->res);

    if (args1->res < args2->res) {
      memcpy(cir2[m], cir1[m], sizeof(circle));
      clear_buf(b2);
      for (i = 0; i < NUMBER_OF_CIRCLES; i++) {
	fill_circle(b2, cir2[i]);
      }
      e++;
    } else{
      memcpy(cir1[m], cir2[m], sizeof(circle));
    }
    if ((j % 10) == 0) {
      printf ("%d,   %d\n", j, e);
    }
    if ((j % 1000) == 0) {
      sprintf(save_to, "%d.ppm", j);
      printf("saving the current image %d\n", j);
      save_buf(b1, save_to);
    }
  }

  // result pic

  for (i = 0; i < NUMBER_OF_CIRCLES; i++) {
    fill_circle(b1, cir1[i]);
  }
  save_buf(b1, "result.ppm");

  return 0;
}