並列化よりも、まず最適化オプション
円を塗りつぶす処理を並列化してみたけど、ぜんぜん速度向上につながらない。よく考えてみると、並列化したループは、せいぜい50〜60回のものでしかないから、スレッド生成の準備とかオーバーヘッドで相殺されてるのかも。
それより、gccの最適化オプションを変えたら劇的に高速化して驚いた。今まで何やってたんだ……。
「-O3」は効果の割に、あまりテストされてないケースが多いので、一般に使わないとか。
「-O2」と「-O2 -march=native」でも、これほど違うってことは、色々と自分でビルドしたほうが快適になるってことかしら? と思ってあれこれ調べたら、どうもバイナリサイズが大きなアプリケーションではディスクI/Oのオーバーヘッドが大きいので、サイズ最適化の-Osオプションのほうが速くなるものらしい。Mozillaのページには、経験的にモジュール単位で最適化オプションを選んでいると書いてあった。
もう少しgccのオプション、特にSIMD系の命令を活用したループのベクトル化あたりが気になって、
-ftree-vectorize -ftree-vectorizer-verbose=5
とかやってみたけど、今の例では
paint.c:93: note: not vectorized: number of iterations cannot be computed. paint.c:61: note: not vectorized: unhandled data-ref paint.c:44: note: not vectorized: unhandled data-ref : :
とかで何ひとつ最適化されなかった。
#include <stdio.h> #include <stdlib.h> #include <string.h> #include <time.h> #include <math.h> #include <pthread.h> #include <unistd.h> # define NUMBER_OF_CIRCLES 200 # define GEN1 10000 # define GEN2 10000 # define GEN3 10000 # define BGCOLOR 0x00ffffff typedef struct { int height; int width; int **raw; } buf; typedef struct { int x; int y; int rad; int col; int alpha; } circle; typedef struct { buf *buf1; buf *buf2; double res; } diff_args; typedef struct { buf *buf; circle *cir; int begin; } cir_args; int clear_buf (buf *b) { int i, j; for (i = 0; i < b->height; i++) { for (j = 0; j < b->width; j++) { b->raw[i][j] = BGCOLOR; } } return 0; } buf *init_buf (int width, int height) { buf *b; int i; b = malloc(sizeof(buf)); b->height = height; b->width =width; b->raw = malloc(sizeof(int*) * height); b->raw[0] = malloc(sizeof(int) * width * height); for (i = 1; i < height; i ++) { b->raw[i] = b->raw[0] + (width * i); } clear_buf(b); return b; } buf *load_buf (char *filename) { FILE *fp; char s[255], size[255]; int i, j; int width, height; buf *b; if ((fp = fopen(filename, "r")) == NULL) { printf("Can't open the file\n"); exit(1); } // ppm header fgets(s, 255, fp); // P6 fgets(size, 255, fp); // 320 240 fgets(s, 255, fp); // 255 width = atoi(size); height = atoi(strstr(size, " ")); b = init_buf(width, height); for (i = 0; i < b->height; i++) { for (j = 0; j < b->width; j++) { fread(&b->raw[i][j], sizeof(char), 3, fp); } } fclose(fp); return b; } int save_buf (buf *b, char *filename) { FILE *fp; int i, j; if ((fp = fopen(filename, "w")) == NULL) { printf("Can't open the file\n"); exit(1); } fprintf(fp, "P6\n%d %d\n255\n", b->width, b->height); for (i = 0; i < b->height; i++) { for (j = 0; j < b->width; j++) { fwrite(&b->raw[i][j], sizeof(char), 3, fp); } } fclose(fp); return 0; } int p_fill_circle (void *p) { cir_args *args; buf *buf; circle *cir; int i, j; int x, y, rad; int upper, lower, left, right; float back, fore; unsigned char r, g, b; unsigned char r2, g2, b2; args = (cir_args *)p; buf = args->buf; cir = args->cir; if ((cir->alpha < 0) || (cir->alpha >100)) { return 1; } r2 = cir->col & 0x000000ff; g2 = (cir->col & 0x0000ff00) >>8; b2 = (cir->col & 0x00ff0000) >> 16; fore = (float)cir->alpha / 100; back = (100 - (float)cir->alpha) / 100; x = cir->x; y = cir->y; rad = cir->rad; upper = (y - rad) > 0 ? (y - rad) : 0; lower = (y + rad) > buf->height ? buf->height : (y + rad); left = (x - rad) > 0 ? (x - rad) : 0; right = (x + rad) > buf->width ? buf->width : (x + rad); for (i = upper + args->begin; i < lower; i+=2) { for (j = left; j < right; j++) { if ((rad*rad - (y-i)*(y-i) - (x-j)*(x-j)) > 0) { r = buf->raw[i][j] & 0x000000ff; g = (buf->raw[i][j] & 0x0000ff00) >> 8; b = (buf->raw[i][j] & 0x00ff0000) >> 16; r = (r2 * fore) + (r * back); g = (g2 * fore) + (g * back); b = (b2 * fore) + (b * back); buf->raw[i][j] = (b << 16) + (g << 8) + r; } } } pthread_exit(NULL); } int fill_circle (buf *buf, circle *cir) { pthread_t thread1, thread2; cir_args *args; args = malloc(sizeof(cir_args)); args->buf = buf; args->cir = cir; args->begin = 0; pthread_create(&thread1, NULL, p_fill_circle, (void *)args); args->begin = 1; pthread_create(&thread2, NULL, p_fill_circle, (void *)args); pthread_join(thread1, NULL); pthread_join(thread2, NULL); free(args); return 0; } int full_mutate_circle (circle *cir, int width, int height) { cir->col = rand() & 0x00ffffff; cir->alpha = rand() % 100; cir->rad = rand() % 30 + 5; cir->x = rand() % width; cir->y = rand() % height; return 0; } int item_mutate_circle (circle *cir, int width, int height) { int t; t = rand() % 5; switch (t) { case 0: cir->col = rand() & 0x00ffffff; break; case 1: cir->alpha = rand() % 100; break; case 2: cir->x = rand() % width; break; case 3: cir->y = rand() % height; break; case 4: cir->rad = rand() % 30 + 5; break; default: printf("something's wrong\n"); } return 0; } int delta_mutate_circle (circle *cir, int width, int height) { int t, d, tmp; t = rand() % 5; d = (rand() % 5) - 2; switch (t) { case 0: tmp = cir->col + d; if ((tmp > 0) & (tmp < 0x00ffffff)) cir->col = tmp; break; case 1: tmp = cir->alpha + d; if ((tmp > 0) & (tmp < 100)) cir->alpha = tmp; break; case 2: tmp = cir->x + d; if ((tmp > 0) & (tmp < width)) cir->x = tmp; break; case 3: tmp = cir->y + d; if ((tmp > 0) & (tmp < height)) cir->y = tmp; break; case 4: tmp = cir->rad + d; if ((tmp >= 5) & (tmp < 35)) cir->rad = tmp; break; default: printf("something's wrong\n"); } return 0; } double p_diff_buffers (void *p) { diff_args *args; buf *buf1; buf *buf2; double res = 0; int i, j; unsigned char r1, g1, b1, r2, g2, b2; args = (diff_args *)p; buf1 = args->buf1; buf2 = args->buf2; if ((buf1->width != buf2->width) || (buf1->height != buf2->height)) { printf("buffer sizes don't match\n"); exit(1); } for (i = 0; i < buf1->height; i++) { for (j = 0; j < buf1->width; j++) { r1 = buf1->raw[i][j] & 0x000000ff; g1 = (buf1->raw[i][j] & 0x0000ff00) >> 8; b1 = (buf1->raw[i][j] & 0x00ff0000) >> 16; r2 = buf2->raw[i][j] & 0x000000ff; g2 = (buf2->raw[i][j] & 0x0000ff00) >> 8; b2 = (buf2->raw[i][j] & 0x00ff0000) >> 16; res += sqrt((r1 - r2)*(r1 - r2) + (g1 - g2)*(g1 - g2) + (b1 - b2)*(b1 - b2)); } } args->res = res; pthread_exit(NULL); } int main (int argc, char *argv[]) { buf *b1, *b2, *target; int i, j, m, e=0; char *filename = argv[1]; char save_to[255]; circle *cir1[NUMBER_OF_CIRCLES], *cir2[NUMBER_OF_CIRCLES]; int (*mutate)(circle *cir, int w, int h); pthread_t thread1, thread2; diff_args *args1, *args2; cir_args *args3; args1 = malloc(sizeof(diff_args)); args2 = malloc(sizeof(diff_args)); args3 = malloc(sizeof(cir_args)); srand(time(NULL)); target = load_buf(filename); b1 = init_buf(target->width, target->height); b2 = init_buf(target->width, target->height); for (i = 0; i < NUMBER_OF_CIRCLES; i++) { cir1[i] = malloc(sizeof(circle)); cir2[i] = malloc(sizeof(circle)); cir1[i]->x = rand() % b1->width; cir1[i]->y = rand() % b1->height; cir1[i]->rad = rand() % 40 + 5; cir1[i]->col = rand() & 0x00ffffff; cir1[i]->alpha = 80; memcpy(cir2[i], cir1[i], sizeof(circle)); } // initial pic for (i = 0; i < NUMBER_OF_CIRCLES; i++) { fill_circle(b1, cir1[i]); fill_circle(b2, cir2[i]); } save_buf(b1, "original.ppm"); save_buf(target, "target.ppm"); // evolve pic for (j = 0; j <= GEN1+GEN2+GEN3; j++) { m = rand() % NUMBER_OF_CIRCLES; switch(j) { case 0: mutate = full_mutate_circle; break; case GEN1: mutate = item_mutate_circle; break; case GEN1+GEN2: mutate = delta_mutate_circle; break; } mutate(cir1[m], b1->width, b1->height); clear_buf(b1); for (i = 0; i < NUMBER_OF_CIRCLES; i++) { fill_circle(b1, cir1[i]); } args1->buf1 = b1; args1->buf2 = target; args2->buf1 = b2; args2->buf2 = target; pthread_create(&thread1, NULL, p_diff_buffers, (void *)args1); pthread_create(&thread2, NULL, p_diff_buffers, (void *)args2); pthread_join(thread1, NULL); pthread_join(thread2, NULL); // printf("%f,%f\n", args1->res, args2->res); if (args1->res < args2->res) { memcpy(cir2[m], cir1[m], sizeof(circle)); clear_buf(b2); for (i = 0; i < NUMBER_OF_CIRCLES; i++) { fill_circle(b2, cir2[i]); } e++; } else{ memcpy(cir1[m], cir2[m], sizeof(circle)); } if ((j % 10) == 0) { printf ("%d, %d\n", j, e); } if ((j % 1000) == 0) { sprintf(save_to, "%d.ppm", j); printf("saving the current image %d\n", j); save_buf(b1, save_to); } } // result pic for (i = 0; i < NUMBER_OF_CIRCLES; i++) { fill_circle(b1, cir1[i]); } save_buf(b1, "result.ppm"); return 0; }