Calculate Newtonian gravity using fast Fourier transforms

About 16% slower for one cell changing, same speed for 6 cells changing,
and several hundred times faster for whole screen changing.
This commit is contained in:
jacksonmj 2011-07-15 05:16:11 +08:00 committed by Simon Robertshaw
parent d20c704d88
commit 788b0cd04c
5 changed files with 175 additions and 4 deletions

View File

@ -7,9 +7,9 @@ PY_INCPATH := $(shell $(PY_BIN) -c "import os.path,sys;print os.path.join(sys.ex
PY_LDFLAGS := $(shell $(PY_BIN) -c "import distutils.sysconfig;print distutils.sysconfig.get_config_var('LINKFORSHARED')")
PYCOMMAND := $(PY_BIN) getheader.py
CFLAGS := -w -std=c99 -D_POSIX_C_SOURCE=200112L -DLUACONSOLE -Iincludes/
CFLAGS := -w -std=c99 -D_POSIX_C_SOURCE=200112L -DLUACONSOLE -DGRAVFFT -Iincludes/
OFLAGS := -O3 -ffast-math -ftree-vectorize -funsafe-math-optimizations
LFLAGS := -lpthread -lSDL -lm -lbz2 -lX11 -llua5.1 #-lpython$(PY_VERSION) -L$(PY_LIBPATH) -I$(PY_INCPATH) $(PY_LDFLAGS)
LFLAGS := -lpthread -lSDL -lfftw3f -lm -lbz2 -lX11 -llua5.1 #-lpython$(PY_VERSION) -L$(PY_LIBPATH) -I$(PY_INCPATH) $(PY_LDFLAGS)
LFLAGS_X := -lm -lbz2 -lSDLmain -I/Library/Frameworks/Python.framework/Versions/$(PY_VERSION)/include/python$(PY_VERSION)
MFLAGS_SSE3 := -march=native -DX86 -DX86_SSE3 -msse3
MFLAGS_SSE2 := -march=native -DX86 -DX86_SSE2 -msse2

View File

@ -41,6 +41,11 @@ void update_airh(void);
void update_grav(void);
#ifdef GRAVFFT
void grav_fft_init();
void grav_fft_cleanup();
#endif
void update_air(void);
#endif

View File

@ -92,6 +92,7 @@ extern unsigned char ZSIZE;
#define BRUSH_NUM 3
//#define GRAVFFT
//#define LUACONSOLE
//#define PYCONSOLE
//#define PYEXT

161
src/air.c
View File

@ -2,6 +2,11 @@
#include <air.h>
#include <powder.h>
#include <defines.h>
#ifdef GRAVFFT
#include <fftw3.h>
#endif
float kernel[9];
float gravmap[YRES/CELL][XRES/CELL]; //Maps to be used by the main thread
@ -154,6 +159,159 @@ void bilinear_interpolation(float *src, float *dst, int sw, int sh, int rw, int
}
}
#ifdef GRAVFFT
int grav_fft_status = 0;
float *th_ptgravx, *th_ptgravy, *th_gravmapbig, *th_gravxbig, *th_gravybig;
fftwf_complex *th_ptgravxt, *th_ptgravyt, *th_gravmapbigt, *th_gravxbigt, *th_gravybigt;
fftwf_plan plan_gravmap, plan_gravx_inverse, plan_gravy_inverse;
void grav_fft_init()
{
int xblock2 = XRES/CELL*2;
int yblock2 = YRES/CELL*2;
int x, y, fft_tsize = (xblock2/2+1)*yblock2;
float distance, scaleFactor;
fftwf_plan plan_ptgravx, plan_ptgravy;
if (grav_fft_status) return;
//use fftw malloc function to ensure arrays are aligned, to get better performance
th_ptgravx = fftwf_malloc(xblock2*yblock2*sizeof(float));
th_ptgravy = fftwf_malloc(xblock2*yblock2*sizeof(float));
th_ptgravxt = fftwf_malloc(fft_tsize*sizeof(fftwf_complex));
th_ptgravyt = fftwf_malloc(fft_tsize*sizeof(fftwf_complex));
th_gravmapbig = fftwf_malloc(xblock2*yblock2*sizeof(float));
th_gravmapbigt = fftwf_malloc(fft_tsize*sizeof(fftwf_complex));
th_gravxbig = fftwf_malloc(xblock2*yblock2*sizeof(float));
th_gravybig = fftwf_malloc(xblock2*yblock2*sizeof(float));
th_gravxbigt = fftwf_malloc(fft_tsize*sizeof(fftwf_complex));
th_gravybigt = fftwf_malloc(fft_tsize*sizeof(fftwf_complex));
//select best algorithm, could use FFTW_PATIENT or FFTW_EXHAUSTIVE but that increases the time taken to plan, and I don't see much increase in execution speed
plan_ptgravx = fftwf_plan_dft_r2c_2d(yblock2, xblock2, th_ptgravx, th_ptgravxt, FFTW_MEASURE);
plan_ptgravy = fftwf_plan_dft_r2c_2d(yblock2, xblock2, th_ptgravy, th_ptgravyt, FFTW_MEASURE);
plan_gravmap = fftwf_plan_dft_r2c_2d(yblock2, xblock2, th_gravmapbig, th_gravmapbigt, FFTW_MEASURE);
plan_gravx_inverse = fftwf_plan_dft_c2r_2d(yblock2, xblock2, th_gravxbigt, th_gravxbig, FFTW_MEASURE);
plan_gravy_inverse = fftwf_plan_dft_c2r_2d(yblock2, xblock2, th_gravybigt, th_gravybig, FFTW_MEASURE);
//(XRES/CELL)*(YRES/CELL)*4 is size of data array, scaling needed because FFTW calculates an unnormalized DFT
scaleFactor = -M_GRAV/((XRES/CELL)*(YRES/CELL)*4);
//calculate velocity map caused by a point mass
for (y=0; y<yblock2; y++)
{
for (x=0; x<xblock2; x++)
{
if (x==XRES/CELL && y==YRES/CELL) continue;
distance = sqrtf(pow(x-(XRES/CELL), 2) + pow(y-(YRES/CELL), 2));
th_ptgravx[y*xblock2+x] = scaleFactor*(x-(XRES/CELL)) / pow(distance, 3);
th_ptgravy[y*xblock2+x] = scaleFactor*(y-(YRES/CELL)) / pow(distance, 3);
}
}
th_ptgravx[yblock2*xblock2/2+xblock2/2] = 0.0f;
th_ptgravy[yblock2*xblock2/2+xblock2/2] = 0.0f;
//transform point mass velocity maps
fftwf_execute(plan_ptgravx);
fftwf_execute(plan_ptgravy);
fftwf_destroy_plan(plan_ptgravx);
fftwf_destroy_plan(plan_ptgravy);
fftwf_free(th_ptgravx);
fftwf_free(th_ptgravy);
//clear padded gravmap
memset(th_gravmapbig,0,xblock2*yblock2*sizeof(float));
grav_fft_status = 1;
}
void grav_fft_cleanup()
{
if (!grav_fft_status) return;
fftwf_free(th_ptgravxt);
fftwf_free(th_ptgravyt);
fftwf_free(th_gravmapbig);
fftwf_free(th_gravmapbigt);
fftwf_free(th_gravxbig);
fftwf_free(th_gravybig);
fftwf_free(th_gravxbigt);
fftwf_free(th_gravybigt);
fftwf_destroy_plan(plan_gravmap);
fftwf_destroy_plan(plan_gravx_inverse);
fftwf_destroy_plan(plan_gravy_inverse);
grav_fft_status = 0;
}
void update_grav()
{
int x, y, changed = 0;
for (y=0; y<YRES/CELL; y++)
{
if(changed)
break;
for (x=0; x<XRES/CELL; x++)
{
if(th_ogravmap[y][x]!=th_gravmap[y][x]){
changed = 1;
break;
}
}
}
if(changed)
{
int xblock2 = XRES/CELL*2, yblock2 = YRES/CELL*2;
int i, fft_tsize = (xblock2/2+1)*yblock2;
float mr, mc, pr, pc, gr, gc;
if (!grav_fft_status) grav_fft_init();
//copy gravmap into padded gravmap array
for (y=0; y<YRES/CELL; y++)
{
for (x=0; x<XRES/CELL; x++)
{
th_gravmapbig[(y+YRES/CELL)*xblock2+XRES/CELL+x] = th_gravmap[y][x];
}
}
//transform gravmap
fftwf_execute(plan_gravmap);
//do convolution (multiply the complex numbers)
for (i=0; i<fft_tsize; i++)
{
mr = th_gravmapbigt[i][0];
mc = th_gravmapbigt[i][1];
pr = th_ptgravxt[i][0];
pc = th_ptgravxt[i][1];
gr = mr*pr-mc*pc;
gc = mr*pc+mc*pr;
th_gravxbigt[i][0] = gr;
th_gravxbigt[i][1] = gc;
pr = th_ptgravyt[i][0];
pc = th_ptgravyt[i][1];
gr = mr*pr-mc*pc;
gc = mr*pc+mc*pr;
th_gravybigt[i][0] = gr;
th_gravybigt[i][1] = gc;
}
//inverse transform, and copy from padded arrays into normal velocity maps
fftwf_execute(plan_gravx_inverse);
fftwf_execute(plan_gravy_inverse);
for (y=0; y<YRES/CELL; y++)
{
for (x=0; x<XRES/CELL; x++)
{
th_gravx[y][x] = th_gravxbig[y*xblock2+x];
th_gravy[y][x] = th_gravybig[y*xblock2+x];
th_gravp[y][x] = sqrtf(pow(th_gravxbig[y*xblock2+x],2)+pow(th_gravybig[y*xblock2+x],2));
}
}
}
memcpy(th_ogravmap, th_gravmap, sizeof(th_gravmap));
bilinear_interpolation(th_gravy, th_gravyf, XRES/CELL, YRES/CELL, XRES, YRES);
bilinear_interpolation(th_gravx, th_gravxf, XRES/CELL, YRES/CELL, XRES, YRES);
bilinear_interpolation(th_gravp, th_gravpf, XRES/CELL, YRES/CELL, XRES, YRES);
}
#else
// gravity without fast Fourier transforms
void update_grav(void)
{
int x, y, i, j, changed = 0;
@ -211,6 +369,9 @@ fin:
memcpy(th_ogravmap, th_gravmap, sizeof(th_gravmap));
memset(th_gravmap, 0, sizeof(th_gravmap));
}
#endif
void update_air(void)
{
int x, y, i, j;

View File

@ -1450,6 +1450,9 @@ void* update_grav_async(void* unused)
memset(th_gravmap, 0, sizeof(th_gravmap));
memset(th_gravy, 0, sizeof(th_gravy));
memset(th_gravx, 0, sizeof(th_gravx));
#ifdef GRAVFFT
grav_fft_init();
#endif
while(!thread_done){
if(!done){
update_grav();
@ -1476,8 +1479,6 @@ void* update_grav_async(void* unused)
void start_grav_async()
{
if(!ngrav_enable){
/*pthread_mutexattr_t gma; //I do not know why this is here
pthread_mutexattr_init(&gma);*/
gravthread_done = 0;
pthread_mutex_init (&gravmutex, NULL);
pthread_cond_init(&gravcv, NULL);
@ -3617,6 +3618,9 @@ int main(int argc, char *argv[])
}
SDL_CloseAudio();
http_done();
#ifdef GRAVFFT
grav_fft_cleanup();
#endif
#ifdef LUACONSOLE
luacon_close();
#endif