Tim Allen 8c0b0fa4ad Update to v093r02 release.
byuu says:

Changelog:
- nall: fixed major memory leak in string class
- ruby: video shaders support #define-based settings now
- phoenix/GTK+: support > 256x256 icons for window / task bar / alt-tab
- sfc: remove random/ and config/, merge into system/
- ethos: delete higan.png (48x48), replace with higan512.png (512x512)
  as new higan.png
- ethos: default gamma to 100% (no color adjustment)
- ethos: use "Video Shaders/Display Emulation/" instead of "Video
  Shaders/Emulation/"
- use g++ instead of g++-4.7 (g++ -v must be >= 4.7)
- use -std=c++11 instead of -std=gnu++11
- applied a few patches from Debian upstream to make their packaging job
  easier

So because colors are normalized in GLSL, I won't be able to offer video
shaders absolute color literals. We will have to perform basic color
conversion inside the core.

As such, the current plan is to create some sort of Emulator::Settings
interface. With that, I'll connect an option for color correction, which
will be on by default. For FC/SFC, that will mean gamma correction
(darker / stronger colors), and for GB/GBC/GBA, it will mean simulating
the weird brightness levels of the displays. I am undecided on whether
to use pea soup green for the GB or not. By not doing so, it'll be
easier for the display emulation shader to do it.
2013-11-09 22:45:54 +11:00

601 lines
18 KiB
C++

// If these types are changed to anything other than "float", you should comment out the SSE detection directives below
// so that the SSE code is not used.
typedef float resample_coeff_t; // note: sizeof(resample_coeff_t) must be == to a power of 2, and not larger than 16
typedef float resample_samp_t;
// ...but don't comment this single RESAMPLE_SSEREGPARM define out when disabling SSE.
#define RESAMPLE_SSEREGPARM
#if defined(__SSE__)
#define SINCRESAMPLE_USE_SSE 1
#ifndef __x86_64__
#undef RESAMPLE_SSEREGPARM
#define RESAMPLE_SSEREGPARM __attribute__((sseregparm))
#endif
#else
// TODO: altivec here
#endif
namespace ResampleUtility
{
inline void kaiser_window(double* io, int count, double beta);
inline void gen_sinc(double* out, int size, double cutoff, double kaiser);
inline void gen_sinc_os(double* out, int size, double cutoff, double kaiser);
inline void normalize(double* io, int size, double gain = 1.0);
inline void* make_aligned(void* ptr, unsigned boundary); // boundary must be a power of 2
}
class SincResampleHR
{
private:
inline void Init(unsigned ratio_arg, double desired_bandwidth, double beta, double d);
inline void write(resample_samp_t sample) RESAMPLE_SSEREGPARM;
inline resample_samp_t read(void) RESAMPLE_SSEREGPARM;
inline bool output_avail(void);
private:
inline resample_samp_t mac(const resample_samp_t *wave, const resample_coeff_t *coeff, unsigned count);
unsigned ratio;
unsigned num_convolutions;
resample_coeff_t *coeffs;
std::vector<unsigned char> coeffs_mem;
// second half of ringbuffer should be copy of first half.
resample_samp_t *rb;
std::vector<unsigned char> rb_mem;
signed rb_readpos;
signed rb_writepos;
signed rb_in;
signed rb_eff_size;
friend class SincResample;
};
class SincResample
{
public:
enum
{
QUALITY_LOW = 0,
QUALITY_MEDIUM = 2,
QUALITY_HIGH = 4
};
inline SincResample(double input_rate, double output_rate, double desired_bandwidth, unsigned quality = QUALITY_HIGH);
inline void write(resample_samp_t sample) RESAMPLE_SSEREGPARM;
inline resample_samp_t read(void) RESAMPLE_SSEREGPARM;
inline bool output_avail(void);
private:
inline void Init(double input_rate, double output_rate, double desired_bandwidth, double beta, double d, unsigned pn_nume, unsigned phases_min);
inline resample_samp_t mac(const resample_samp_t *wave, const resample_coeff_t *coeffs_a, const resample_coeff_t *coeffs_b, const double ffract, unsigned count) RESAMPLE_SSEREGPARM;
unsigned num_convolutions;
unsigned num_phases;
unsigned step_int;
double step_fract;
double input_pos_fract;
std::vector<resample_coeff_t *> coeffs; // Pointers into coeff_mem.
std::vector<unsigned char> coeff_mem;
std::vector<resample_samp_t> rb; // second half should be copy of first half.
signed rb_readpos;
signed rb_writepos;
signed rb_in;
bool hr_used;
SincResampleHR hr;
};
//
// Code:
//
//#include "resample.hpp"
#if 0
namespace bit
{
inline unsigned round(unsigned x) {
if((x & (x - 1)) == 0) return x;
while(x & (x - 1)) x &= x - 1;
return x << 1;
}
}
#endif
void SincResampleHR::Init(unsigned ratio_arg, double desired_bandwidth, double beta, double d)
{
const unsigned align_boundary = 16;
std::vector<double> coeffs_tmp;
double cutoff; // 1.0 = f/2
ratio = ratio_arg;
//num_convolutions = ((unsigned)ceil(d / ((1.0 - desired_bandwidth) / ratio)) + 1) &~ 1; // round up to be even
num_convolutions = ((unsigned)ceil(d / ((1.0 - desired_bandwidth) / ratio)) | 1);
cutoff = (1.0 / ratio) - (d / num_convolutions);
//printf("%d %d %.20f\n", ratio, num_convolutions, cutoff);
assert(num_convolutions > ratio);
// Generate windowed sinc of POWER
coeffs_tmp.resize(num_convolutions);
//ResampleUtility::gen_sinc(&coeffs_tmp[0], num_convolutions, cutoff, beta);
ResampleUtility::gen_sinc_os(&coeffs_tmp[0], num_convolutions, cutoff, beta);
ResampleUtility::normalize(&coeffs_tmp[0], num_convolutions);
// Copy from coeffs_tmp to coeffs~
// We multiply many coefficients at a time in the mac loop, so make sure the last few that don't really
// exist are allocated, zero'd mem.
coeffs_mem.resize(((num_convolutions + 7) &~ 7) * sizeof(resample_coeff_t) + (align_boundary - 1));
coeffs = (resample_coeff_t *)ResampleUtility::make_aligned(&coeffs_mem[0], align_boundary);
for(unsigned i = 0; i < num_convolutions; i++)
coeffs[i] = coeffs_tmp[i];
rb_eff_size = nall::bit::round(num_convolutions * 2) >> 1;
rb_readpos = 0;
rb_writepos = 0;
rb_in = 0;
rb_mem.resize(rb_eff_size * 2 * sizeof(resample_samp_t) + (align_boundary - 1));
rb = (resample_samp_t *)ResampleUtility::make_aligned(&rb_mem[0], align_boundary);
}
inline bool SincResampleHR::output_avail(void)
{
return(rb_in >= (signed)num_convolutions);
}
inline void SincResampleHR::write(resample_samp_t sample)
{
assert(!output_avail());
rb[rb_writepos] = sample;
rb[rb_writepos + rb_eff_size] = sample;
rb_writepos = (rb_writepos + 1) & (rb_eff_size - 1);
rb_in++;
}
resample_samp_t SincResampleHR::mac(const resample_samp_t *wave, const resample_coeff_t *coeff, unsigned count)
{
#if SINCRESAMPLE_USE_SSE
__m128 accum_veca[2] = { _mm_set1_ps(0), _mm_set1_ps(0) };
resample_samp_t accum;
for(unsigned c = 0; c < count; c += 8)
{
for(unsigned i = 0; i < 2; i++)
{
__m128 co[2];
__m128 w[2];
co[i] = _mm_load_ps(&coeff[c + i * 4]);
w[i] = _mm_load_ps(&wave[c + i * 4]);
w[i] = _mm_mul_ps(w[i], co[i]);
accum_veca[i] = _mm_add_ps(w[i], accum_veca[i]);
}
}
__m128 accum_vec = _mm_add_ps(accum_veca[0], accum_veca[1]); //_mm_add_ps(_mm_add_ps(accum_veca[0], accum_veca[1]), _mm_add_ps(accum_veca[2], accum_veca[3]));
accum_vec = _mm_add_ps(accum_vec, _mm_shuffle_ps(accum_vec, accum_vec, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6)));
accum_vec = _mm_add_ps(accum_vec, _mm_shuffle_ps(accum_vec, accum_vec, (1 << 0) | (0 << 2) | (1 << 4) | (0 << 6)));
_mm_store_ss(&accum, accum_vec);
return accum;
#else
resample_samp_t accum[4] = { 0, 0, 0, 0 };
for(unsigned c = 0; c < count; c+= 4)
{
accum[0] += wave[c + 0] * coeff[c + 0];
accum[1] += wave[c + 1] * coeff[c + 1];
accum[2] += wave[c + 2] * coeff[c + 2];
accum[3] += wave[c + 3] * coeff[c + 3];
}
return (accum[0] + accum[1]) + (accum[2] + accum[3]); // don't mess with parentheses(assuming compiler doesn't already, which it may...
#endif
}
resample_samp_t SincResampleHR::read(void)
{
assert(output_avail());
resample_samp_t ret;
ret = mac(&rb[rb_readpos], &coeffs[0], num_convolutions);
rb_readpos = (rb_readpos + ratio) & (rb_eff_size - 1);
rb_in -= ratio;
return ret;
}
SincResample::SincResample(double input_rate, double output_rate, double desired_bandwidth, unsigned quality)
{
const struct
{
double beta;
double d;
unsigned pn_nume;
unsigned phases_min;
} qtab[5] =
{
{ 5.658, 3.62, 4096, 4 },
{ 6.764, 4.32, 8192, 4 },
{ 7.865, 5.0, 16384, 8 },
{ 8.960, 5.7, 32768, 16 },
{ 10.056, 6.4, 65536, 32 }
};
// Sanity checks
assert(ceil(input_rate) > 0);
assert(ceil(output_rate) > 0);
assert(ceil(input_rate / output_rate) <= 1024);
assert(ceil(output_rate / input_rate) <= 1024);
// The simplistic number-of-phases calculation code doesn't work well enough for when desired_bandwidth is close to 1.0 and when
// upsampling.
assert(desired_bandwidth >= 0.25 && desired_bandwidth < 0.96);
assert(quality >= 0 && quality <= 4);
hr_used = false;
#if 1
// Round down to the nearest multiple of 4(so wave buffer remains aligned)
// It also adjusts the effective intermediate sampling rate up slightly, so that the upper frequencies below f/2
// aren't overly attenuated so much. In the future, we might want to do an FFT or something to choose the intermediate rate more accurately
// to virtually eliminate over-attenuation.
unsigned ioratio_rd = (unsigned)floor(input_rate / (output_rate * (1.0 + (1.0 - desired_bandwidth) / 2) )) & ~3;
if(ioratio_rd >= 8)
{
hr.Init(ioratio_rd, desired_bandwidth, qtab[quality].beta, qtab[quality].d); //10.056, 6.4);
hr_used = true;
input_rate /= ioratio_rd;
}
#endif
Init(input_rate, output_rate, desired_bandwidth, qtab[quality].beta, qtab[quality].d, qtab[quality].pn_nume, qtab[quality].phases_min);
}
void SincResample::Init(double input_rate, double output_rate, double desired_bandwidth, double beta, double d, unsigned pn_nume, unsigned phases_min)
{
const unsigned max_mult_atatime = 8; // multiply "granularity". must be power of 2.
const unsigned max_mult_minus1 = (max_mult_atatime - 1);
const unsigned conv_alignment_bytes = 16; // must be power of 2
const double input_to_output_ratio = input_rate / output_rate;
const double output_to_input_ratio = output_rate / input_rate;
double cutoff; // 1.0 = input_rate / 2
std::vector<double> coeff_init_buffer;
// Round up num_convolutions to be even.
if(output_rate > input_rate)
num_convolutions = ((unsigned)ceil(d / (1.0 - desired_bandwidth)) + 1) & ~1;
else
num_convolutions = ((unsigned)ceil(d / (output_to_input_ratio * (1.0 - desired_bandwidth))) + 1) & ~1;
if(output_rate > input_rate) // Upsampling
cutoff = desired_bandwidth;
else // Downsampling
cutoff = output_to_input_ratio * desired_bandwidth;
// Round up to be even.
num_phases = (std::max<unsigned>(pn_nume / num_convolutions, phases_min) + 1) &~1;
// Adjust cutoff to account for the multiple phases.
cutoff = cutoff / num_phases;
assert((num_convolutions & 1) == 0);
assert((num_phases & 1) == 0);
// fprintf(stderr, "num_convolutions=%u, num_phases=%u, total expected coeff byte size=%lu\n", num_convolutions, num_phases,
// (long)((num_phases + 2) * ((num_convolutions + max_mult_minus1) & ~max_mult_minus1) * sizeof(float) + conv_alignment_bytes));
coeff_init_buffer.resize(num_phases * num_convolutions);
coeffs.resize(num_phases + 1 + 1);
coeff_mem.resize((num_phases + 1 + 1) * ((num_convolutions + max_mult_minus1) &~ max_mult_minus1) * sizeof(resample_coeff_t) + conv_alignment_bytes);
// Assign aligned pointers into coeff_mem
{
resample_coeff_t *base_ptr = (resample_coeff_t *)ResampleUtility::make_aligned(&coeff_mem[0], conv_alignment_bytes);
for(unsigned phase = 0; phase < (num_phases + 1 + 1); phase++)
{
coeffs[phase] = base_ptr + (((num_convolutions + max_mult_minus1) & ~max_mult_minus1) * phase);
}
}
ResampleUtility::gen_sinc(&coeff_init_buffer[0], num_phases * num_convolutions, cutoff, beta);
ResampleUtility::normalize(&coeff_init_buffer[0], num_phases * num_convolutions, num_phases);
// Reorder coefficients to allow for more efficient convolution.
for(int phase = -1; phase < ((int)num_phases + 1); phase++)
{
for(int conv = 0; conv < (int)num_convolutions; conv++)
{
double coeff;
if(phase == -1 && conv == 0)
coeff = 0;
else if(phase == (int)num_phases && conv == ((int)num_convolutions - 1))
coeff = 0;
else
coeff = coeff_init_buffer[conv * num_phases + phase];
coeffs[phase + 1][conv] = coeff;
}
}
// Free a bit of mem
coeff_init_buffer.resize(0);
step_int = floor(input_to_output_ratio);
step_fract = input_to_output_ratio - step_int;
input_pos_fract = 0;
// Do NOT use rb.size() later in the code, since it'll include the padding.
// We should only need one "max_mult_minus1" here, not two, since it won't matter if it over-reads(due to doing "max_mult_atatime" multiplications at a time
// rather than just 1, in which case this over-read wouldn't happen), from the first half into the duplicated half,
// since those corresponding coefficients will be zero anyway; this is just to handle the case of reading off the end of the duplicated half to
// prevent illegal memory accesses.
rb.resize(num_convolutions * 2 + max_mult_minus1);
rb_readpos = 0;
rb_writepos = 0;
rb_in = 0;
}
resample_samp_t SincResample::mac(const resample_samp_t *wave, const resample_coeff_t *coeffs_a, const resample_coeff_t *coeffs_b, const double ffract, unsigned count)
{
resample_samp_t accum = 0;
#if SINCRESAMPLE_USE_SSE
__m128 accum_vec_a[2] = { _mm_set1_ps(0), _mm_set1_ps(0) };
__m128 accum_vec_b[2] = { _mm_set1_ps(0), _mm_set1_ps(0) };
for(unsigned c = 0; c < count; c += 8) //8) //4)
{
__m128 coeff_a[2];
__m128 coeff_b[2];
__m128 w[2];
__m128 result_a[2], result_b[2];
for(unsigned i = 0; i < 2; i++)
{
coeff_a[i] = _mm_load_ps(&coeffs_a[c + (i * 4)]);
coeff_b[i] = _mm_load_ps(&coeffs_b[c + (i * 4)]);
w[i] = _mm_loadu_ps(&wave[c + (i * 4)]);
result_a[i] = _mm_mul_ps(coeff_a[i], w[i]);
result_b[i] = _mm_mul_ps(coeff_b[i], w[i]);
accum_vec_a[i] = _mm_add_ps(result_a[i], accum_vec_a[i]);
accum_vec_b[i] = _mm_add_ps(result_b[i], accum_vec_b[i]);
}
}
__m128 accum_vec, av_a, av_b;
__m128 mult_a_vec = _mm_set1_ps(1.0 - ffract);
__m128 mult_b_vec = _mm_set1_ps(ffract);
av_a = _mm_mul_ps(mult_a_vec, /*accum_vec_a[0]);*/ _mm_add_ps(accum_vec_a[0], accum_vec_a[1]));
av_b = _mm_mul_ps(mult_b_vec, /*accum_vec_b[0]);*/ _mm_add_ps(accum_vec_b[0], accum_vec_b[1]));
accum_vec = _mm_add_ps(av_a, av_b);
accum_vec = _mm_add_ps(accum_vec, _mm_shuffle_ps(accum_vec, accum_vec, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6)));
accum_vec = _mm_add_ps(accum_vec, _mm_shuffle_ps(accum_vec, accum_vec, (1 << 0) | (0 << 2) | (1 << 4) | (0 << 6)));
_mm_store_ss(&accum, accum_vec);
#else
resample_coeff_t mult_a = 1.0 - ffract;
resample_coeff_t mult_b = ffract;
for(unsigned c = 0; c < count; c += 4)
{
accum += wave[c + 0] * (coeffs_a[c + 0] * mult_a + coeffs_b[c + 0] * mult_b);
accum += wave[c + 1] * (coeffs_a[c + 1] * mult_a + coeffs_b[c + 1] * mult_b);
accum += wave[c + 2] * (coeffs_a[c + 2] * mult_a + coeffs_b[c + 2] * mult_b);
accum += wave[c + 3] * (coeffs_a[c + 3] * mult_a + coeffs_b[c + 3] * mult_b);
}
#endif
return accum;
}
inline bool SincResample::output_avail(void)
{
return(rb_in >= (int)num_convolutions);
}
resample_samp_t SincResample::read(void)
{
assert(output_avail());
double phase = input_pos_fract * num_phases - 0.5;
signed phase_int = (signed)floor(phase);
double phase_fract = phase - phase_int;
unsigned phase_a = num_phases - 1 - phase_int;
unsigned phase_b = phase_a - 1;
resample_samp_t ret;
ret = mac(&rb[rb_readpos], &coeffs[phase_a + 1][0], &coeffs[phase_b + 1][0], phase_fract, num_convolutions);
unsigned int_increment = step_int;
input_pos_fract += step_fract;
int_increment += floor(input_pos_fract);
input_pos_fract -= floor(input_pos_fract);
rb_readpos = (rb_readpos + int_increment) % num_convolutions;
rb_in -= int_increment;
return ret;
}
inline void SincResample::write(resample_samp_t sample)
{
assert(!output_avail());
if(hr_used)
{
hr.write(sample);
if(hr.output_avail())
{
sample = hr.read();
}
else
{
return;
}
}
rb[rb_writepos + 0 * num_convolutions] = sample;
rb[rb_writepos + 1 * num_convolutions] = sample;
rb_writepos = (rb_writepos + 1) % num_convolutions;
rb_in++;
}
void ResampleUtility::kaiser_window( double* io, int count, double beta)
{
int const accuracy = 24; //16; //12;
double* end = io + count;
double beta2 = beta * beta * (double) -0.25;
double to_fract = beta2 / ((double) count * count);
double i = 0;
double rescale = 0; // Doesn't need an initializer, to shut up gcc
for ( ; io < end; ++io, i += 1 )
{
double x = i * i * to_fract - beta2;
double u = x;
double k = x + 1;
double n = 2;
do
{
u *= x / (n * n);
n += 1;
k += u;
}
while ( k <= u * (1 << accuracy) );
if ( !i )
rescale = 1 / k; // otherwise values get large
*io *= k * rescale;
}
}
void ResampleUtility::gen_sinc(double* out, int size, double cutoff, double kaiser)
{
assert( size % 2 == 0 ); // size must be even
int const half_size = size / 2;
double* const mid = &out [half_size];
// Generate right half of sinc
for ( int i = 0; i < half_size; i++ )
{
double angle = (i * 2 + 1) * (Math::Pi / 2);
mid [i] = sin( angle * cutoff ) / angle;
}
kaiser_window( mid, half_size, kaiser );
// Mirror for left half
for ( int i = 0; i < half_size; i++ )
out [i] = mid [half_size - 1 - i];
}
void ResampleUtility::gen_sinc_os(double* out, int size, double cutoff, double kaiser)
{
assert( size % 2 == 1); // size must be odd
for(int i = 0; i < size; i++)
{
if(i == (size / 2))
out[i] = 2 * Math::Pi * (cutoff / 2); //0.078478; //1.0; //sin(2 * M_PI * (cutoff / 2) * (i - size / 2)) / (i - (size / 2));
else
out[i] = sin(2 * Math::Pi * (cutoff / 2) * (i - size / 2)) / (i - (size / 2));
// out[i] *= 0.3635819 - 0.4891775 * cos(2 * M_PI * i / (size - 1)) + 0.1365995 * cos(4 * M_PI * i / (size - 1)) - 0.0106411 * cos(6 * M_PI * i / (size - 1));
//0.42 - 0.5 * cos(2 * M_PI * i / (size - 1)) + 0.08 * cos(4 * M_PI * i / (size - 1));
// printf("%d %f\n", i, out[i]);
}
kaiser_window(&out[size / 2], size / 2 + 1, kaiser);
// Mirror for left half
for ( int i = 0; i < size / 2; i++ )
out [i] = out [size - 1 - i];
}
void ResampleUtility::normalize(double* io, int size, double gain)
{
double sum = 0;
for ( int i = 0; i < size; i++ )
sum += io [i];
double scale = gain / sum;
for ( int i = 0; i < size; i++ )
io [i] *= scale;
}
void* ResampleUtility::make_aligned(void* ptr, unsigned boundary)
{
unsigned char* null_ptr = (unsigned char *)nullptr;
unsigned char* uc_ptr = (unsigned char *)ptr;
uc_ptr += (boundary - ((uc_ptr - null_ptr) & (boundary - 1))) & (boundary - 1);
//while((uc_ptr - null_ptr) & (boundary - 1))
// uc_ptr++;
//printf("%16llx %16llx\n", (unsigned long long)ptr, (unsigned long long)uc_ptr);
assert((uc_ptr - (unsigned char *)ptr) < boundary && (uc_ptr >= (unsigned char *)ptr));
return uc_ptr;
}