// ## cp_fatan-cbe-spu.h (C99)
// ## Version 1.0
// ##                        
// ## Copyright (c) 2006 Mike Acton <macton@gmail.com>
// ##                        
// ## SIGNIFICANT REFERENCES:
// ##                        
// ##    [1] Cephes Math Library Release 2.8:  June, 2000
// ##        Copyright 1984, 1995, 2000, Stephen L. Moshier
// ##    [2] Numerical Computation Guide (PDF)
// ##        Copyright 2000, Sun Microsystems, Inc.
// ##    [3] IEEE 754 Support in C99 (PDF)
// ##        Copyright 2001, Jim Thomas
// ##    [4] Solaris 10 Reference Manual : atan2(3M)
// ##        Copyright 1994-2005, Sun Microsystems, Inc.
// ##                        
// ## Permission is hereby granted, free of charge, to any person obtaining
// ## a copy of this software and associated documentation files 
// ## (the "Software"), to deal in the Software without restriction, including
// ## without limitation the rights to use, copy, modify, merge, publish, 
// ## distribute, sublicense, and/or sell copies of the Software, and to permit
// ## persons to whom the Software is furnished to do so, subject to the 
// ## following conditions:
// ##                        
// ## The above copyright notice and this permission notice shall be included 
// ## in all copies or substantial portions of the Software.
// ##                        
// ## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 
// ## OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
// ## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
// ## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
// ## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
// ## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 
// ## THE SOFTWARE.
// ##                        

#ifndef CP_FATAN_CBE_SPU_H
#define CP_FATAN_CBE_SPU_H

#include <stdint.h>
#include <spu_intrinsics.h>

// ##                        
// ## Global Floating-point constants (32 bit)
// ##                        
// ## Constant is loaded in each element of 32 bit floating-point vector
// ## from local store.
// ##                        
// ## cp_flpio4()  +PI/+4
// ## cp_flt3p8()  tan( +3.0 * PI / +8.0 )
// ## cp_flnpio2() -PI/+2
// ## cp_flpio2()  +PI/+2
// ## cp_flpt66()  +0.66
// ## cp_flpi()    +PI
// ## cp_flnpi()   -PI

extern const vector unsigned int _cp_f_pio4;
extern const vector unsigned int _cp_f_t3p8;
extern const vector unsigned int _cp_f_npio2;
extern const vector unsigned int _cp_f_pio2;
extern const vector unsigned int _cp_f_pt66;
extern const vector unsigned int _cp_f_pi;
extern const vector unsigned int _cp_f_npi;

static inline qword
cp_flpio4( void )
{
    return si_lqa( (intptr_t)&_cp_f_pio4 );
}

static inline qword
cp_flt3p8( void )
{
    return si_lqa( (intptr_t)&_cp_f_t3p8 );
}

static inline qword
cp_flnpio2( void )
{
    return si_lqa( (intptr_t)&_cp_f_npio2 );
}

static inline qword
cp_flpio2( void )
{
    return si_lqa( (intptr_t)&_cp_f_pio2 );
}

static inline qword
cp_flpt66( void )
{
    return si_lqa( (intptr_t)&_cp_f_pt66 );
}

static inline qword
cp_flpi( void )
{
    return si_lqa( (intptr_t)&_cp_f_pi );
}

static inline qword
cp_flnpi( void )
{
    return si_lqa( (intptr_t)&_cp_f_npi );
}

// ##                        
// ## Load-Immediate Floating-point constants (32 bit)
// ##                        
// ## Constant is loaded in each element of 32 bit floating-point vector
// ## using immediate values. i.e. No loads
// ##                        
// ## cp_filzero()   +0.0  +0x00000000
// ## cp_filnzero()  -0.0  +0x80000000
// ## cp_filone()    +1.0  +0x3f800000
// ## cp_filtwo()    +2.0  +0x40000000
// ## cp_filinf()    +INF  +0x7f800000
// ## cp_filninf()   -INF  +0xff800000
// ## cp_filnan()     NaN  +0x7fc00000
// ##                        

static inline qword 
cp_filzero( void )
{
    return si_ilhu( (int16_t)0x0000 );
}

static inline qword 
cp_filnzero( void )
{
    return si_ilhu( (int16_t)0x8000 );
}

static inline qword 
cp_filone( void )
{
    return si_ilhu( (int16_t)0x3f80 );
}

static inline qword 
cp_filtwo( void )
{
    return si_ilhu( (int16_t)0x4000 );
}

static inline qword 
cp_filinf( void )
{
    return si_ilhu( (int16_t)0x7f80 );
}

static inline qword 
cp_filninf( void )
{
    return si_ilhu( (int16_t)0xff80 );
}

static inline qword 
cp_filnan( void )
{
    return si_ilhu( (int16_t)0x7fc0 );
}

// ##                        
// ## cp_fatan() Coefficients and other constants
// ##                        

extern const vector unsigned int _cp_f_atan_q4;
extern const vector unsigned int _cp_f_atan_q3;
extern const vector unsigned int _cp_f_atan_q2;
extern const vector unsigned int _cp_f_atan_q1;
extern const vector unsigned int _cp_f_atan_q0;
extern const vector unsigned int _cp_f_atan_p4;
extern const vector unsigned int _cp_f_atan_p3;
extern const vector unsigned int _cp_f_atan_p2;
extern const vector unsigned int _cp_f_atan_p1;
extern const vector unsigned int _cp_f_atan_p0;
extern const vector unsigned int _cp_f_hmorebits;
extern const vector unsigned int _cp_f_morebits;

// ## cp_fatan(x)
// ##                        
// ## 0     <= x           <= 0.66
// ## -PI/2 <= cp_fatan(x) <= +PI/2
// ##                        
// ## Each floating-point component of the result is a function of
// ## the corresponding components of x:
// ##
// ##    0.0                                             { x == 0.0
// ##                        
// ##    +PI                                             {
// ##    ---                                             { x == INF
// ##    2.0                                             {
// ##                        
// ##    -PI                                             {
// ##    ---                                             { x == -INF
// ##    2.0                                             {
// ##                        
// ##                           
// ##                   2      4      6     8            {
// ##           P  + P x  + P x  + P x + P x             {
// ##        2   0    1      2      3     4              {
// ##    x  x   ----------------------------------- + x  { otherwise
// ##                    2     4      6      8   10      {
// ##            Q  + Q x + Q x  + Q x  + Q x + x        {
// ##             0    1     2      3      4             {

static inline qword
_cp_fatan( const qword x )
{
    // ##                        
    // ## Load constants
    // ##                        
    
    const qword f_one           = cp_filone();
    const qword f_inf           = cp_filinf();
    const qword f_ninf          = cp_filninf();
    const qword f_msb           = cp_filnzero();
    const qword f_zero          = cp_filzero();

    const qword f_pt66          = si_lqa( (intptr_t)&_cp_f_pt66      );
    const qword f_pio2          = si_lqa( (intptr_t)&_cp_f_pio2      );
    const qword f_npio2         = si_lqa( (intptr_t)&_cp_f_npio2     );
    const qword f_pio4          = si_lqa( (intptr_t)&_cp_f_pio4      );
    const qword f_t3p8          = si_lqa( (intptr_t)&_cp_f_t3p8      );

    const qword f_atan_p0       = si_lqa( (intptr_t)&_cp_f_atan_p0    );
    const qword f_atan_p1       = si_lqa( (intptr_t)&_cp_f_atan_p1    );
    const qword f_atan_p2       = si_lqa( (intptr_t)&_cp_f_atan_p2    );
    const qword f_atan_p3       = si_lqa( (intptr_t)&_cp_f_atan_p3    );
    const qword f_atan_p4       = si_lqa( (intptr_t)&_cp_f_atan_p4    );
    const qword f_atan_q0       = si_lqa( (intptr_t)&_cp_f_atan_q0    );
    const qword f_atan_q1       = si_lqa( (intptr_t)&_cp_f_atan_q1    );
    const qword f_atan_q2       = si_lqa( (intptr_t)&_cp_f_atan_q2    );
    const qword f_atan_q3       = si_lqa( (intptr_t)&_cp_f_atan_q3    );
    const qword f_atan_q4       = si_lqa( (intptr_t)&_cp_f_atan_q4    );
    const qword f_morebits      = si_lqa( (intptr_t)&_cp_f_morebits  );
    const qword f_hmorebits     = si_lqa( (intptr_t)&_cp_f_hmorebits );
    
    // ##                        
    // ## pos_x = -x            { x < 0
    // ##          x            { otherwise
    // ##                        
    
    const qword neg_x           = si_xor( x, f_msb );          
    const qword sign_mask       = si_fcgt( f_zero, x );
    const qword pos_x           = si_selb( x, neg_x, sign_mask );
    
    // ##                        
    // ## Range reduction
    // ##                        
    
    // ##                        
    // ## range0_mask = ( pos_x > tan( 3.0 * PI / 8.0 ) )
    // ## range1_mask = ( pos_x <= 0.66 )
    // ## range2_mask = !( range0_mask || range1_mask )
    // ##                        
    
    const qword range0_mask     = si_fcgt( pos_x, f_t3p8 );
    const qword range1_gt_mask  = si_fcgt( f_pt66, pos_x );
    const qword range1_eq_mask  = si_fceq( f_pt66, pos_x );
    const qword range1_mask     = si_or( range1_gt_mask, range1_eq_mask );
    const qword range2_mask     = si_nor( range0_mask, range1_mask );
    
    // ##                        
    // ## range0_x = -1.0 
    // ##            -----
    // ##            pos_x
    // ##                        
    // ## range0_y = PI
    // ##            ---
    // ##            2.0
    // ##                        
    
    const qword range0_x0       = si_frest( pos_x );
    const qword range0_x1       = si_fi( pos_x, range0_x0 );
    const qword range0_x2       = si_fnms( range0_x1, pos_x, f_one );
    const qword range0_x3       = si_fma( range0_x2, range0_x1, range0_x1 );
    const qword range0_x        = si_xor( range0_x3, f_msb );
    const qword range0_y        = f_pio2;
    
    // ##                        
    // ## range1_x = pos_x
    // ## range1_y = 0.0
    // ##                        
    
    const qword range1_x        = pos_x;
    const qword range1_y        = f_zero;
    
    
    // ##                        
    // ## range2_x = (pos_x-1.0)
    // ##            -----------
    // ##            (pos_x+1.0)
    // ##                        
    // ## range2_y = PI
    // ##            ---
    // ##            4.0
    // ##                        
    
    const qword range2_y        = f_pio4;
    const qword range2_x0num    = si_fs( pos_x, f_one );
    const qword range2_x0den    = si_fa( pos_x, f_one );
    const qword range2_x0       = si_frest( range2_x0den );
    const qword range2_x1       = si_fnms( range2_x0, range2_x0den, f_one );
    const qword range2_x2       = si_fma( range2_x1, range2_x0, range2_x0 );
    const qword range2_x        = si_fm( range2_x0num, range2_x2 );
    
    // ##                        
    // ## range_x  = range0_x { range0_mask
    // ##            range1_x { range1_mask
    // ##            range2_x { range2_mask
    // ##                        
    // ## range_y  = range0_y { range0_mask
    // ##            range1_y { range1_mask
    // ##            range2_y { range2_mask
    // ##                        
    
    const qword range_x0        = si_selb( range2_x, range0_x, range0_mask );
    const qword range_x         = si_selb( range_x0, range1_x, range1_mask );
    const qword range_y0        = si_selb( range2_y, range0_y, range0_mask );
    const qword range_y         = si_selb( range_y0, range1_y, range1_mask );
    
    // ##                        
    // ##                  2
    // ## xp2    =  range_x 
    // ##                             2        3     4
    // ##           P  + P xp2 + P xp2  + P xp2 + P xp2
    // ##            0    1       2        3       4
    // ## zdiv   =  ------------------------------------------
    // ##                             2        3       4     5
    // ##           Q  + Q xp2 + Q xp2  + Q xp2 + Q xp2 + xp2
    // ##            0    1       2        3       4 
    // ## 
    // ## z1     = range_x * ( xp2 * zdiv ) + range_x
    // ## 
    
    const qword xp2             = si_fm( range_x, range_x );
    const qword znum0           = f_atan_p0;
    const qword znum1           = si_fma( znum0, xp2, f_atan_p1 );
    const qword znum2           = si_fma( znum1, xp2, f_atan_p2 );
    const qword znum3           = si_fma( znum2, xp2, f_atan_p3 );
    const qword znum            = si_fma( znum3, xp2, f_atan_p4 );
    const qword zden0           = si_fa( xp2, f_atan_q0 );
    const qword zden1           = si_fma( zden0, xp2, f_atan_q1 );
    const qword zden2           = si_fma( zden1, xp2, f_atan_q2 );
    const qword zden3           = si_fma( zden2, xp2, f_atan_q3 );
    const qword zden            = si_fma( zden3, xp2, f_atan_q4 );
    const qword zden_r0         = si_frest( zden );
    const qword zden_r1         = si_fnms( zden_r0, zden, f_one );
    const qword zden_r          = si_fma( zden_r1, zden_r0, zden_r0 );
    const qword zdiv            = si_fm( znum, zden_r );
    const qword z0              = si_fm( xp2, zdiv );
    const qword z1              = si_fma( range_x, z0, range_x );
    
    // ##                        
    // ## zadd      =  z1 + 0.5 * MOREBITS { range2_mask
    // ##              z1 + MOREBITS       { range1_mask
    // ##              z1                  { otherwise
    // ##                        
    // ## yaddz     = range_y + zadd
    // ##                        
    // ## pos_yaddz = yaddz      { yaddz >= 0
    // ##             -yaddz     { yaddz <  0
    // ##                        

    const qword zadd0           = si_selb( f_zero, f_hmorebits, range2_mask );
    const qword zadd1           = si_selb( zadd0,  f_morebits,  range1_mask );
    const qword zadd            = si_fa( z1, zadd1 );
    const qword yaddz           = si_fa( range_y, zadd );
    const qword neg_yaddz       = si_xor( yaddz, f_msb );
    const qword pos_yaddz       = si_selb( yaddz,  neg_yaddz,  sign_mask );
    
    // ##                        
    // ## result_y0 = 0.0        { x == 0.0
    // ##             pos_yaddz  { otherwise
    // ##                        
    
    const qword x_eqz_mask      = si_fceq( f_zero, x );
    const qword result_y0       = si_selb( pos_yaddz, x, x_eqz_mask );

    // ##                        
    // ## result_y2 = +PI         {
    // ##             ---         { x == INF
    // ##             2.0         {
    // ##                        
    // ##             -PI         {
    // ##             ---         { x == -INF
    // ##             2.0         {
    // ##                        
    // ##             result_y0   { otherwise
    // ##                        

    const qword x_eqinf_mask    = si_fceq( f_inf,  x );
    const qword x_eqninf_mask   = si_fceq( f_ninf, x );
    const qword result_y1       = si_selb( result_y0, f_pio2,  x_eqinf_mask );
    const qword result          = si_selb( result_y1, f_npio2, x_eqninf_mask );

    return (result);
}

static inline vector float
cp_fatan( const vector float x )
{
    return (vector float)( _cp_fatan( (qword)x ) );
}

static inline float
cp_fatan_scalar( const float x )
{
    const qword vx      = si_from_float( x );
    const qword vresult = _cp_fatan( vx );
    const float result  = si_to_float( vresult );

    return (result);
}

// ## cp_fatan2(y,x)
// ## 
// ## -INF <= x              <= INF
// ## -INF <= y              <= INF
// ## -PI  <= cp_fatan2(y,x) <= +PI
// ##                        
// ## Each floating-point component of the result is a function of
// ## the corresponding components of y and x:
// ##                        
// ##     +PI                  { (y == +0.0) && (x < 0.0)
// ##                        
// ##     -PI                  { (y == -0.0) && (x < 0.0)
// ##     
// ##     +0.0                 { (y == +0.0) && (x > 0.0)
// ##     
// ##     -0.0                 { (y == -0.0) && (x > 0.0)
// ##      
// ##     -PI                  {
// ##     ----                 { (y < 0.0) && (x == 0.0)
// ##     +2.0                 {
// ##     
// ##     +PI                  {
// ##     ----                 { (y > 0.0) && (x == 0.0)
// ##     +2.0                 {
// ##     
// ##     NaN                  { (y == NaN) || (x == NaN) 
// ##     
// ##     +PI                  { (y == +0.0) && (x == -0.0)
// ##                        
// ##     -PI                  { (y == -0.0) && (x == -0.0)
// ##                        
// ##     +0.0                 { (y == +0.0) && (x == +0.0)
// ##                        
// ##     -0.0                 { (y == -0.0) && (x == +0.0)
// ##                        
// ##     +PI                  {
// ##     ---                  { (y == +INF) && (x == +INF)
// ##     4.0                  {
// ##                        
// ##     -PI                  {
// ##     ---                  { (y == -INF) && (x == +INF)
// ##     4.0                  {
// ##                        
// ##     +3.0 PI              {
// ##     -------              { (y == +INF) && (x == -INF)
// ##     +4.0                 {
// ##                        
// ##     -3.0 PI              {
// ##     -------              { (y == -INF) && (x == -INF)
// ##     +4.0                 {
// ##                        
// ##     +PI                  { isfinite(y) && (+y > 0) && (x == -INF)
// ##                        
// ##     -PI                  { isfinite(y) && (-y > 0) && (x == -INF)
// ##                        
// ##     +0.0                 { isfinite(y) && (+y > 0) && (x == +INF)
// ##                        
// ##     -0.0                 { isfinite(y) && (-y > 0) && (x == +INF)
// ##                        
// ##     +PI                  {
// ##     ----                 { (isfinite(x) && (y == +INF)
// ##     +2.0                 {
// ##                        
// ##     -PI                  {
// ##     ---                  { (isfinite(x) && (y == -INF)
// ##     +2.0                 {
// ##                        
// ##                   ( y )  {
// ##     +PI  + cp_atan( - )  { ( x <  0.0 ) && ( y >= 0.0 )
// ##                   ( x )  {
// ##                                     
// ##                   ( y )  {
// ##     -PI  + cp_atan( - )  { ( x <  0.0 ) && ( y < 0.0 )
// ##                   ( x )  {
// ##                                     
// ##                   ( y )  {
// ##     +0.0 + cp_atan( - )  { otherwise
// ##                   ( x )  {
// ##                                     

qword _cp_fatan2( qword y, qword x )
{
    const qword f_one       = cp_filone();
    const qword f_zero      = cp_filzero();
    const qword f_pi        = si_lqa( (intptr_t)&_cp_f_pi  );
    const qword f_npi       = si_lqa( (intptr_t)&_cp_f_npi );

    // ##                        
    // ## yox = y
    // ##       -
    // ##       x
    // ##                        
    // ## z   = +PI + cp_atan( yox ) { ( x <  0.0 ) && ( y >= 0.0 )
    // ##       -PI + cp_atan( yox ) { ( x <  0.0 ) && ( y <  0.0 )
    // ##       0.0 + cp_atan( yox ) { otherwise

    const qword x_ltz_mask  = si_fcgt( f_zero, x );
    const qword y_ltz_mask  = si_fcgt( f_zero, y );
    const qword xy_ltz_mask = si_and( x_ltz_mask, y_ltz_mask );
    const qword zadd0       = si_selb( f_zero, f_pi, x_ltz_mask );
    const qword zadd        = si_selb( zadd0, f_npi, xy_ltz_mask );
    const qword x_r0        = si_frest( x );
    const qword x_r1        = si_fnms( x_r0, x, f_one );
    const qword x_r         = si_fma( x_r1, x_r0, x_r0 );
    const qword yox         = si_fm( y, x_r );
    const qword atan_yox    = _cp_fatan( yox );
    const qword result      = si_fa( zadd, atan_yox );

    return (result);
}

vector float cp_fatan2( vector float arg0 /* y */, vector float arg1 /* x */ )
{
    const qword y           = (qword)arg0;
    const qword x           = (qword)arg1;
    const qword result      = _cp_fatan2( y, x );

    return (vector float)(result);
}

float cp_fatan2_scalar( float arg0 /* y */, float arg1 /* x */ )
{
    const qword y           = si_from_float( arg0 );
    const qword x           = si_from_float( arg1 );
    const qword z           = _cp_fatan2( y, x );
    const float result      = si_to_float( z );

    return( result );
}

#endif /* CP_FATAN_CBE_SPU_H */

