当前NVIDA的GPU芯片仅支持单精度(float)浮点运算,对一些应用来说精度可能不够用,一些关键的步骤可能需要双精度运算,才能保证程序的正常执行。对此,本人尝试用两个单精度浮点数数来代表一个双精度浮点数:
//类型定义, 对于GPU编程,我们可以直接用内置类型float2,
这里的定义只是为了清晰起见。
- typedef struct { float x, y;} float2
- 双精度和float2之间的转换
//双精度到float2的转换(有溢出的风险)
float2 double_to_float2(double val)
{
float2 ret;
ret.x = (float)val;
ret.y = (float)(val - (double)ret.x);
return ret;
}
//float2到双精度的转换
double float2_to_double(float2 val)
{
double ret;
ret = val.x;
ret += val.y;
return ret;
}
下一步就是关于 float2
类型的加减乘除的运算了(在GPU上实现)。说明,浮点运算中,括号是至关重要的;浮点数运算不满足结合律。
//加法: float2 + float2
static inline __device__ float2 add(float2 dsa, float 2 dsb)
{
float2
dsc;
float t1,
t2, e;
t1 =
dsa.x + dsb.x;
e = t1 -
dsa.x;
t2 =
((dsb.x -e) + ( dsa.x - (t1 -e))) + dsa.y + dsb.y;
dsc.x =
t1 + t2;
dsc.y =
t2 -(dsc.x -t1);
return
dsc;
}
//减法: float2 - float2
static inline __device__ float2 sub(float2 dsa, float2 dsb)
{
float2 dsc;
float t1, t2, e;
t1 = dsa.x - dsb.x;
e = t1 - dsa.x;
t2 = ((-dsb.x -e) + ( dsa.x - (t1 -e))) + dsa.y -
dsb.y;
dsc.x = t1 + t2;
dsc.y = t2 -(dsc.x -t1);
return dsc;
}
#define split 8139.0
//乘法: float2 * float2
static inline __device__ float2 mul(float2 dsa, float2 dsb)
{
float2
dsc;
float a1, a2, b1, b2, cona, conb, c11, c21, c2, e, t1, t2;
cona = dsa.x * split;
conb
= dsb.x * split;
a1 = cona - ( cona - dsa.x);
b1 = conb - ( conb - dsb.x);
a2 = dsa.x - a1;
b2 = dsb.x - b1;
c11 = dsa.x * dsb.x ;
c21 = (((a1*b1 - c11)+a1*b2)+a2*b1)+a2*b2;
c2 = dsa.x * dsb.y + dsa.y * dsb.x;
t1 = c11 + c2;
e = t1 - c11;
t2 = ((c2-e) + (c11 - (t1 -e))) + c21 + dsa.y * dsb.y;
dsc.x = t1 + t2;
dsc.y
= t2 - ( dsc.x -t1);
return dsc;
}
//除法: float2 /
float2
static inline __device__ float2 div(float2 dsa, float2 dsb)
{
float2 dsc;
float a1, a2, b1, b2, cona, conb, c11, c2, c21, e, s1, s2;
float t1, t2, t11, t12, t21, t22;
s2 = 1.0f/dsb.x;
s1 = dsa.x * s2;
cona = s1 * split;
conb = dsb.x * split;
a1 = cona - (cona - s1);
b1 = conb - (conb - dsb.x);
a2 = s1 -a1;
b2 = dsb.x - b1;
c11 = s1 *dsb.x;
c21 = (((a1*b1 -c11)+a1*b2)+a2*b1)+a2*b2;
c2 = s1*dsb.y;
t1 = c11 + c2;
e = t1 - c11;
t2 = ((c2-e)+(c11-(t1-e))) + c21;
t12 = t1 + t2;
t22 = t2 - (t12 - t1);
t11 = dsa.x - t12;
e = t11 -dsa.x;
t21 = ((-t12-e)+(dsa.x - (t11-e))) + dsa.y - t22;
s2 *= (t11+t21);
dsc.x = s1 + s2;
dsc.y = s2 - ( dsc.x - s1);
return dsc;
}