算法描述:
(1). 先做自变量x的范围检查,不能出现负数和0. 自己使用时,如果能通过其它途径保证自变量为正,那么可以省略这两个判断,提高速度。
(2). 根据IEEE 754浮点数的格式,正的规格化浮点数可以写成 $x = 2^k \cdot m$(其中 $k$ 为整数,$1 \le m < 2$),则 $\ln(x)=k\ln(2)+\ln(m)$,可以通过位运算方便快速地获取k和m。
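(3). 把 ln(1+x) 和 ln(1-x) 在 x=0 处的泰勒级数相减,得 $\ln\frac{1+x}{1-x} = 2\left(x + \frac{x^3}{3} + \frac{x^5}{5} + \frac{x^7}{7} + \cdots\right)$(下面的代码截取到 $x^{17}$ 项)。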
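因为m的范围是[1, 2),不够接近1,如果直接令m=(1+x)/(1-x),那么x不够接近0,代入上面的泰勒级数,则精度不够高,所以要对m进行变换。假设用常数 $c$ 乘上m,并令 $cm=\frac{1+x}{1-x}$,则 $x=\frac{cm-1}{cm+1}\in\left[\frac{c-1}{c+1},\frac{2c-1}{2c+1}\right)$,且 $\ln(m)=\ln\frac{1+x}{1-x}-\ln(c)$。将这个区间记为L,令区间L关于原点对称,即 $\frac{c-1}{c+1}=-\frac{2c-1}{2c+1}$,
解得 $c=\frac{\sqrt{2}}{2}$(此时 $-\ln(c)=\frac{1}{2}\ln 2$)。但这个值并不完美,因为它不能使区间L长度达到最短(区间L的长度为 $\frac{2c}{(2c+1)(c+1)}$,仅当 $c\to 0$ 或 $c\to+\infty$ 时才趋于最短),相反地,它恰好使区间L的长度达到最大。感兴趣的还可以令 $c$=2/3或3/4试试效果。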
多项式求值采用秦九韶算法,同时还使用fmadd指令加速运算(融合乘加,intel _mm_fmadd_sd)
CSDN博客文章《计算机如何计算对数函数_【数值计算】求对数函数值,输入实数x>0,输出x对应的对数函数值ln(x)(使用双精度dou…》更详细地解释了如何利用IEEE 754浮点数的格式获取k和m。
标准库的算法可参考:glibc/sysdeps/ieee754/dbl-64/s_log1p.c at master · bminor/glibc · GitHub
最终的效果是,精度稍差于标准库,如果不对自变量为NAN的情况进行处理,速度稍快于标准库。
C++代码如下(使用了 constexpr 和 reinterpret_cast,需要按C++编译):
cpp
#include<math.h>
#include<stdio.h>
#include<string.h>
#include<time.h>
#include<immintrin.h>
// Define FMADD to request fused multiply-add (FMA) evaluation of the polynomial.
#define FMADD

// The FMA intrinsic can only be compiled when the target supports it:
// GCC/Clang define __FMA__ when FMA code generation is enabled (e.g. -mfma),
// while MSVC accepts the intrinsic regardless of the selected architecture.
// Anywhere else we silently fall back to the plain Horner scheme so the file
// still builds.
#if defined(FMADD) && (defined(__FMA__) || defined(_MSC_VER))
#define MYLN_USE_FMA 1
#else
#define MYLN_USE_FMA 0
#endif

constexpr double ln2 = 0.6931471805599453;      // ln(2)
constexpr double ln3_2 = 0.40546510810816438;   // ln(3/2), for the 2/3 scaling variant
constexpr double sqrt2_2 = 0.7071067811865475;  // sqrt(2)/2
constexpr unsigned long long x000F = 0x000FFFFFFFFFFFFF;  // mask of the 52 mantissa bits
constexpr unsigned long long x3FF0 = 0x3FF0000000000000;  // exponent bits of 1.0

// Coefficients 2/(2n+1) of ln((1+t)/(1-t)) = 2*(t + t^3/3 + t^5/5 + ...),
// stored in the low lane of an SSE register for the FMA path.
const __m128d c17 = _mm_set_sd(2.0 / 17.0);
const __m128d c15 = _mm_set_sd(2.0 / 15.0);
const __m128d c13 = _mm_set_sd(2.0 / 13.0);
const __m128d c11 = _mm_set_sd(2.0 / 11.0);
const __m128d c9 = _mm_set_sd(2.0 / 9.0);
const __m128d c7 = _mm_set_sd(2.0 / 7.0);
const __m128d c5 = _mm_set_sd(2.0 / 5.0);
const __m128d c3 = _mm_set_sd(2.0 / 3.0);
const __m128d c1 = _mm_set_sd(2.0);

/**
 * Natural logarithm of a double.
 *
 * Method: split x = 2^k * m with 1 <= m < 2 using the IEEE 754 bit layout,
 * scale m by sqrt(2)/2 so that t = (m' - 1) / (m' + 1) lies in an interval
 * symmetric about 0, then evaluate ln(m') = ln((1+t)/(1-t)) with the odd
 * Taylor series truncated after the t^17 term. Since ln(sqrt(2)/2) = -ln(2)/2,
 * the result is (k + 0.5) * ln(2) + series.
 *
 * Special cases:
 *   x < 0            -> NaN
 *   x == 0 (+0 / -0) -> -infinity
 *   x is NaN         -> NaN
 *   x == +infinity   -> +infinity
 *   subnormal x      -> rescaled by 2^52 before the exponent is extracted
 *
 * @param x  argument
 * @return   approximation of ln(x)
 */
inline double myln(double x) {
    constexpr unsigned long long exp_field_mask = 0x7FF;  // 11 exponent bits
    constexpr double two_pow_52 = 4503599627370496.0;     // 2^52

    if (x < 0) {
        return NAN;
    }
    if (x == 0) {
        return -INFINITY;
    }

    // memcpy is the well-defined way to view the bits of a double; casting the
    // pointer to an integer type violates the strict-aliasing rule.
    unsigned long long bits;
    memcpy(&bits, &x, sizeof bits);
    unsigned long long exp_field = (bits >> 52) & exp_field_mask;

    // All-ones exponent: +infinity, or a NaN of either sign (NaN fails both
    // comparisons above). ln maps each of them to itself.
    if (exp_field == exp_field_mask) {
        return x;
    }

    int k = 0;  // x = 2^k * m
    if (exp_field == 0) {
        // Subnormal: there is no implicit leading 1, so normalise first and
        // compensate in the exponent.
        x *= two_pow_52;
        memcpy(&bits, &x, sizeof bits);
        exp_field = (bits >> 52) & exp_field_mask;
        k = -52;
    }
    k += (int)exp_field - 1023;

    // Keep the mantissa and force the exponent of 1.0, giving m in [1, 2).
    bits = (bits & x000F) | x3FF0;
    double m;
    memcpy(&m, &bits, sizeof m);

    m *= sqrt2_2;
    // m *= 0.66666666666666666;  // alternative scaling by 2/3
    const double t = (m - 1.0) / (m + 1.0);
    const double t2 = t * t;

#if MYLN_USE_FMA
    // Horner evaluation in t^2 with fused multiply-add.
    const __m128d t2v = _mm_set_sd(t2);
    __m128d acc = c17;
    acc = _mm_fmadd_sd(acc, t2v, c15);
    acc = _mm_fmadd_sd(acc, t2v, c13);
    acc = _mm_fmadd_sd(acc, t2v, c11);
    acc = _mm_fmadd_sd(acc, t2v, c9);
    acc = _mm_fmadd_sd(acc, t2v, c7);
    acc = _mm_fmadd_sd(acc, t2v, c5);
    acc = _mm_fmadd_sd(acc, t2v, c3);
    acc = _mm_fmadd_sd(acc, t2v, c1);
    acc = _mm_mul_sd(acc, _mm_set_sd(t));
    const double series = _mm_cvtsd_f64(acc);
#else
    // Same Horner evaluation with ordinary multiply and add.
    double series = 2.0 / 17.0;
    series = series * t2 + 2.0 / 15.0;
    series = series * t2 + 2.0 / 13.0;
    series = series * t2 + 2.0 / 11.0;
    series = series * t2 + 2.0 / 9.0;
    series = series * t2 + 2.0 / 7.0;
    series = series * t2 + 2.0 / 5.0;
    series = series * t2 + 2.0 / 3.0;
    series = series * t2 + 2.0;
    series *= t;
#endif

    return (k + 0.5) * ln2 + series;
    // return k * ln2 + ln3_2 + series;  // use this when m was scaled by 2/3
}
/**
 * Driver: prints myln next to the library log for arguments 0.1, 0.2, ... below 3,
 * then times both functions over the same sweep of sum(f(x) / x).
 */
int main() {
    // Accuracy comparison against the standard library.
    printf("double, 精度测试\n");
    double arg = 0.1;
    while (arg < 3) {
        const double approx = myln(arg);
        const double reference = log(arg);
        printf("myln(%2.1f)=%18.16lf\n ln(%2.1f)=%18.16lf\n-------\n", arg, approx, arg, reference);
        arg += 0.1;
    }

    // Speed comparison: identical sweep for both implementations.
    printf("速度测试,编译器优化设为/O2\n");
    const double lower = 0.01;
    const double upper = 1000;
    const double step = 1e-6;

    clock_t began = clock();
    double total = 0;
    for (double v = lower; v < upper; v += step) {
        total += myln(v) / v;
    }
    double elapsed = (double)(clock() - began) / CLOCKS_PER_SEC;
    printf("sum=%lf, myln_Time: %fs\n", total, elapsed);

    began = clock();
    total = 0;
    for (double v = lower; v < upper; v += step) {
        total += log(v) / v;
    }
    elapsed = (double)(clock() - began) / CLOCKS_PER_SEC;
    printf("sum=%lf, ln_Time: %fs\n", total, elapsed);
}
运行结果如下:
cpp
double, 精度测试
myln(0.1)=-2.3025850929940455
ln(0.1)=-2.3025850929940455
-------
myln(0.2)=-1.6094379124341003
ln(0.2)=-1.6094379124341003
-------
myln(0.3)=-1.2039728043259359
ln(0.3)=-1.2039728043259359
-------
myln(0.4)=-0.9162907318741550
ln(0.4)=-0.9162907318741550
-------
myln(0.5)=-0.6931471805599451
ln(0.5)=-0.6931471805599453
-------
myln(0.6)=-0.5108256237659908
ln(0.6)=-0.5108256237659907
-------
myln(0.7)=-0.3566749439387326
ln(0.7)=-0.3566749439387324
-------
myln(0.8)=-0.2231435513142099
ln(0.8)=-0.2231435513142098
-------
myln(0.9)=-0.1053605156578265
ln(0.9)=-0.1053605156578264
-------
myln(1.0)=-0.0000000000000006
ln(1.0)=-0.0000000000000001
-------
myln(1.1)=0.0953101798043246
ln(1.1)=0.0953101798043247
-------
myln(1.2)=0.1823215567939545
ln(1.2)=0.1823215567939546
-------
myln(1.3)=0.2623642644674910
ln(1.3)=0.2623642644674911
-------
myln(1.4)=0.3364722366212130
ln(1.4)=0.3364722366212130
-------
myln(1.5)=0.4054651081081645
ln(1.5)=0.4054651081081646
-------
myln(1.6)=0.4700036292457357
ln(1.6)=0.4700036292457357
-------
myln(1.7)=0.5306282510621705
ln(1.7)=0.5306282510621706
-------
myln(1.8)=0.5877866649021191
ln(1.8)=0.5877866649021193
-------
myln(1.9)=0.6418538861723949
ln(1.9)=0.6418538861723950
-------
myln(2.0)=0.6931471805599456
ln(2.0)=0.6931471805599455
-------
myln(2.1)=0.7419373447293773
ln(2.1)=0.7419373447293776
-------
myln(2.2)=0.7884573603642703
ln(2.2)=0.7884573603642705
-------
myln(2.3)=0.8329091229351041
ln(2.3)=0.8329091229351043
-------
myln(2.4)=0.8754687373539001
ln(2.4)=0.8754687373539003
-------
myln(2.5)=0.9162907318741552
ln(2.5)=0.9162907318741554
-------
myln(2.6)=0.9555114450274366
ln(2.6)=0.9555114450274368
-------
myln(2.7)=0.9932517730102837
ln(2.7)=0.9932517730102838
-------
myln(2.8)=1.0296194171811583
ln(2.8)=1.0296194171811586
-------
myln(2.9)=1.0647107369924287
ln(2.9)=1.0647107369924287
-------
速度测试,编译器优化设为/O2
sum=13254515.057331, myln_Time: 2.645000s
sum=13254515.057331, ln_Time: 2.945000s