C语言自动进行独立样本 t 检验

源码:https://github.com/Illusionna/tiny-stats/blob/main/auto_ttest.c

data.txt 数据集样例如下。以 gender 为分组变量、label 为检验变量:程序先进行 Levene 方差齐性检验(以各组中位数为中心计算绝对离差),再根据检验结果选择 Student t 检验(方差齐;样本量较小时还需满足正态性)或 Welch t 检验(方差不齐):

data.txt(纯文本,列之间用空格或制表符分隔):
gender    b01    b11    label
女        是      有     5
男        是      有     5
男	     不是     有     4
女        是      有     3
女        是      有     5
auto_ttest.c(C 源码):
/*
    gcc -O2 auto_ttest.c -o ttest -lm
    ./ttest -i=data.txt -d=gender,label
*/


#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>


/* One sample group of the two-group comparison together with its summary statistics. */
typedef struct {
    double *samples;    /* heap buffer of observations; allocated in read_file(), freed in main() */
    int n;              /* number of observations currently stored in samples */
    double mean;        /* arithmetic mean, filled in by statistics() */
    double var;         /* variance with an (n - 1) denominator, filled in by statistics() */
} Group;


/* Result bundle filled in by ttest_levene() and ttest_independent(). */
typedef struct {
    double f;           /* F statistic returned by levene_f() */
    double f_p_value;   /* p-value attached to f by ttest_levene() */
    int equal_var;      /* 1 when f_p_value > alpha, otherwise 0; selects the branch taken in ttest_independent() */
    double t;           /* t statistic of the mean comparison */
    double df;          /* degrees of freedom used with the t distribution */
    double p;           /* two-sided p-value derived from t and df */
    double diff;        /* mean of group 1 minus mean of group 2 */
    double cohen;       /* mean difference divided by the pooled standard deviation */
    double pooled_var;  /* square of the pooled standard deviation computed in ttest_independent() */
    double ci_upper;    /* diff + t_critical * se_diff */
    double ci_lower;    /* diff - t_critical * se_diff */
} Indicator;


double random_normal(double mu, double sigma) {
    static double U;
    static double V;
    static int phase = 0;
    double Z;

    if (phase == 0) {
        U = (rand() + 1.0) / (RAND_MAX + 1.0);
        V = (rand() + 1.0) / (RAND_MAX + 1.0);
        Z = sqrt(-2.0 * log(U)) * sin(2.0 * M_PI * V);
    } else {
        Z = sqrt(-2.0 * log(U)) * cos(2.0 * M_PI * V);
    }
    phase = 1 - phase;

    return Z * sigma + mu;
}


/*
    qsort() comparator that orders doubles ascending.
    Returns 1 when *a is larger, -1 when *b is larger and 0 otherwise.
*/
int compare_number(const void *a, const void *b) {
    const double lhs = *(const double *)a;
    const double rhs = *(const double *)b;
    const double delta = lhs - rhs;

    if (delta > 0) {
        return 1;
    }
    if (delta < 0) {
        return -1;
    }
    return 0;
}


/*
    Median of the first `n` values of `data`.

    The input is left untouched: the values are copied into a scratch buffer,
    sorted with compare_number(), and the middle element (or the mean of the
    two middle elements when n is even) is returned.

    Returns NAN when `data` is NULL, when n <= 0, or when the scratch buffer
    cannot be allocated.
*/
double median(double *data, int n) {
    /* Without this guard n == 0 would read cache[-1] below. */
    if (data == NULL || n <= 0) return NAN;

    double *cache = malloc((size_t)n * sizeof *cache);
    if (cache == NULL) return NAN;

    memcpy(cache, data, (size_t)n * sizeof *cache);
    qsort(cache, (size_t)n, sizeof *cache, compare_number);

    const int mid = n / 2;
    const double result = (n % 2 == 0) ? (cache[mid - 1] + cache[mid]) / 2.0 : cache[mid];

    free(cache);
    return result;
}


/* Sum of the absolute deviations |x - center| over one group. */
static double levene_abs_dev_sum(const double *group, int n, double center) {
    double sum = 0.0;
    for (int i = 0; i < n; i++) sum = sum + fabs(group[i] - center);
    return sum;
}


/* Sum of squared distances between each |x - center| and the group's mean absolute deviation. */
static double levene_abs_dev_ss(const double *group, int n, double center, double mean_dev) {
    double ss = 0.0;
    for (int i = 0; i < n; i++) {
        const double d = fabs(group[i] - center) - mean_dev;
        ss = ss + d * d;
    }
    return ss;
}


/*
    F statistic of Levene's test for two groups, with absolute deviations
    taken around each group's median().

    The statistic is the one-way ANOVA F ratio of the absolute deviations:
    between-group mean square (1 degree of freedom) divided by the
    within-group mean square (n1 + n2 - 2 degrees of freedom).

    The deviations are recomputed on the fly instead of being stored, so the
    function itself performs no heap allocation.

    Returns NAN when a group pointer is NULL or a group is empty. The ratio
    itself is NAN or infinite when the within-group mean square is zero.
*/
double levene_f(double *group1, double *group2, int n1, int n2) {
    if (group1 == NULL || group2 == NULL || n1 <= 0 || n2 <= 0) return NAN;

    const int N = n1 + n2;
    const double mid1 = median(group1, n1);
    const double mid2 = median(group2, n2);

    const double sum_Z1 = levene_abs_dev_sum(group1, n1, mid1);
    const double sum_Z2 = levene_abs_dev_sum(group2, n2, mid2);
    const double mean_Z1 = sum_Z1 / n1;
    const double mean_Z2 = sum_Z2 / n2;
    const double grand_mean_Z = (sum_Z1 + sum_Z2) / N;

    const double gap1 = mean_Z1 - grand_mean_Z;
    const double gap2 = mean_Z2 - grand_mean_Z;
    const double ssb = n1 * gap1 * gap1 + n2 * gap2 * gap2;
    const double ssw = levene_abs_dev_ss(group1, n1, mid1, mean_Z1)
                     + levene_abs_dev_ss(group2, n2, mid2, mean_Z2);

    const double msb = ssb / (2 - 1);
    const double msw = ssw / (N - 2);
    return msb / msw;
}


double incbeta(double a, double b, double x) {
    static const double epsilon = 1.0e-30;
    static const double condition = 1.0e-12;

    if (x < 0.0 || x > 1.0) return INFINITY;
    if (x > (a + 1.0) / (a + b + 2.0)) return 1.0 - incbeta(b, a, 1.0 - x);

    double f = 1.0;
    double c = 1.0;
    double d = 0.0;
    const double lbeta_ab = lgamma(a) + lgamma(b) - lgamma(a + b);
    const double front = exp(a * log(x) + b * log(1.0 - x) - lbeta_ab) / a;

    for (int i = 0; i <= 200; ++i) {
        int m = i / 2;
        double numerator;

        if (i == 0) numerator = 1.0;
        else if (i % 2 == 0) numerator = (m * (b - m) * x) / ((a + 2.0 * m - 1.0) * (a + 2.0 * m));
        else numerator = - ((a + m) * (a + b + m) * x) / ((a + 2.0 * m) * (a + 2.0 * m + 1));

        d = 1.0 + numerator * d;
        if (fabs(d) < epsilon) d = epsilon;
        d = 1.0 / d;

        c = 1.0 + numerator / c;
        if (fabs(c) < epsilon) c = epsilon;

        double cd = c * d;
        f = f * cd;
        if (fabs(1.0 - cd) < condition) return front * (f - 1.0);
    }
    return INFINITY;
}


double ppf_normal_approx(double p) {
    if (p < 0.5) return -ppf_normal_approx(1.0 - p);

    double c0 = 2.515517;
    double c1 = 0.802853;
    double c2 = 0.010328;
    double d1 = 1.432788;
    double d2 = 0.189269;
    double d3 = 0.001308;

    double t = sqrt(-2.0 * log(1.0 - p));
    return t - ((c2 * t + c1) * t + c0) / (((d3 * t + d2) * t + d1) * t + 1.0);
}


/*
    Starting estimate of the Student t quantile for probability `p` and `df`
    degrees of freedom: the normal quantile from ppf_normal_approx() plus
    correction terms in 1/df, with an extra 1/df^2 term when df < 32.
    An infinite df returns the normal quantile unchanged.
*/
double cornish_fisher(double p, double df) {
    const double z = ppf_normal_approx(p);
    if (isinf(df)) {
        return z;
    }

    const double z_cubed = pow(z, 3);
    double estimate = z + (z_cubed + z) / (4.0 * df);

    if (df < 32) {
        estimate = estimate + (5.0 * pow(z, 5) + 16.0 * z_cubed + 3.0 * z) / (96.0 * df * df);
    }
    return estimate;
}


/*
    Cumulative distribution function of Student's t distribution with `df`
    degrees of freedom, evaluated as incbeta(df/2, df/2, x) with
    x = (t + sqrt(t^2 + df)) / (2 sqrt(t^2 + df)).

    Returns NAN when t or df is NAN or df <= 0, exactly 0.5 at t == 0, and
    the limits 0 / 1 for t == -INFINITY / +INFINITY. The infinite cases are
    handled explicitly because x would otherwise be inf/inf = NAN, which
    incbeta() answers with INFINITY; an infinite t is what callers pass when
    the standard error is zero.
*/
double cdf_student_t(double t, double df) {
    if (isnan(t) || isnan(df) || df <= 0.0) return NAN;
    if (t == 0.0) return 0.5;
    if (isinf(t)) return (t > 0.0) ? 1.0 : 0.0;

    const double cache = sqrt(t * t + df);
    return incbeta(df / 2.0, df / 2.0, (t + cache) / (2.0 * cache));
}


double pdf_student_t(double t, double df) {
    double lbeta_half = lgamma(df / 2.0) + lgamma(0.5) - lgamma((df + 1.0) / 2.0);
    return exp(-lbeta_half) * pow(1.0 + (t * t) / df, -(df + 1.0) / 2.0) / sqrt(df);
}


/*
    Quantile (inverse CDF) of Student's t distribution.

    Boundary cases: p <= 0 gives -INFINITY, p >= 1 gives INFINITY, p == 0.5
    gives 0 and df < 1 gives NAN (checked in that order). Otherwise the
    estimate from cornish_fisher() is refined with Newton-Raphson steps on
    cdf_student_t() / pdf_student_t() until a step is smaller than the
    tolerance, the density underflows, or the iteration budget is spent.
*/
double ppf_student_t(double p, double df) {
    const double tolerance = 1e-12;
    const int max_steps = 100;

    if (p <= 0.0) {
        return -INFINITY;
    }
    if (p >= 1.0) {
        return INFINITY;
    }
    if (p == 0.5) {
        return 0.0;
    }
    if (df < 1.0) {
        return NAN;
    }

    double estimate = cornish_fisher(p, df);

    for (int step = 0; step < max_steps; step++) {
        const double cdf = cdf_student_t(estimate, df);
        const double density = pdf_student_t(estimate, df);

        /* A vanishing density would blow up the Newton step. */
        if (density < 1e-100) {
            break;
        }

        const double correction = (cdf - p) / density;
        estimate = estimate - correction;
        if (fabs(correction) < tolerance) {
            break;
        }
    }

    return estimate;
}


/*
    Run Levene's test on two groups and store the F statistic and its
    p-value in `indicator` (fields f and f_p_value).

    The p-value is the upper tail of the F distribution with (1, n1 + n2 - 2)
    degrees of freedom, obtained from incbeta().

    A non-positive or NAN statistic is reported with p = 1.0. levene_f()
    yields NAN as 0/0 when every absolute deviation is identical, i.e. the
    data carry no evidence of unequal spread; passing that NAN on to
    incbeta() would make it return INFINITY as the "p-value".
*/
void ttest_levene(Indicator *indicator, double *g1, double *g2, int n1, int n2) {
    const double f = levene_f(g1, g2, n1, n2);
    indicator->f = f;

    if (isnan(f) || f <= 0) {
        indicator->f_p_value = 1.0;
        return;
    }

    const double df1 = 2.0 - 1.0;
    const double df2 = (double)n1 + (double)n2 - 2.0;
    const double x = df2 / (df2 + df1 * f);
    indicator->f_p_value = incbeta(df2 / 2.0, df1 / 2.0, x);
}


/*
    Independent two-sample t-test.

    `group1` and `group2` must already carry n, mean and var (see
    statistics()). Levene's test is run first; when its p-value exceeds
    `alpha` the pooled-variance (Student) test is used, otherwise Welch's
    test with Satterthwaite degrees of freedom.

    Everything is written to `indicator`:
      - t, df, p       : statistic, degrees of freedom, two-sided p-value
      - diff           : mean(group1) - mean(group2)
      - pooled_var     : pooled variance with n1 + n2 - 2 degrees of freedom
      - cohen          : diff divided by the pooled standard deviation
      - ci_lower/upper : (1 - alpha) confidence interval of diff

    The pooled variance always uses n1 + n2 - 2 as its denominator, and the
    confidence interval uses the same standard error as the t statistic of
    the selected test (pooled for Student, sqrt(s1^2/n1 + s2^2/n2) for Welch).
*/
void ttest_independent(Indicator *indicator, Group *group1, Group *group2, double alpha) {
    ttest_levene(indicator, group1->samples, group2->samples, group1->n, group2->n);
    indicator->equal_var = (indicator->f_p_value > alpha) ? 1 : 0;

    const double n1 = group1->n;
    const double n2 = group2->n;
    const double diff = group1->mean - group2->mean;

    /* Pooled variance is needed for Cohen's d in both branches. */
    const double pooled_df = n1 + n2 - 2.0;
    const double pooled_var = ((n1 - 1.0) * group1->var + (n2 - 1.0) * group2->var) / pooled_df;
    const double pooled_std = sqrt(pooled_var);

    double df;
    double se_diff;

    if (indicator->equal_var) {
        df = pooled_df;
        se_diff = sqrt(pooled_var * (1.0 / n1 + 1.0 / n2));
    } else {
        const double satterthwaite1 = group1->var / n1;
        const double satterthwaite2 = group2->var / n2;
        const double total = satterthwaite1 + satterthwaite2;
        const double denominator = (satterthwaite1 * satterthwaite1) / (n1 - 1.0)
                                 + (satterthwaite2 * satterthwaite2) / (n2 - 1.0);
        se_diff = sqrt(total);
        df = (total * total) / denominator;
    }

    const double t = diff / se_diff;
    const double cdf = cdf_student_t(t, df);
    const double p = 2.0 * (cdf > 0.5 ? 1.0 - cdf : cdf);
    const double t_critical = ppf_student_t(1.0 - alpha / 2.0, df);

    indicator->df = df;
    indicator->t = t;
    indicator->p = p;
    indicator->diff = diff;
    indicator->pooled_var = pooled_var;
    indicator->cohen = diff / pooled_std;
    indicator->ci_lower = diff - t_critical * se_diff;
    indicator->ci_upper = diff + t_critical * se_diff;
}


/*
    Paired-samples t-test on `n` matched observations.

    Works on the pairwise differences state1[i] - state2[i] and returns the
    two-sided p-value from a t distribution with n - 1 degrees of freedom.
    Returns NAN when fewer than two pairs are supplied.
*/
double ttest_paired(double *state1, double *state2, int n) {
    if (n < 2) {
        return NAN;
    }

    double total = 0.0;
    double total_squares = 0.0;
    int k = 0;
    while (k < n) {
        const double gap = state1[k] - state2[k];
        total = total + gap;
        total_squares = total_squares + gap * gap;
        ++k;
    }

    const double mean_diff = total / n;
    const double var_diff = (total_squares - (total * total / n)) / (n - 1);
    const double std_error = sqrt(var_diff / n);
    const double t = mean_diff / std_error;

    const double cdf = cdf_student_t(t, n - 1.0);
    const double tail = (cdf > 0.5) ? 1.0 - cdf : cdf;
    return 2.0 * tail;
}


/*
    Fill in group->mean and group->var from group->samples[0 .. n-1].
    The variance is the sum of squared deviations from the mean divided by
    (n - 1).
*/
void statistics(Group *group) {
    const int count = group->n;
    const double *values = group->samples;

    double total = 0.0;
    for (int i = 0; i < count; ++i) {
        total = total + values[i];
    }
    const double mean = total / count;

    double squared = 0.0;
    for (int i = 0; i < count; ++i) {
        squared = squared + (values[i] - mean) * (values[i] - mean);
    }

    group->var = squared / (count - 1);
    group->mean = mean;
}


/*
    Append `value` to a group's sample buffer, doubling the buffer when it is
    full. Returns 0 on success and -1 when the buffer cannot be grown, in
    which case the old buffer and count are left intact.
*/
static int read_file_push(Group *group, int *capacity, double value) {
    if (group->n >= *capacity) {
        const int wider = *capacity * 2;
        double *grown = realloc(group->samples, (size_t)wider * sizeof *grown);
        if (grown == NULL) return -1;
        group->samples = grown;
        *capacity = wider;
    }
    group->samples[group->n++] = value;
    return 0;
}


/*
    Close the dataset and terminate the program after a failed allocation.
    The process exits rather than returning a new status code because the
    caller in this file treats every status outside 1..4 as success.
*/
static void read_file_out_of_memory(FILE *f) {
    fclose(f);
    fprintf(stderr, "\x1b[31m[Error]\x1b[0m out of memory while reading the dataset.\n");
    exit(EXIT_FAILURE);
}


/*
    Load a whitespace-delimited table and split column `Y` into two groups
    according to the value found in column `X`.

    The first line is the header; `X` and `Y` are looked up in it by name.
    Every following non-blank line must have as many fields as the header.
    The first distinct value seen in column X becomes group 1, the second
    becomes group 2; their labels are stored (strdup'ed) in categories[0]
    and categories[1]. Values of column Y are converted with atof().

    Ownership: group1->samples, group2->samples and the category strings are
    heap-allocated here and must be freed by the caller. Both groups are
    reset to {samples = NULL, n = 0} before anything else happens, so the
    caller may free them after any return code.

    Returns
      0  success
      1  the file could not be opened
      2  X or Y is missing from the header (or the file is empty)
      3  a data line has a different number of fields than the header
      4  column X contains more than two distinct values
*/
int read_file(char *path, char *X, char *Y, Group *group1, Group *group2, char **categories) {
    group1->samples = NULL;
    group1->n = 0;
    group2->samples = NULL;
    group2->n = 0;

    FILE *f = fopen(path, "r");
    if (!f) return 1;

    char buffer[1024];
    const char *delimiters = " \t\r\n";
    int count = 0;
    int idx = -1;
    int idy = -1;

    if (fgets(buffer, sizeof(buffer), f)) {
        char *token = strtok(buffer, delimiters);
        int n;
        for (n = 0; token != NULL; n++, token = strtok(NULL, delimiters)) {
            if (strcmp(token, X) == 0) idx = n;
            if (strcmp(token, Y) == 0) idy = n;
        }
        count = n;
    }

    if (idx == -1 || idy == -1) {
        fclose(f);
        return 2;
    }

    int capacity1 = 64;
    int capacity2 = 64;
    group1->samples = malloc((size_t)capacity1 * sizeof *group1->samples);
    group2->samples = malloc((size_t)capacity2 * sizeof *group2->samples);
    if (group1->samples == NULL || group2->samples == NULL) read_file_out_of_memory(f);

    int classification = 0;
    int status = 0;

    while (fgets(buffer, sizeof(buffer), f)) {
        int n;
        char *x = NULL;
        char *y = NULL;
        char *token = strtok(buffer, delimiters);
        for (n = 0; token != NULL; n++, token = strtok(NULL, delimiters)) {
            if (n == idx) x = token;
            if (n == idy) y = token;
        }

        /* Blank lines (typically a trailing newline) carry no data. */
        if (n == 0) continue;

        /* n == count guarantees that both x and y were found on this line. */
        if (count != n) {
            status = 3;
            break;
        }

        int group_id = -1;
        for (int i = 0; i < classification; i++) {
            if (strcmp(x, categories[i]) == 0) group_id = i;
        }
        if (group_id == -1) {
            if (classification >= 2) {
                status = 4;
                break;
            }
            char *label = strdup(x);
            if (label == NULL) read_file_out_of_memory(f);
            categories[classification] = label;
            group_id = classification++;
        }

        Group *target = (group_id == 0) ? group1 : group2;
        int *capacity = (group_id == 0) ? &capacity1 : &capacity2;
        if (read_file_push(target, capacity, atof(y)) != 0) read_file_out_of_memory(f);
    }

    fclose(f);
    return status;
}


/*
    Compute the summary statistics of both groups, run the independent
    t-test at alpha = 0.05 and print a formatted report to stdout.

    X and Y are the column names shown in the report header; categories[0]
    and categories[1] are the labels of group1 and group2.

    When a category label is missing or a group holds fewer than two samples
    the test is undefined (the variance needs n - 1 >= 1), so an error is
    written to stderr and nothing is computed.
*/
void print_ttest(char *X, char *Y, char **categories, Group *group1, Group *group2) {
    double alpha = 0.05;
    Indicator indicator;

    if (categories[0] == NULL || categories[1] == NULL || group1->n < 2 || group2->n < 2) {
        fprintf(stderr, "\x1b[31m[Error]\x1b[0m \"%s\" must split the data into two groups with at least 2 samples each.\n", X);
        return;
    }

    statistics(group1);
    statistics(group2);
    ttest_independent(&indicator, group1, group2, alpha);

    printf("\x1b[1mvariable:\x1b[0m %s - %s | alpha = %lf\n", X, Y, alpha);
    printf("levene_f = %.12lf -> \x1b[1m\x1b[32mlevene_p = %.12lf\x1b[0m\n", indicator.f, indicator.f_p_value);
    if (indicator.equal_var == 1) printf("\x1b[1mStudent's t-test\x1b[0m (homogeneity of variance)\n");
    else printf("\x1b[1mWelch's t-test\x1b[0m (heterogeneity of variance)\n");
    printf("\x1b[1m\x1b[4m%-15s %-15s %-15s %-15s\x1b[0m\n", "class", "count", "mean", "variance");
    printf("%-15s %-15d %-15lf %-15lf\n", categories[0], group1->n, group1->mean, group1->var);
    printf("\x1b[4m%-15s %-15d %-15lf %-15lf\x1b[0m\n", categories[1], group2->n, group2->mean, group2->var);
    printf("\x1b[1mmean_difference =\x1b[0m    %lf\n", indicator.diff);
    printf("\x1b[1mpooled_var =\x1b[0m\t     %lf\n", indicator.pooled_var);
    printf("\x1b[1mdegree_freedom =\x1b[0m     %lf\n", indicator.df);
    printf("\x1b[1mt_value =\x1b[0m\t     %lf\n", indicator.t);
    printf("\x1b[1mp_value =\x1b[0m\t     \x1b[1m\x1b[32m%.12lf\x1b[0m\n", indicator.p);
    printf("\x1b[1mCohen's d =\x1b[0m\t     %lf\n", indicator.cohen);
    printf("\x1b[1m95%% CI =\x1b[0m       (%lf ~ %lf)\n", indicator.ci_lower, indicator.ci_upper);
}



/* Print the command-line synopsis to stderr. */
static void main_usage(const char *program) {
    fprintf(stderr, "\x1b[32m(Usage) >>>\x1b[0m %s -i=<filepath> -d=<dim1>,<dim2>\n", program);
}


/*
    Copy `src` into `dst` (capacity `size`) only when it fits together with
    its terminator. Returns 0 on success, -1 when `src` is too long.
*/
static int main_copy_bounded(char *dst, size_t size, const char *src) {
    const size_t length = strlen(src);
    if (length >= size) return -1;
    memcpy(dst, src, length + 1);
    return 0;
}


/*
    Entry point: ./ttest -i=<filepath> -d=<dim1>,<dim2>

    -i names the dataset file, -d names the grouping column and the measured
    column. Both options are required. Over-long option values are rejected
    instead of being copied past the end of the fixed-size buffers.

    Exit status: 0 when the report was printed, 1 on a usage error or when
    the dataset could not be read.
*/
int main(int argc, char *argv[], char *envs[]) {
    (void)envs;

    char path[256] = {0};
    char X[64] = {0};
    char Y[64] = {0};

    if (argc == 1) {
        main_usage(argv[0]);
        return 1;
    }

    for (int i = 1; i < argc; i++) {
        int too_long = 0;

        if (strncmp(argv[i], "-i=", 3) == 0) {
            too_long = main_copy_bounded(path, sizeof(path), argv[i] + 3) != 0;
        }
        else if (strncmp(argv[i], "-d=", 3) == 0) {
            char buffer[256];
            if (main_copy_bounded(buffer, sizeof(buffer), argv[i] + 3) != 0) {
                too_long = 1;
            } else {
                char *token = strtok(buffer, ",");
                if (token != NULL && main_copy_bounded(X, sizeof(X), token) != 0) too_long = 1;
                token = strtok(NULL, ",");
                if (token != NULL && main_copy_bounded(Y, sizeof(Y), token) != 0) too_long = 1;
            }
        }
        else {
            fprintf(stderr, "\x1b[31m[Error]\x1b[0m invalid parameter \"%s\"\n", argv[i]);
            main_usage(argv[0]);
            return 1;
        }

        if (too_long) {
            fprintf(stderr, "\x1b[31m[Error]\x1b[0m parameter \"%s\" is too long\n", argv[i]);
            return 1;
        }
    }

    /* Both options are mandatory; without this check an absent -i would hand
       an empty path to read_file() and an absent -d an empty column name. */
    if (path[0] == '\0' || X[0] == '\0' || Y[0] == '\0') {
        fprintf(stderr, "\x1b[31m[Error]\x1b[0m both -i=<filepath> and -d=<dim1>,<dim2> are required\n");
        main_usage(argv[0]);
        return 1;
    }

    /* Zero-initialised so that the free() calls below are valid even when
       read_file() returns before allocating anything. */
    char *categories[2] = {NULL, NULL};
    Group group1 = {0};
    Group group2 = {0};
    int exit_code = 1;

    int status = read_file(path, X, Y, &group1, &group2, categories);
    switch (status) {
        case 0:
            print_ttest(X, Y, categories, &group1, &group2);
            exit_code = 0;
            break;
        case 1:
            printf("\x1b[31m[Error]\x1b[0m can not find the dataset file.\n");
            break;
        case 2:
            printf("\x1b[31m[Error]\x1b[0m \"%s\" or \"%s\" is not in the table header.\n", X, Y);
            break;
        case 3:
            printf("\x1b[31m[Error]\x1b[0m data is not aligned.\n");
            break;
        case 4:
            printf("\x1b[31m[Error]\x1b[0m independent variable \"%s\" is not binary (more than 2 categories).\n", X);
            break;
        default:
            printf("\x1b[31m[Error]\x1b[0m failed to read the dataset (status %d).\n", status);
            break;
    }

    free(group1.samples);
    free(group2.samples);
    free(categories[0]);
    free(categories[1]);
    return exit_code;
}
相关推荐
qq_401700417 小时前
C语言 条件编译宏
c语言·开发语言
逑之7 小时前
C语言笔记5:函数
java·c语言·笔记
无限进步_7 小时前
【C语言&数据结构】对称二叉树:镜像世界的递归探索
c语言·开发语言·数据结构·c++·git·算法·visual studio
松涛和鸣8 小时前
49、智能电源箱项目技术栈解析
服务器·c语言·开发语言·http·html·php
凉、介8 小时前
SylixOS 中的 Unix Socket
服务器·c语言·笔记·学习·嵌入式·sylixos
X***07888 小时前
从底层逻辑到工程实践,深入理解C语言在计算机世界中的核心地位与持久价值
c语言·开发语言
智者知已应修善业9 小时前
【编写函数求表达式的值】2024-4-3
c语言·c++·经验分享·笔记·算法
HABuo9 小时前
【Linux进程(四)】进程切换&环境变量深入剖析
linux·运维·服务器·c语言·c++·ubuntu·centos
码农小韩11 小时前
基于Linux的C++学习——动态数组容器vector
linux·c语言·开发语言·数据结构·c++·单片机·学习