13.2 分组描述统计
sql
mysql> -- Descriptive statistics for score, one row per age: count, sum, range,
mysql> -- mean, sample standard deviation and sample variance.
mysql> -- ORDER BY states the row order explicitly; GROUP BY alone does not guarantee it.
mysql> SELECT
-> age,
-> COUNT(score) AS n,
-> SUM(score) AS sum,
-> MIN(score) AS minimum,
-> MAX(score) AS maximum,
-> AVG(score) AS mean,
-> STDDEV_SAMP(score) AS 'std. dev.',
-> VAR_SAMP(score) AS 'variance'
-> FROM testscore
-> GROUP BY age
-> ORDER BY age;
+-----+---+------+---------+---------+--------+--------------------+--------------------+
| age | n | sum | minimum | maximum | mean | std. dev. | variance |
+-----+---+------+---------+---------+--------+--------------------+--------------------+
| 5 | 4 | 22 | 4 | 7 | 5.5000 | 1.2909944487358056 | 1.6666666666666667 |
| 6 | 4 | 27 | 4 | 9 | 6.7500 | 2.217355782608345 | 4.916666666666667 |
| 7 | 4 | 30 | 6 | 9 | 7.5000 | 1.2909944487358056 | 1.6666666666666667 |
| 8 | 4 | 32 | 6 | 10 | 8.0000 | 1.8257418583505538 | 3.3333333333333335 |
| 9 | 4 | 35 | 7 | 10 | 8.7500 | 1.2583057392117918 | 1.583333333333334 |
+-----+---+------+---------+---------+--------+--------------------+--------------------+
5 rows in set (0.03 sec)
mysql> -- Descriptive statistics for score, one row per value of sex.
mysql> SELECT
-> sex,
-> COUNT(score) AS n,
-> SUM(score) AS sum,
-> MIN(score) AS minimum,
-> MAX(score) AS maximum,
-> AVG(score) AS mean,
-> STDDEV_SAMP(score) AS 'std. dev.',
-> VAR_SAMP(score) AS 'variance'
-> FROM testscore
-> GROUP BY sex;
+-----+----+------+---------+---------+--------+--------------------+--------------------+
| sex | n | sum | minimum | maximum | mean | std. dev. | variance |
+-----+----+------+---------+---------+--------+--------------------+--------------------+
| M | 10 | 71 | 4 | 9 | 7.1000 | 1.7919573407620817 | 3.2111111111111112 |
| F | 10 | 75 | 4 | 10 | 7.5000 | 1.9578900207451218 | 3.8333333333333335 |
+-----+----+------+---------+---------+--------+--------------------+--------------------+
2 rows in set (0.00 sec)
mysql> -- Descriptive statistics for score, one row per (age, sex) combination.
mysql> SELECT
-> age,
-> sex,
-> COUNT(score) AS n,
-> SUM(score) AS sum,
-> MIN(score) AS minimum,
-> MAX(score) AS maximum,
-> AVG(score) AS mean,
-> STDDEV_SAMP(score) AS 'std. dev.',
-> VAR_SAMP(score) AS 'variance'
-> FROM testscore
-> GROUP BY age, sex;
+-----+-----+---+------+---------+---------+--------+--------------------+----------+
| age | sex | n | sum | minimum | maximum | mean | std. dev. | variance |
+-----+-----+---+------+---------+---------+--------+--------------------+----------+
| 5 | M | 2 | 9 | 4 | 5 | 4.5000 | 0.7071067811865476 | 0.5 |
| 5 | F | 2 | 13 | 6 | 7 | 6.5000 | 0.7071067811865476 | 0.5 |
| 6 | M | 2 | 17 | 8 | 9 | 8.5000 | 0.7071067811865476 | 0.5 |
| 6 | F | 2 | 10 | 4 | 6 | 5.0000 | 1.4142135623730951 | 2 |
| 7 | M | 2 | 14 | 6 | 8 | 7.0000 | 1.4142135623730951 | 2 |
| 7 | F | 2 | 16 | 7 | 9 | 8.0000 | 1.4142135623730951 | 2 |
| 8 | M | 2 | 15 | 6 | 9 | 7.5000 | 2.1213203435596424 | 4.5 |
| 8 | F | 2 | 17 | 7 | 10 | 8.5000 | 2.1213203435596424 | 4.5 |
| 9 | M | 2 | 16 | 7 | 9 | 8.0000 | 1.4142135623730951 | 2 |
| 9 | F | 2 | 19 | 9 | 10 | 9.5000 | 0.7071067811865476 | 0.5 |
+-----+-----+---+------+---------+---------+--------+--------------------+----------+
10 rows in set (0.00 sec)
13.3 产生频率分布
sql
mysql> -- Frequency distribution: how many times each score occurs.
mysql> -- ORDER BY lists the scores in ascending order; GROUP BY alone leaves the order unspecified.
mysql> SELECT
-> score,
-> COUNT(score) AS occurrences
-> FROM testscore
-> GROUP BY score
-> ORDER BY score;
+-------+-------------+
| score | occurrences |
+-------+-------------+
| 4 | 2 |
| 5 | 1 |
| 6 | 4 |
| 7 | 4 |
| 8 | 2 |
| 9 | 5 |
| 10 | 2 |
+-------+-------------+
7 rows in set (0.00 sec)
mysql> -- Save the number of non-NULL scores in @n. SELECT ... INTO sets the variable
mysql> -- without the deprecated ":=" assignment inside a SELECT list.
mysql> SELECT COUNT(score) INTO @n FROM testscore;
Query OK, 1 row affected (0.01 sec)
mysql> -- Each score's share of all non-NULL scores, as a percentage, in ascending score order.
mysql> SELECT
-> score,
-> (COUNT(score) * 100) / @n AS percent
-> FROM testscore
-> GROUP BY score
-> ORDER BY score;
+-------+---------+
| score | percent |
+-------+---------+
| 4 | 10.0000 |
| 5 | 5.0000 |
| 6 | 20.0000 |
| 7 | 20.0000 |
| 8 | 10.0000 |
| 9 | 25.0000 |
| 10 | 10.0000 |
+-------+---------+
7 rows in set (0.00 sec)
mysql> -- Text histogram: one asterisk per occurrence of each score, in ascending score order.
mysql> SELECT
-> score,
-> REPEAT('*', COUNT(score)) AS occurrences
-> FROM testscore
-> GROUP BY score
-> ORDER BY score;
+-------+-------------+
| score | occurrences |
+-------+-------------+
| 4 | ** |
| 5 | * |
| 6 | **** |
| 7 | **** |
| 8 | ** |
| 9 | ***** |
| 10 | ** |
+-------+-------------+
7 rows in set (0.00 sec)
mysql> -- Save the number of non-NULL scores in @n. SELECT ... INTO sets the variable
mysql> -- without the deprecated ":=" assignment inside a SELECT list.
mysql> SELECT COUNT(score) INTO @n FROM testscore;
Query OK, 1 row affected (0.00 sec)
mysql> -- Text histogram of percentages: one asterisk per percentage point, in ascending score order.
mysql> SELECT
-> score,
-> REPEAT('*', (COUNT(score) * 100) / @n) AS percent
-> FROM testscore
-> GROUP BY score
-> ORDER BY score;
+-------+---------------------------+
| score | percent |
+-------+---------------------------+
| 4 | ********** |
| 5 | ***** |
| 6 | ******************** |
| 7 | ******************** |
| 8 | ********** |
| 9 | ************************* |
| 10 | ********** |
+-------+---------------------------+
7 rows in set (0.00 sec)
mysql> -- Reference table listing every possible score (0-10), so that scores which
mysql> -- never occur can still be reported by joining against it.
mysql> DROP TABLE IF EXISTS ref;
Query OK, 0 rows affected (0.03 sec)
mysql> -- The primary key keeps each reference score unique and non-NULL; a duplicate
mysql> -- row here would double the counts produced by a join on score.
mysql> CREATE TABLE ref (score INT NOT NULL PRIMARY KEY);
Query OK, 0 rows affected (0.04 sec)
mysql> INSERT INTO ref (score)
-> VALUES (0), (1), (2), (3), (4), (5), (6), (7), (8), (9), (10);
Query OK, 11 rows affected (0.02 sec)
Records: 11 Duplicates: 0 Warnings: 0
mysql> -- Frequency distribution over every reference score. The LEFT JOIN keeps
mysql> -- reference scores with no match, and COUNT(testscore.score) ignores the
mysql> -- NULLs produced for them, so those scores are reported with a count of 0.
mysql> SELECT
-> ref.score,
-> COUNT(testscore.score) AS occurrences
-> FROM ref
-> LEFT JOIN testscore ON ref.score = testscore.score
-> GROUP BY ref.score
-> ORDER BY ref.score;
+-------+-------------+
| score | occurrences |
+-------+-------------+
| 0 | 0 |
| 1 | 0 |
| 2 | 0 |
| 3 | 0 |
| 4 | 2 |
| 5 | 1 |
| 6 | 4 |
| 7 | 4 |
| 8 | 2 |
| 9 | 5 |
| 10 | 2 |
+-------+-------------+
11 rows in set (0.00 sec)
mysql> -- Percentage distribution over every reference score; scores that never occur show 0.
mysql> -- @n must already hold the total number of non-NULL scores, set earlier in this session.
mysql> SELECT
-> ref.score,
-> (COUNT(testscore.score) * 100) / @n AS percent
-> FROM ref
-> LEFT JOIN testscore ON ref.score = testscore.score
-> GROUP BY ref.score
-> ORDER BY ref.score;
+-------+---------+
| score | percent |
+-------+---------+
| 0 | 0.0000 |
| 1 | 0.0000 |
| 2 | 0.0000 |
| 3 | 0.0000 |
| 4 | 10.0000 |
| 5 | 5.0000 |
| 6 | 20.0000 |
| 7 | 20.0000 |
| 8 | 10.0000 |
| 9 | 25.0000 |
| 10 | 10.0000 |
+-------+---------+
11 rows in set (0.00 sec)
13.4 计数缺失值
sql
mysql> -- Create the table. subject identifies a test subject and is the primary key;
mysql> -- score is nullable so that NULL can represent a missing value.
mysql> CREATE TABLE subject_scores (
-> subject INT NOT NULL PRIMARY KEY,
-> score INT NULL
-> );
Query OK, 0 rows affected (0.06 sec)
mysql>
mysql> -- Insert sample data; subjects 2, 5 and 9 have a missing (NULL) score.
mysql> INSERT INTO subject_scores (subject, score) VALUES
-> (1, 38), (2, NULL), (3, 47),
-> (4, 82), (5, NULL), (6, 65),
-> (7, 90), (8, 73), (9, NULL),
-> (10, 55), (11, 68), (12, 79);
Query OK, 12 rows affected (0.01 sec)
Records: 12 Duplicates: 0 Warnings: 0
mysql>
mysql> -- Show the inserted rows.
mysql> SELECT subject, score FROM subject_scores ORDER BY subject;
+---------+-------+
| subject | score |
+---------+-------+
| 1 | 38 |
| 2 | NULL |
| 3 | 47 |
| 4 | 82 |
| 5 | NULL |
| 6 | 65 |
| 7 | 90 |
| 8 | 73 |
| 9 | NULL |
| 10 | 55 |
| 11 | 68 |
| 12 | 79 |
+---------+-------+
12 rows in set (0.00 sec)
mysql> -- COUNT(*) counts every row while COUNT(score) counts only rows whose score
mysql> -- is not NULL, so their difference is the number of missing scores.
mysql> SELECT COUNT(*) AS 'n (total)',
-> COUNT(score) AS 'n (nonmissing)',
-> COUNT(*) - COUNT(score) AS 'n (missing)',
-> ((COUNT(*) - COUNT(score)) * 100) / COUNT(*) AS '% missing'
-> FROM subject_scores;
+-----------+----------------+-------------+-----------+
| n (total) | n (nonmissing) | n (missing) | % missing |
+-----------+----------------+-------------+-----------+
| 12 | 9 | 3 | 25.0000 |
+-----------+----------------+-------------+-----------+
1 row in set (0.00 sec)
mysql> -- Alternative: ISNULL(score) is 1 for a NULL score and 0 otherwise, so summing
mysql> -- it counts the missing values directly.
mysql> -- SUM() over zero rows is NULL, so COALESCE reports 0 missing for an empty
mysql> -- table, matching what COUNT(*) - COUNT(score) returns in that case.
mysql> SELECT
-> COUNT(*) AS 'n (total)',
-> COUNT(score) AS 'n (nonmissing)',
-> COALESCE(SUM(ISNULL(score)), 0) AS 'n (missing)',
-> (SUM(ISNULL(score)) * 100) / COUNT(*) AS '% missing'
-> FROM subject_scores;
+-----------+----------------+-------------+-----------+
| n (total) | n (nonmissing) | n (missing) | % missing |
+-----------+----------------+-------------+-----------+
| 12 | 9 | 3 | 25.0000 |
+-----------+----------------+-------------+-----------+
1 row in set (0.00 sec)
如果需要按某个条件分组统计缺失值,首先要确定分组依据。例如,假设:
• 科目 1–6 属于 A 组
• 科目 7–12 属于 B 组
那么可以这样写:
sql
mysql> -- Missing-value counts per group of subjects: subjects 1-6 form group A and
mysql> -- subjects 7-12 form group B. Both ranges are spelled out so that a subject
mysql> -- outside 1-12 is reported as 'other' instead of being counted in group B.
mysql> SELECT
-> CASE
-> WHEN subject BETWEEN 1 AND 6 THEN 'A'
-> WHEN subject BETWEEN 7 AND 12 THEN 'B'
-> ELSE 'other'
-> END AS group_name,
-> COUNT(*) AS 'n (total)',
-> COUNT(score) AS 'n (nonmissing)',
-> COUNT(*) - COUNT(score) AS 'n (missing)',
-> ((COUNT(*) - COUNT(score)) * 100) / COUNT(*) AS '% missing'
-> FROM subject_scores
-> GROUP BY group_name
-> ORDER BY group_name;
+------------+-----------+----------------+-------------+-----------+
| group_name | n (total) | n (nonmissing) | n (missing) | % missing |
+------------+-----------+----------------+-------------+-----------+
| A | 6 | 4 | 2 | 33.3333 |
| B | 6 | 5 | 1 | 16.6667 |
+------------+-----------+----------------+-------------+-----------+
2 rows in set (0.00 sec)
13.5 计算线性回归和相关系数
sql
mysql> -- List the (age, score) pairs used as the x and y observations for the regression.
mysql> SELECT
-> age,
-> score
-> FROM testscore;
+-----+-------+
| age | score |
+-----+-------+
| 5 | 5 |
| 5 | 4 |
| 5 | 6 |
| 5 | 7 |
| 6 | 8 |
| 6 | 9 |
| 6 | 4 |
| 6 | 6 |
| 7 | 8 |
| 7 | 6 |
| 7 | 9 |
| 7 | 7 |
| 8 | 9 |
| 8 | 6 |
| 8 | 7 |
| 8 | 10 |
| 9 | 9 |
| 9 | 7 |
| 9 | 10 |
| 9 | 9 |
+-----+-------+
20 rows in set (0.00 sec)
mysql> -- Compute the quantities needed to regress score (y) on age (x): n, both means,
mysql> -- both sums, both sums of squares and the sum of cross products. Each value is
mysql> -- displayed and also saved in a user variable (@n, @meanx, ...) for later statements.
mysql> -- NOTE(review): the 8 warnings reported below are presumably MySQL's deprecation
mysql> -- warning for assigning user variables with := inside SELECT, one per assignment; confirm.
mysql> -- NOTE(review): n counts only non-NULL scores while the age aggregates use every
mysql> -- row, so this assumes testscore contains no NULL score; confirm.
mysql> SELECT
-> @n := COUNT(score) AS n,
-> @meanx := AVG(age) AS 'x mean',
-> @sumx := SUM(age) AS 'x sum',
-> @sumxx := SUM(age * age) AS 'x sum of squares',
-> @meany := AVG(score) AS 'y mean',
-> @sumy := SUM(score) AS 'y sum',
-> @sumyy := SUM(score * score) AS 'y sum of squares',
-> @sumxy := SUM(age * score) AS 'x*y sum'
-> FROM testscore\G
*************************** 1. row ***************************
n: 20
x mean: 7.0000
x sum: 140
x sum of squares: 1020
y mean: 7.3000
y sum: 146
y sum of squares: 1130
x*y sum: 1053
1 row in set, 8 warnings (0.00 sec)
mysql> -- Slope of the least-squares line, computed from the saved sums:
mysql> -- b = (n*Sxy - Sx*Sy) / (n*Sxx - Sx*Sx). The result is also stored in @b.
mysql> SELECT @b := (@n * @sumxy - @sumx * @sumy)
-> / (@n * @sumxx - @sumx * @sumx) AS slope;
+-------------+
| slope |
+-------------+
| 0.775000000 |
+-------------+
1 row in set, 1 warning (0.00 sec)
mysql> -- Intercept of the least-squares line: a = mean(y) - b * mean(x).
mysql> -- The result is also stored in @a.
mysql> SELECT
-> @a := (@meany - @b * @meanx) AS intercept;
+----------------------+
| intercept |
+----------------------+
| 1.875000000000000000 |
+----------------------+
1 row in set, 1 warning (0.00 sec)
mysql> -- Format the regression line as text from the stored slope (@b) and intercept (@a).
mysql> -- 'y = ' carries a space after the equals sign so the spacing matches 'x + '.
mysql> SELECT CONCAT('y = ', @b, 'x + ', @a) AS 'least-squares regression';
+-----------------------------------------+
| least-squares regression |
+-----------------------------------------+
| y = 0.775000000x + 1.875000000000000000 |
+-----------------------------------------+
1 row in set (0.00 sec)
mysql> -- Correlation coefficient between age (x) and score (y), from the saved sums:
mysql> -- r = (n*Sxy - Sx*Sy) / sqrt((n*Sxx - Sx*Sx) * (n*Syy - Sy*Sy)).
mysql> SELECT (@n * @sumxy - @sumx * @sumy)
-> / SQRT((@n * @sumxx - @sumx * @sumx) * (@n * @sumyy - @sumy * @sumy))
-> AS correlation;
+--------------------+
| correlation |
+--------------------+
| 0.6117362044219903 |
+--------------------+
1 row in set (0.00 sec)