The code here follows this book: Machine Learning with R, 2nd Edition.pdf
I have uploaded the related resources, including the code, the data, and the book itself.
Download link (no points needed): https://download.csdn.net/download/waterHBO/89675687
1. First example: the code and the whole process come straight from the book.
```r
# Based on Chapter 3 of the book: KNN (K-Nearest Neighbors), page 88
# Print the current working directory
getwd()
# Change this to your own working folder,
# and put the code and the data files in that folder.
setwd("G:/RRRRR/8月18日-KNN")
# Install (once) and load the required packages
# install.packages("class")
# install.packages("gmodels")
library(class) # use knn()
library(gmodels) # use CrossTable()
# Measuring similarity with distance
# min-max normalization
# z-score standardization
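# A quick illustration of the two rescalings on a toy vector
# (my addition, not from the book, just to show what each one does):
x <- c(10, 20, 30, 40, 50)
(x - min(x)) / (max(x) - min(x))   # min-max: 0.00 0.25 0.50 0.75 1.00
(x - mean(x)) / sd(x)              # z-score: mean 0, standard deviation 1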
# Although k-NN classifiers may be considered lazy, they are still quite powerful.
# As you will soon see, the simple principles of nearest neighbor learning can be used to
# automate the process of screening for cancer.
# diagnosing breast cancer with KNN
# ------------ Step 1 --collecting data ------------
# The original data is available at:
# https://github.com/stedy/Machine-Learning-with-R-datasets/blob/master/wisc_bc_data.csv
# ------------ Step 2 -- exploring and preparing the data ------------
wbcd <- read.csv("wisc_bc_data.csv", stringsAsFactors = FALSE)
# Inspect the structure of every column
str(wbcd)
# Drop the first column (the id), which carries no useful information
wbcd <- wbcd[-1]
# Check the distribution of the target column
table(wbcd$diagnosis)
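# 357 benign (B) vs 212 malignant (M)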
# Recode the target as a factor with descriptive labels
wbcd$diagnosis <- factor(wbcd$diagnosis, levels = c("B", "M"),
                         labels = c("Benign", "Malignant"))
# Percentage of each class
round(prop.table(table(wbcd$diagnosis)) * 100, digits = 1)
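# roughly 62.7% benign and 37.3% malignant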
# Examine three of the features
summary(wbcd[c("radius_mean", "area_mean", "smoothness_mean")])
# The value ranges of these three columns differ greatly.
# Transformation -- normalizing numeric data
# Rescale the numeric values to a common range
# Define a min-max normalization function
normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
normalize(c(1, 2, 3, 4, 5))
normalize(c(10, 20, 30, 40, 50))
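# Both calls return 0.00 0.25 0.50 0.75 1.00:
# min-max output does not depend on the scale of the input.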
# Normalize all 30 numeric features with lapply()
wbcd_n <- as.data.frame(lapply(wbcd[2:31], normalize))
summary(wbcd_n$area_mean)
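# After min-max normalization every feature runs from exactly 0 to 1,
# so the summary should report Min. 0 and Max. 1.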
# Data preparation -- creating training and test datasets
# Split the data into a training set and a test set
wbcd_train <- wbcd_n[1:469, ]
wbcd_test <- wbcd_n[470:569, ]
wbcd_train_labels <- wbcd[1:469, 1]
wbcd_test_labels <- wbcd[470:569, 1]
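# As the book notes, this dataset is already randomly ordered,
# so taking consecutive blocks of rows is a fair split here.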
# ------------ Step 3 -- training a model on the data ------------
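# k = 21 is an odd number roughly equal to sqrt(469), the size of the
# training set; an odd k avoids tie votes in this two-class problem.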
wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test,
cl = wbcd_train_labels, k = 21)
# ------------ Step 4 -- evaluating model performance ------------
# Evaluate the model.
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred,
prop.chisq=FALSE)
# Accuracy here is (77 + 21) / 100 = 98%
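# The accuracy can also be computed directly instead of being read
# off the table (a small convenience I added, not used in the book):
mean(wbcd_test_pred == wbcd_test_labels)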
# ------------ Step 5 -- improving model performance ------------
# Two ways to try to improve the model:
# 1. use z-score standardization instead of min-max
# 2. try different values of k
# Transformation -- z-score standardization
wbcd_z <- as.data.frame(scale(wbcd[-1]))
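# scale() centers each column to mean 0 and rescales it to sd 1;
# unlike min-max, z-scores have no predefined minimum or maximum.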
summary(wbcd_z$area_mean)
# Split the data again.
wbcd_train <- wbcd_z[1:469, ]
wbcd_test <- wbcd_z[470:569, ]
wbcd_train_labels <- wbcd[1:469, 1]
wbcd_test_labels <- wbcd[470:569, 1]
# Train again
wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test,
cl = wbcd_train_labels, k = 21)
# Check the result
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred,
prop.chisq = FALSE)
# Still 98% here.
# Testing alternative values of k
# k = 5: 96%
wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test,
cl = wbcd_train_labels, k = 5)
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred,
prop.chisq = FALSE)
# k = 15: 98%
wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test,
cl = wbcd_train_labels, k = 15)
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred,
prop.chisq = FALSE)
# Conclusion: min-max normalization is the better choice here,
# and k is best kept above 15.
```
2. Second example: only a few changes are needed compared to the first.
```r
# Diagnosing Heart Failure with KNN
# The dataset is Heart Failure Clinical Records
# Source: https://archive.ics.uci.edu/dataset/519/heart+failure+clinical+records
# We keep using KNN for this task.
# Print the current working directory
getwd()
# Set the working directory
setwd("G:/RRRRR/8月18日-KNN")
# Load the packages
library(class) # use knn()
library(gmodels) # use CrossTable()
# ------------ Step 1 --collecting data ------------
# The original data is available at:
# https://archive.ics.uci.edu/dataset/519/heart+failure+clinical+records
# ------------ Step 2 -- exploring and preparing the data ------------
heart <- read.csv("heart_failure_clinical_records_dataset.csv", stringsAsFactors = FALSE)
# About this dataset:
# 1. 299 rows, i.e., 299 records.
# 2. 13 columns: 12 features, all numeric, with no missing values.
# 3. The target is the last column, DEATH_EVENT: whether the patient died.
# Inspect the structure of every column
str(heart)
# Check the distribution of the target column
table(heart$DEATH_EVENT)
# 203 vs 96, roughly 2:1
# Percentage of each class
round(prop.table(table(heart$DEATH_EVENT)) * 100, digits = 1)
# 67.9 32.1
# Examine three of the features
summary(heart[c("age", "platelets", "diabetes")])
# The value ranges of these three columns differ greatly
# (platelets run into the hundreds of thousands, diabetes is a 0/1 flag).
# Transformation -- normalizing numeric data
# Rescale the numeric values to a common range
# Define a min-max normalization function
normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
# Normalize the 12 feature columns (everything except DEATH_EVENT) with lapply()
heart_n <- as.data.frame(lapply(heart[1:12], normalize))
str(heart_n)
# Spot-check one column
summary(heart_n$age)
# Data preparation -- creating training and test datasets
# Split the data into a training set and a test set
heart_train <- heart_n[1:199, ]
heart_test <- heart_n[200:299, ] # a test set of exactly 100 rows makes accuracy easy to read off
heart_train_labels <- heart[1:199, 13] # column 13 is the last column, the label
heart_test_labels <- heart[200:299, 13]
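# Note: this split simply takes the rows in file order. If the CSV is not
# randomly ordered, a shuffled split is safer. A minimal sketch (my
# addition, left commented out so the results below stay reproducible):
# set.seed(123)
# idx <- sample(nrow(heart_n), 199)
# heart_train <- heart_n[idx, ];  heart_train_labels <- heart[idx, 13]
# heart_test  <- heart_n[-idx, ]; heart_test_labels  <- heart[-idx, 13]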
# heart_train_labels
# ------------ Step 3 -- training a model on the data ------------
# As a first try, set k to sqrt(299) ≈ 17
heart_test_pred <- knn(train = heart_train, test = heart_test,
cl = heart_train_labels, k = 17)
# ------------ Step 4 -- evaluating model performance ------------
# Evaluate the model.
CrossTable(x = heart_test_labels, y = heart_test_pred,
prop.chisq=FALSE)
# Find the four cells in the table: 86, 7, 6, 1.
# 86 and 1 are classified correctly, so accuracy is (86 + 1) / 100 = 87%.
# 7 and 6 are misclassified; equivalently, (100 - 7 - 6) / 100 = 87%.
# ------------ Step 5 -- improving model performance ------------
# Two ways to try to improve the model:
# 1. use z-score standardization instead of min-max
# 2. try different values of k
# Transformation -- z-score standardization
# [, 1:12] selects all rows and columns 1 through 12 as the feature columns
heart_z <- as.data.frame(scale(heart[, 1:12]))
str(heart_z)
summary(heart_z$age)
# Split the data again.
heart_train <- heart_z[1:199, ]
heart_test <- heart_z[200:299, ]
heart_train_labels <- heart[1:199, 13]
heart_test_labels <- heart[200:299, 13]
# Train again
heart_test_pred <- knn(train = heart_train, test = heart_test,
cl = heart_train_labels, k = 17)
# Check the result
CrossTable(x = heart_test_labels, y = heart_test_pred,
prop.chisq = FALSE)
# Accuracy is now (87 + 2) / 100 = 89%, a slight improvement.
# Testing alternative values of k
# k = 5: accuracy is (82 + 2) / 100 = 84%
heart_test_pred <- knn(train = heart_train, test = heart_test,
cl = heart_train_labels, k = 5)
CrossTable(x = heart_test_labels, y = heart_test_pred,
prop.chisq = FALSE)
# k = 10: accuracy is (88 + 4) / 100 = 92%
heart_test_pred <- knn(train = heart_train, test = heart_test,
cl = heart_train_labels, k = 10)
CrossTable(x = heart_test_labels, y = heart_test_pred,
prop.chisq = FALSE)
# k = 15: accuracy is (87 + 3) / 100 = 90%
heart_test_pred <- knn(train = heart_train, test = heart_test,
cl = heart_train_labels, k = 15)
CrossTable(x = heart_test_labels, y = heart_test_pred,
prop.chisq = FALSE)
# k = 20: accuracy is (88 + 2) / 100 = 90%
heart_test_pred <- knn(train = heart_train, test = heart_test,
cl = heart_train_labels, k = 20)
CrossTable(x = heart_test_labels, y = heart_test_pred,
prop.chisq = FALSE)
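# Rather than repeating the block for each k, the candidate values can be
# tried in a loop (a small sketch I added, not in the original write-up):
for (k in c(5, 10, 15, 17, 20)) {
  pred <- knn(train = heart_train, test = heart_test,
              cl = heart_train_labels, k = k)
  cat("k =", k, "accuracy =", mean(pred == heart_test_labels), "\n")
}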
# Conclusions:
# 1. z-score standardization works better here.
# 2. Of the k values tried, k = 10 gave the best accuracy (92%).
```