[R] Review1 before Self-Project

This review mainly covers the following functions: factor, cut, recode, summary, describe, table, levels, as.factor, as.numeric, na_if, rbind.

Factor

R 复制代码

factor(x = character(), levels, labels = levels,
       exclude = NA, ordered = is.ordered(x), nmax = NA)



ordered(x = character(), ...)
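# 创建一个有序因子（注意：未指定 levels 时，水平按字母顺序排列，即 High < Low < Medium；
# 如需 Low < Medium < High，请显式传入 levels = c("Low", "Medium", "High")）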
ordered_vector <- ordered(c("Low", "Medium", "High"))



is.factor(x)
is.ordered(x)
# 检查对象是否为因子
is_factor <- is.factor(factor_vector)

# 检查对象是否为有序因子
is_ordered <- is.ordered(ordered_vector)



as.factor(x)
as.ordered(x)
# 将向量转换为因子
as_factor_vector <- as.factor(c("Male", "Female"))

# 将有序向量转换为有序因子
as_ordered_vector <- as.ordered(c("Low", "Medium", "High"))



addNA(x, ifany = FALSE)
# 在因子中添加缺失值水平
factor_with_na <- addNA(factor_vector)



.valid.factor(object)
# 检查对象是否为有效的因子
valid_factor <- .valid.factor(factor_vector)

注意：

在R中，一个有效的因子是指满足以下要求的因子对象（注意：.valid.factor 目前只检查 levels(object)，有效时返回 TRUE，否则返回一个描述问题的字符串）：

有限的水平数： 因子应该有一个有限的水平数，即水平的数量应该是有限的，而不是无穷大。
唯一性： 每个水平应该是唯一的，没有重复的水平。
有序性（如果是有序因子）： 如果因子是有序的，那么水平之间应该有一定的顺序关系。
缺失值： 因子中可以包含缺失值，通过NA表示。

cut

R 复制代码

cut(x, ...)

## Default S3 method:
cut(x, breaks, labels = NULL,
    include.lowest = FALSE, right = TRUE, dig.lab = 3,
    ordered_result = FALSE, ...)

Note: breaks is either a numeric vector of two or more unique cut points, or a single number (greater than or equal to 2) giving the number of intervals into which x is to be cut.

labels gives the labels for the levels of the resulting category. By default, labels are constructed using "(a,b]" interval notation.

dig.lab is the integer which is used when labels are not given. It determines the number of digits used in formatting the break numbers.

examples:

R 复制代码

Z <- stats::rnorm(10000)
table(cut(Z, breaks = -6:6))


#(-6,-5] (-5,-4] (-4,-3] (-3,-2] (-2,-1]  (-1,0] 
#     0       1      10     212    1377    3391 
#  (0,1]   (1,2]   (2,3]   (3,4]   (4,5]   (5,6] 
#   3442    1345     201      19       2       0

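or, with explicit breaks and custom labels (note: labels are only names and do not change the intervals; with the default right = TRUE, the breaks below actually produce (-Inf,1945], (1945,1965], (1965,1985], (1985,1997], (1997,Inf], so the label text should be checked against the breaks):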

R 复制代码

gfk_cleaned_eul$birthyear_cat <- cut(gfk_cleaned_eul$birthyear, breaks = c(-Inf, 1945, 1965, 1985, 1997, Inf), labels = c("(-Inf,1945]", "(1945,1964]", "(1965,1984]", "(1985,1996]", "(1997,Inf]"))

Recode (from the dplyr package)

R 复制代码

recode(.x, ..., .default = NULL, .missing = NULL)

recode_factor(.x, ..., .default = NULL, .missing = NULL, .ordered = FALSE)

like:

R 复制代码

coffeenew$gender <- recode(coffeenew$gender,"1"="male","2"="female")

mcs$math=recode(mcs$math,"1"="Strongly Disagree", "2"="Disagree","3"="Agree", "4"="Strongly Agree")

gfk_cleaned_eul$birthyear_cat = recode (gfk_cleaned_eul$birthyear_cat,"(-Inf,1945]"= "born in 1945 or before","(1945,1964]"="Boomers","(1965,1984]"="GenX","(1985,1996]"="Millenium","(1997,Inf]"="GenZ")

gfk_cleaned_eul$mmetal_3cat<-recode(gfk_cleaned_eul$mmetal_3cat,'1'="Like or like a lot",'2'="Like or like a lot",'3'="neither",'4'="dislike or dislike a lot",'5'="dislike or dislike a lot")

summary

R 复制代码

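## Generic function (base R)
summary(object, ...)

## S3 method for class 'data.frame' (used in the example below)
summary(object, maxsum = 7, digits, ...)

## Method from the FactoMineR package (e.g. for CA results), not base R
summary(object, nb.dec = 3, nbelements=10,
   ncp = 3, align.names=TRUE, file="", ...)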

Like:

R 复制代码

class(USArrests)
[1] "data.frame"
summary(USArrests)

R 复制代码

     Murder          Assault     
 Min.   : 0.800   Min.   : 45.0  
 1st Qu.: 4.075   1st Qu.:109.0  
 Median : 7.250   Median :159.0  
 Mean   : 7.788   Mean   :170.8  
 3rd Qu.:11.250   3rd Qu.:249.0  
 Max.   :17.400   Max.   :337.0  
    UrbanPop          Rape      
 Min.   :32.00   Min.   : 7.30  
 1st Qu.:54.50   1st Qu.:15.07  
 Median :66.00   Median :20.10  
 Mean   :65.54   Mean   :21.23  
 3rd Qu.:77.75   3rd Qu.:26.18  
 Max.   :91.00   Max.   :46.00

Describe (from the psych package):

R 复制代码

describe(UCBAdmissions)

describe(x, na.rm = TRUE, interp=FALSE,skew = TRUE, ranges = TRUE,trim=.1,
              type=3,check=TRUE,fast=NULL,quant=NULL,IQR=FALSE,omit=FALSE,data=NULL)
describeData(x,head=4,tail=4)
describeFast(x)

| Argument | Description |
|----------|-------------|
| x | A data frame or matrix |
| na.rm | The default is to delete missing data. na.rm=FALSE will delete the case. |
| interp | Should the median be standard or interpolated |
| skew | Should the skew and kurtosis be calculated? |
| ranges | Should the range be calculated? |
| trim | trim=.1 -- trim means by dropping the top and bottom trim fraction |
| type | Which estimate of skew and kurtosis should be used? (See details.) |
| check | Should we check for non-numeric variables? Slower but helpful. |
| fast | if TRUE, will do n, means, sds, min, max, ranges for an improvement in speed. If NULL, will switch to fast mode for large (ncol * nrow > 10^7) problems, otherwise defaults to fast = FALSE |
| quant | if not NULL, will find the specified quantiles (e.g. quant=c(.25,.75) will find the 25th and 75th percentiles) |
| IQR | If TRUE, show the interquartile range |
| omit | Do not convert non-numerical variables to numeric, omit them instead |
| head | show the first 1:head cases for each variable in describeData |
| tail | Show the last nobs-tail cases for each variable in describeData |
| data | Allows formula input for specific grouping variables |

R 复制代码

> describe(USArrests)
         vars  n   mean    sd median trimmed
Murder      1 50   7.79  4.36   7.25    7.53
Assault     2 50 170.76 83.34 159.00  168.48
UrbanPop    3 50  65.54 14.47  66.00   65.88
Rape        4 50  21.23  9.37  20.10   20.36
            mad  min   max range  skew kurtosis
Murder     5.41  0.8  17.4  16.6  0.37    -0.95
Assault  110.45 45.0 337.0 292.0  0.22    -1.15
UrbanPop  17.79 32.0  91.0  59.0 -0.21    -0.87
Rape       8.60  7.3  46.0  38.7  0.75     0.08
            se
Murder    0.62
Assault  11.79
UrbanPop  2.05
Rape      1.32

table

R 复制代码

table(...,
      exclude = if (useNA == "no") c(NA, NaN),
      useNA = c("no", "ifany", "always"),
      dnn = list.names(...), deparse.level = 1)

as.table(x, ...)
is.table(x)

## S3 method for class 'table'
as.data.frame(x, row.names = NULL, ...,
              responseName = "Freq", stringsAsFactors = TRUE,
              sep = "", base = list(LETTERS))

| Argument | Description |
|----------|-------------|
| ... | one or more objects which can be interpreted as factors (including numbers or character strings), or a list (such as a data frame) whose components can be so interpreted. (For as.table, arguments passed to specific methods; for as.data.frame, unused.) |
| exclude | levels to remove for all factors in .... If it does not contain NA and useNA is not specified, it implies useNA = "ifany". See 'Details' for its interpretation for non-factor arguments. |
| useNA | whether to include NA values in the table. See 'Details'. Can be abbreviated. |
| dnn | the names to be given to the dimensions in the result (the dimnames names). |
| deparse.level | controls how the default dnn is constructed. See 'Details'. |
| x | an arbitrary R object, or an object inheriting from class "table" for the as.data.frame method. Note that as.data.frame.table(x, *) may be called explicitly for non-table x for "reshaping" arrays. |
| row.names | a character vector giving the row names for the data frame. |
| responseName | The name to be used for the column of table entries, usually counts. |
| stringsAsFactors | logical: should the classifying factors be returned as factors (the default) or character vectors? |
| sep, base | passed to provideDimnames. |

Levels:

复制代码

levels(x)
levels(x) <- value

| Argument | Description |
|----------|-------------|
| x | an object, for example a factor. |
| value | A valid value for levels(x). For the default method, NULL or a character vector. For the factor method, a vector of character strings with length at least the number of levels of x, or a named list specifying how to rename the levels. |

like:

R 复制代码

> table(UCBAdmissions)
UCBAdmissions
  8  17  19  22  24  53  89  94 120 131 138 202 
  1   1   1   1   1   1   1   1   1   1   2   1 
205 207 244 279 299 313 317 351 353 391 512 
  1   1   1   1   1   1   1   1   1   1   1 
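# This is a mistake: UCBAdmissions is already a contingency table (an array of counts),
# so table() tabulates how often each count value occurs instead of cross-classifying raw data.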

R 复制代码

x <- gl(2, 4, 8)
levels(x)[1] <- "low"
levels(x)[2] <- "high"
x

R 复制代码

## combine some levels
z <- gl(3, 2, 12, labels = c("apple", "salad", "orange"))
z
levels(z) <- c("fruit", "veg", "fruit")
z

这里通过函数 gl(3, 2, 12, labels = ...) 生成的因子向量 z 是一个长度为12的因子向量：共有 "apple"、"salad"、"orange" 三个水平，每个水平连续重复2次，这一模式循环直到总长度达到12，因此每个水平各出现4次。

R 复制代码

> z
 [1] apple  apple  salad  salad  orange orange
 [7] apple  apple  salad  salad  orange orange
Levels: apple salad orange

然后通过levels函数将三个标签转化成"fruit", "veg", "fruit"，起到合并的效果

R 复制代码

> levels(z) <- c("fruit", "veg", "fruit")
> z
 [1] fruit fruit veg   veg   fruit fruit fruit
 [8] fruit veg   veg   fruit fruit
Levels: fruit veg

或者直接调用 levels(z)（不赋值），返回当前水平标签组成的字符向量：

R 复制代码

> levels(z)
[1] "fruit" "veg"