R语言详细教程
目录
- R语言简介
- 安装与环境配置
- 基础语法和数据类型
- 数据结构
- 数据导入与导出
- 数据清洗与处理
- 数据可视化
- 统计分析
- 函数编程
- 高级应用
1. R语言简介
什么是R语言?
R是一种开源的编程语言和软件环境,专门用于统计分析、数据挖掘和可视化。它由Ross Ihaka和Robert Gentleman于1993年在新西兰奥克兰大学创建。
R的特点
- 免费开源
- 强大的统计分析能力
- 丰富的可视化功能
- 活跃的社区支持
- 大量的扩展包
2. 安装与环境配置
安装R
- 访问CRAN官网:https://cran.r-project.org/
- 选择对应操作系统的版本下载安装
安装RStudio(推荐)
RStudio是R的集成开发环境,提供更好的用户体验。
- 访问Posit官网:https://posit.co/downloads/
- 选择对应系统的版本下载安装RStudio Desktop
基础设置
进入RStudio后,新建一个R脚本(R Script)或R Notebook。可以在下方的控制台中输入代码进行一次性运算,也可以在脚本区编写多行代码完成较复杂的运算。运行代码时,可以选择单行执行、多行(选中部分)执行,或按代码块执行。
测试代码如下:
# Show the version information of the running R session.
version

# Set the working directory, then print it to confirm the change.
# NOTE: "D:/my_project" is an example path; setwd() raises an error when the
# directory does not exist, so replace it with a folder on your machine.
setwd("D:/my_project")
getwd()

# Install a package (needed once) and attach it (needed in every session).
# "package_name" is a placeholder; substitute the package you actually need.
install.packages("package_name")
library(package_name)
3. 基础语法和数据类型
基本运算
# Arithmetic operators.
1 + 2    # addition
5 - 3    # subtraction
3 * 4    # multiplication
10 / 2   # division
2 ^ 3    # exponentiation
5 %% 2   # modulo (remainder of the division)

# Comparison operators; each expression evaluates to TRUE or FALSE.
3 > 2
3 <= 2
5 == 5
5 != 4
变量赋值
# Assignment. `<-` is the conventional operator; `=` is shown because it is
# also accepted for top-level assignment.
x <- 10
y = 20
name <- "张三"

# Display a value explicitly with print(), or implicitly by typing its name.
print(x)
x

# Remove a variable from the workspace.
rm(x)
数据类型
# Each pair creates a value of one basic type and asks class() for its type.

# Numeric (double precision).
num <- 3.14
class(num)

# Integer: the `L` suffix marks an integer literal.
int <- 5L
class(int)

# Character string.
char <- "Hello R"
class(char)

# Logical.
bool <- TRUE
class(bool)

# Complex number.
comp <- 3 + 2i
class(comp)
数据类型转换
# Explicit type conversion with the as.*() family.
as.numeric("123")    # character -> numeric
as.character(123)    # numeric   -> character
as.logical(1)        # numeric   -> logical (non-zero becomes TRUE)
as.integer(3.14)     # numeric   -> integer (fraction is dropped)
4. 数据结构
向量(Vector)
# Creating vectors.
v1 <- c(1, 2, 3, 4, 5)       # combine explicit values
v2 <- 1:5                    # integer sequence
v3 <- seq(1, 10, by = 2)     # sequence with a step
v4 <- rep(1, 5)              # repeat a value

# Vectorised operations work element by element or on the whole vector.
v1 + v2
v1 * 2
sum(v1)
mean(v1)
length(v1)

# Indexing (R indices start at 1).
v1[1]              # single element
v1[2:4]            # range of positions
v1[c(1, 3, 5)]     # selected positions
v1[v1 > 3]         # logical condition
矩阵(Matrix)
# Creating matrices. Values fill column by column unless byrow = TRUE.
m1 <- matrix(1:9, nrow = 3, ncol = 3)
m2 <- matrix(1:6, nrow = 2, byrow = TRUE)

# Matrix operations.
m1 + m1      # element-wise addition
m1 * 2       # scalar multiplication
t(m1)        # transpose
m1 %*% m1    # matrix multiplication

# Indexing: [row, column]; an empty position selects everything.
m1[1, 2]     # element in row 1, column 2
m1[1, ]      # first row
m1[, 2]      # second column
数据框(Data Frame)
# Build a data frame from equal-length column vectors.
df <- data.frame(
  name = c("Alice", "Bob", "Charlie"),
  age = c(25, 30, 35),
  salary = c(50000, 60000, 70000),
  stringsAsFactors = FALSE
)

# Inspect the data frame.
str(df)        # structure: column types and a preview
summary(df)    # per-column summary statistics
head(df)       # first rows
names(df)      # column names

# Access parts of the data frame.
df$name               # one column by name
df[1, ]               # first row
df[, 2]               # second column (returned as a vector)
df[df$age > 25, ]     # rows matching a condition
列表(List)
# A list can hold elements of different types, including other lists.
my_list <- list(
  name = "John",
  scores = c(85, 92, 78),
  matrix = matrix(1:4, nrow = 2),
  details = list(age = 25, city = "Beijing")
)

# Access list elements.
my_list$name            # by name with $
my_list[[2]]            # by position
my_list[["scores"]]     # by name with [[ ]]
因子(Factor)
# Unordered factor: a categorical variable with a fixed set of levels.
gender <- factor(c("Male", "Female", "Male", "Female"))
levels(gender)    # the distinct categories
table(gender)     # counts per category

# Ordered factor: `levels` lists the categories from lowest to highest.
grade <- factor(
  c("A", "B", "C", "A"),
  levels = c("C", "B", "A"),
  ordered = TRUE
)
5. 数据导入与导出
读取CSV文件
# Simplest form: read a comma-separated file into a data frame.
data <- read.csv("data.csv")

# The same read with common options spelled out:
#   header           - the first line holds the column names
#   stringsAsFactors - keep text columns as character vectors
#   na.strings       - treat empty cells and "NA" as missing values
data <- read.csv(
  "data.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  na.strings = c("", "NA")
)

# Write the data frame back to disk without the row-name column.
write.csv(data, "output.csv", row.names = FALSE)
读取Excel文件
# Read the first worksheet of an Excel workbook (readxl package).
library(readxl)
data <- read_excel("data.xlsx", sheet = 1)

# Write a data frame to an Excel workbook (openxlsx package).
library(openxlsx)
write.xlsx(data, "output.xlsx")
其他格式
# Read a delimited text file whose first line holds the column names.
data <- read.table("data.txt", header = TRUE)

# Read an SPSS file as a data frame (foreign package).
library(foreign)
data <- read.spss("data.sav", to.data.frame = TRUE)

# Save an object in R's own format and load it back later.
save(data, file = "mydata.RData")
load("mydata.RData")
6. 数据清洗与处理
处理缺失值
# Detect missing values.
is.na(data)             # logical matrix marking every missing cell
sum(is.na(data))        # total number of missing cells
colSums(is.na(data))    # number of missing cells per column

# Option 1: drop every row that contains at least one missing value.
data_clean <- na.omit(data)

# Option 2: replace the missing entries of one column with that column's
# mean. `column` is a placeholder for a real column name.
data$column[is.na(data$column)] <- mean(data$column, na.rm = TRUE)
数据转换
# Convert column types.
data$column <- as.numeric(data$column)
data$category <- as.factor(data$category)

# Derive a new column from two existing ones.
data$new_var <- data$var1 + data$var2

# Recode a numeric column into labels; ifelse() works element by element.
# Note that this overwrites the `category` column converted above.
data$category <- ifelse(data$score > 60, "Pass", "Fail")
数据筛选和排序
# Base R: keep the rows that satisfy both conditions.
# which() turns the logical mask into row positions and skips NA, so rows with
# a missing age or salary are dropped instead of coming back as all-NA rows.
# This matches what dplyr::filter() does below.
subset_data <- data[which(data$age > 25 & data$salary > 50000), ]

# Base R: sort rows by salary, highest first.
sorted_data <- data[order(data$salary, decreasing = TRUE), ]

# dplyr: the same filter, sort and column selection as one pipeline.
library(dplyr)

filtered_data <- data %>%
  filter(age > 25, salary > 50000) %>%
  arrange(desc(salary)) %>%
  select(name, age, salary)
数据聚合
# Base R: mean salary per department using the formula interface.
aggregate(salary ~ department, data = data, mean)

# dplyr: several summaries per department/gender combination.
# .groups = "drop" returns an ungrouped result; without it the output stays
# grouped by `department` and summarise() prints a message about it.
summary_data <- data %>%
  group_by(department, gender) %>%
  summarise(
    avg_salary = mean(salary),
    count = n(),
    max_age = max(age),
    .groups = "drop"
  )
7. 数据可视化
基础绘图
# Scatter plot of salary against age (pch = 16 draws filled circles).
plot(
  data$age, data$salary,
  main = "年龄与工资关系",
  xlab = "年龄",
  ylab = "工资",
  col = "blue",
  pch = 16
)

# Histogram of the salary distribution.
hist(
  data$salary,
  main = "工资分布",
  xlab = "工资",
  col = "lightblue"
)

# Box plot of salary for each department.
boxplot(
  salary ~ department,
  data = data,
  main = "各部门工资分布",
  col = "lightgreen"
)
使用ggplot2包
library(ggplot2)

# Scatter plot: salary against age, coloured by department.
ggplot(data, aes(x = age, y = salary, color = department)) +
  geom_point() +
  labs(title = "年龄与工资关系", x = "年龄", y = "工资") +
  theme_minimal()

# Bar chart: row counts per department, bars side by side for each gender.
ggplot(data, aes(x = department, fill = gender)) +
  geom_bar(position = "dodge") +
  labs(title = "各部门性别分布")

# Line chart built from a small example data set.
time_series <- data.frame(
  month = 1:12,
  sales = c(100, 120, 130, 150, 140, 160, 170, 180, 190, 200, 210, 220)
)

# Line thickness is set with `linewidth`; using `size` for lines is
# deprecated since ggplot2 3.4.0. Point size still uses `size`.
ggplot(time_series, aes(x = month, y = sales)) +
  geom_line(color = "blue", linewidth = 1) +
  geom_point(color = "red", size = 2) +
  labs(title = "月度销售趋势")
8. 统计分析
描述性统计
# Per-column summary from base R.
summary(data)

# More detailed descriptive statistics (psych package).
library(psych)
describe(data)

# Correlation between two variables, then a correlation matrix for several.
cor(data$age, data$salary)
cor_matrix <- cor(data[, c("age", "salary", "experience")])
假设检验
# Two-sample t-test: compare salary between the two gender groups.
t.test(salary ~ gender, data = data)

# One-way ANOVA: compare mean salary across departments.
anova_result <- aov(salary ~ department, data = data)
summary(anova_result)

# Chi-squared test of independence between department and gender.
chisq.test(data$department, data$gender)
回归分析
# Linear regression of salary on three predictors.
model <- lm(salary ~ age + experience + education, data = data)
summary(model)

# Logistic regression: family = binomial fits a binary outcome.
logit_model <- glm(
  promoted ~ age + performance,
  data = data,
  family = binomial
)
summary(logit_model)

# Diagnostic plots for the linear model.
plot(model)
9. 函数编程
创建函数
# Compute the body mass index: weight divided by the square of height.
# For the standard BMI, weight is given in kilograms and height in metres.
# Works element-wise when vectors are supplied.
calculate_bmi <- function(weight, height) {
  weight / height^2
}

# Build a greeting. `name` defaults to "朋友" when the argument is omitted.
# paste() joins the two parts with a single space.
greet <- function(name = "朋友") {
  paste("你好,", name)
}

# Return several statistics of a numeric vector at once as a named list:
# mean, standard deviation (sd) and number of elements (n).
summary_stats <- function(x) {
  list(
    mean = mean(x),
    sd = sd(x),
    n = length(x)
  )
}
控制结构
# Map a single numeric score to a letter grade.
# Thresholds are inclusive: 90 and above is "A", 80 and above is "B",
# 70 and above is "C", and everything below 70 is "F".
# `score` must be one non-missing number because if() needs a scalar condition.
grade <- function(score) {
  if (score >= 90) {
    "A"
  } else if (score >= 80) {
    "B"
  } else if (score >= 70) {
    "C"
  } else {
    "F"
  }
}

# for loop: run the body once for each value in the sequence.
for (i in 1:5) {
  print(paste("这是第", i, "次循环"))
}

# while loop: repeat as long as the condition holds. The counter must change
# inside the body, otherwise the loop never ends.
count <- 1
while (count <= 5) {
  print(count)
  count <- count + 1
}
应用函数
# apply(): run a function over the rows (1) or columns (2) of a matrix.
matrix_data <- matrix(1:12, nrow = 3)

apply(matrix_data, 1, sum)     # sum of each row
apply(matrix_data, 2, mean)    # mean of each column

# lapply() always returns a list; sapply() simplifies the result to a vector
# when it can.
my_list <- list(a = 1:5, b = 6:10)
lapply(my_list, mean)
sapply(my_list, mean)
10. 高级应用
数据重塑
library(tidyr)

# Wide -> long: stack the month columns into name/value pairs.
long_data <- pivot_longer(
  data,
  cols = c("jan", "feb", "mar"),
  names_to = "month",
  values_to = "sales"
)

# Long -> wide: spread the name/value pairs back into one column per month.
wide_data <- pivot_wider(
  long_data,
  names_from = "month",
  values_from = "sales"
)
字符串处理
library(stringr)

str_length("Hello")                          # number of characters
str_to_upper("hello")                        # convert to upper case
str_replace("Hello World", "World", "R")     # replace the first match
str_split("a,b,c", ",")                      # split on a separator
str_detect(c("apple", "banana"), "app")      # does each string match?
日期处理
library(lubridate)

# Current date, and parsing: the function name states the order of the parts.
today()
ymd("2023-12-25")              # year-month-day
mdy("December 25, 2023")       # month-day-year

# Extract components of a date.
year(today())
month(today(), label = TRUE)   # label = TRUE returns the month name

# Length of the span between two dates, expressed in days.
date1 <- ymd("2023-01-01")
date2 <- ymd("2023-12-31")
interval(date1, date2) / days(1)
性能优化
# Vectorised version: one call processes the whole vector.
system.time({
  result <- sqrt(1:1000000)
})

# Loop version of the same computation, timed for comparison. The result
# vector is allocated up front and filled one element at a time.
system.time({
  result <- numeric(1000000)
  for (i in 1:1000000) {
    result[i] <- sqrt(i)
  }
})

# data.table: filter rows, aggregate and group in one dt[i, j, by] expression.
library(data.table)
dt <- as.data.table(data)
result <- dt[age > 25, .(avg_salary = mean(salary)), by = department]
创建报告与交互式应用(Shiny示例)
library(shiny)

# User interface: a slider in the sidebar and a plot in the main panel.
ui <- fluidPage(
  titlePanel("我的Shiny应用"),
  sidebarLayout(
    sidebarPanel(
      sliderInput("bins", "分箱数:", min = 1, max = 50, value = 30)
    ),
    mainPanel(
      plotOutput("distPlot")
    )
  )
)

# Server logic: redraw the histogram whenever the slider value changes.
server <- function(input, output) {
  output$distPlot <- renderPlot({
    x <- faithful$waiting
    # n bins need n + 1 break points spanning the range of the data.
    bins <- seq(min(x), max(x), length.out = input$bins + 1)
    hist(x, breaks = bins, col = "darkgray", border = "white")
  })
}

# Combine the UI and the server function and launch the app.
shinyApp(ui = ui, server = server)
学习资源推荐
在线资源
推荐书籍
- 《R数据科学》
- 《R语言实战》
- 《高级R语言编程指南》
练习平台
这个教程涵盖了R语言从基础到高级的主要知识点。建议按照顺序学习,并通过实际项目来巩固所学内容。R语言的学习需要大量的实践,多写代码、多解决问题是掌握这门语言的关键。