R

#15
TIOBE #14
PYPL #6
GitHub #34
RedMonk #12
IEEE Spectrum #20
Programming Language, Statistical Computing, Data Analysis, Machine Learning, Data Science, Visualization

Programming Language

R

Overview

R is a programming language specialized for statistical computing and data analysis.

Details

R is a programming language and environment for statistical computing and graphics developed by Ross Ihaka and Robert Gentleman in New Zealand in 1993. Starting as an implementation of the S language, it provides rich functionality specialized for statistical analysis, data mining, machine learning, and data visualization. CRAN (The Comprehensive R Archive Network) has over 20,000 packages registered, covering virtually every statistical and data analysis method. It is widely used in academic research, data science, bioinformatics, financial analysis, and other fields, making it an essential tool for statisticians and data scientists.

Code Examples

Hello World

# print() echoes a string literal (quoted, with an index prefix)
print("Hello, World!")

# cat() emits raw text; the trailing "\n" ends the line
cat("Hello, R!\n")

# A string stored in a variable prints the same way as a literal
message <- "Hello, R!"
print(message)

# cat() separates its arguments with a single space
name <- "John"
age <- 25
cat(
  "My name is", name,
  "and I am", age,
  "years old.\n"
)

# paste() joins its arguments; the default separator is spelled out here
greeting <- paste("Hello,", "R", "Programming!", sep = " ")
print(greeting)

# sprintf() fills a C-style template: %s for text, %d for whole numbers
formatted_message <- sprintf(
  fmt = "My name is %s and I am %d years old.",
  name,
  age
)
print(formatted_message)

Variables and Data Types

# Numeric types (stored as doubles, including scientific notation)
x <- 42
y <- 3.14159
scientific_notation <- 1.23e-4

# Character types (single and double quotes are interchangeable)
name <- "John Smith"
city <- 'New York'

# Logical types
is_active <- TRUE
is_complete <- FALSE

# Factor: a categorical vector with a fixed set of levels
gender <- factor(c("Male", "Female", "Male", "Female"))
print(levels(gender))

# Vectors: every element shares one type
numbers <- c(1, 2, 3, 4, 5)
fruits <- c("Apple", "Banana", "Orange")
logicals <- c(TRUE, FALSE, TRUE)

# Sequence generation
sequence1 <- 1:10                              # 1 to 10
sequence2 <- seq(from = 0, to = 100, by = 10)  # 0 to 100 in steps of 10
repeated <- rep(c(1, 2), times = 5)            # 1, 2 repeated five times

# Lists may mix element types and lengths
person <- list(
  name = "Jane Doe",
  age = 30,
  married = TRUE,
  children = c("Tom", "Alice")
)

# Data frame: tabular data, one equal-length vector per column
df <- data.frame(
  Name = c("Smith", "Johnson", "Brown"),
  Age = c(25, 30, 35),
  Gender = c("Male", "Female", "Male"),
  stringsAsFactors = FALSE
)

# Matrix: filled column by column unless byrow = TRUE
matrix1 <- matrix(1:12, nrow = 3, ncol = 4)
matrix2 <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 2, byrow = TRUE)

# Array (multi-dimensional)
array1 <- array(1:24, dim = c(2, 3, 4))

# Check data types
print(class(x))          # "numeric"
print(class(name))       # "character"
print(class(is_active))  # "logical"
print(class(df))         # "data.frame"

# Data structure information
str(person)         # Structure of list
str(df)             # Structure of data frame
# print() is explicit so the summary also appears when the script is run
# through source(), matching how the other results here are shown
print(summary(df))  # Summary statistics

# Type conversion
number_as_char <- as.character(42)
char_as_number <- as.numeric("123")
number_as_logical <- as.logical(c(0, 1, 2))  # 0 is FALSE, non-zero is TRUE

print(paste("Number to character:", number_as_char))
print(paste("Character to number:", char_as_number))
# toString() collapses the three results so the label is printed once
# instead of being recycled across every element
print(paste("Number to logical:", toString(number_as_logical)))

Functions and Control Structures

# Basic function definition: add two values and return their sum.
#   a, b: values to add (anything `+` accepts)
add_numbers <- function(a, b) {
  # The last evaluated expression is the function's return value
  a + b
}

# Function with default arguments: build a greeting such as "Hello Smith!".
#   name:     who to greet
#   greeting: salutation placed before the name (defaults to "Hello")
# Returns a character vector.
greet <- function(name, greeting = "Hello") {
  # paste() would insert a space before "!", so the pieces are glued with
  # paste0(); this also avoids a local variable that masks base::message().
  paste0(greeting, " ", name, "!")
}

# Variable arguments: summarise any number of numeric values.
# Everything passed through `...` is combined into one vector; the result is
# a named list with elements mean, median, sd, min and max.
calculate_stats <- function(...) {
  values <- c(...)
  summaries <- list(mean = mean, median = median, sd = sd, min = min, max = max)
  # Apply each summary function to the combined vector, keeping the names
  lapply(summaries, function(f) f(values))
}

# Conditional statements - if / else picks one of two branches
age <- 20
if (age < 18) {
  print("Minor")
} else {
  print("Adult")
}

# Multiple conditions: if / else if is an expression, so its value can be
# assigned directly; the first branch whose test is TRUE wins
score <- 85
grade <- if (score >= 90) {
  "A"
} else if (score >= 80) {
  "B"
} else if (score >= 70) {
  "C"
} else {
  "D"
}
print(paste("Grade:", grade))

# ifelse function: the vectorized conditional, one result per element
ages <- c(15, 22, 17, 30, 16)
status <- ifelse(ages >= 18, yes = "Adult", no = "Minor")
print(status)

# switch statement: map a day number (1-7, Monday first) to its name.
#   day_number: a single number or string, e.g. 3 or "3"
# Returns the day name, or "Invalid day" for anything outside 1-7.
get_day_name <- function(day_number) {
  # switch() matches the names below only when given a string; given a
  # number it selects by position and returns NULL when out of range.
  # Coercing first makes numeric and character input behave the same.
  switch(as.character(day_number),
    "1" = "Monday",
    "2" = "Tuesday",
    "3" = "Wednesday",
    "4" = "Thursday",
    "5" = "Friday",
    "6" = "Saturday",
    "7" = "Sunday",
    "Invalid day"
  )
}

# for loop over a run of integers
print("=== for loop example ===")
for (i in seq_len(5)) {
  print(paste("Count:", i))
}

# for loop over the elements of a vector
fruits <- c("Apple", "Banana", "Orange")
for (fruit in fruits) {
  print(paste("Fruit:", fruit))
}

# while loop: the condition is tested before every pass
count <- 1
while (count <= 3) {
  print(paste("while loop count:", count))
  count <- count + 1
}

# repeat loop: has no condition of its own and runs until break is reached
counter <- 1
repeat {
  print(paste("repeat loop count:", counter))
  counter <- counter + 1
  if (counter > 3) break
}

# Function call examples (arguments passed by name)
print(add_numbers(a = 5, b = 3))
print(greet(name = "Smith"))
print(greet(name = "Johnson", greeting = "Good morning"))

# Ten values are forwarded through `...`; the result is a named list
stats <- calculate_stats(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
print(stats)

# Look up each day, passing the day number as a string
for (day in seq_len(7)) {
  print(get_day_name(as.character(day)))
}

Data Manipulation and Analysis

# Create sample data
set.seed(123)  # fixed seed so every run draws the same random values
n <- 100

# One row per student: ID, name, three subject scores, gender and grade.
# The columns are evaluated in the order written, which fixes the order of
# the random draws.
students <- data.frame(
  StudentID = seq_len(n),
  Name = paste0("Student", seq_len(n)),
  Math = round(rnorm(n, mean = 75, sd = 15), digits = 1),
  English = round(rnorm(n, mean = 70, sd = 12), digits = 1),
  Science = round(rnorm(n, mean = 78, sd = 18), digits = 1),
  Gender = factor(sample(c("Male", "Female"), size = n, replace = TRUE)),
  Grade = factor(sample(seq_len(3), size = n, replace = TRUE)),
  stringsAsFactors = FALSE
)

# Basic data information
print("=== Basic Data Information ===")
print(head(students))      # first 6 rows
print(tail(students))      # last 6 rows
print(dim(students))       # row and column counts together
print(nrow(students))      # row count
print(ncol(students))      # column count
print(colnames(students))  # column names
print(summary(students))   # per-column summary

# Data selection and extraction
print("=== Data Selection ===")
# Columns: [[ ]] pulls a single column out as a vector,
# [ ] with several names keeps a data frame
math_scores <- students[["Math"]]
english_scores <- students[["English"]]
selected_columns <- students[c("Name", "Math", "English")]

# Rows by position and by key value
first_10_students <- head(students, n = 10)
specific_student <- students[students$StudentID == 5, ]

# Rows matching a logical condition (& combines conditions element-wise)
high_math_students <- students[students$Math >= 80, ]
male_students <- students[students$Gender == "Male", ]
excellent_students <- students[
  students$Math >= 80 & students$English >= 80,
]

print(paste("Students with Math >= 80:", nrow(high_math_students)))
print(paste("Male students:", nrow(male_students)))
print(paste("Students with Math & English >= 80:", nrow(excellent_students)))

# Data modification and addition: total and mean across the three subjects
students$Total <- students$Math + students$English + students$Science
students$Average <- round(students$Total / 3, 1)

# Add grade evaluation: bin the average into letter grades.
# Intervals are closed on the right, so exactly 60 is "D" and exactly 90 is "A".
# The outer breaks are open-ended because the scores come from rnorm() and are
# not limited to 0-100; with fixed 0/100 limits any average outside that range
# would silently become NA.
students$Evaluation <- cut(students$Average,
                          breaks = c(-Inf, 60, 70, 80, 90, Inf),
                          labels = c("D", "C", "B", "A", "S"),
                          include.lowest = TRUE)

# Basic statistics calculation for the Math column
print("=== Basic Statistics ===")
cat("Math mean:", mean(students[["Math"]]), "\n")
cat("Math median:", median(students[["Math"]]), "\n")
cat("Math standard deviation:", sd(students[["Math"]]), "\n")
cat("Math variance:", var(students[["Math"]]), "\n")
cat("Math minimum:", min(students[["Math"]]), "\n")
cat("Math maximum:", max(students[["Math"]]), "\n")

# Group-wise aggregation
print("=== Group-wise Aggregation ===")
# Mean of Average within each Gender
gender_avg <- aggregate(
  Average ~ Gender,
  data = students,
  FUN = mean
)
print(gender_avg)

# Mean of each subject within each Grade; cbind() aggregates the three
# columns in a single call
grade_avg <- aggregate(
  cbind(Math, English, Science) ~ Grade,
  data = students,
  FUN = mean
)
print(grade_avg)

# Number of students per evaluation level
evaluation_count <- table(students$Evaluation)
print(evaluation_count)

# Two-way counts: Gender in rows, Grade in columns
cross_table <- table(students$Gender, students$Grade)
print(cross_table)

# Correlation analysis: pairwise correlations of the three subjects
print("=== Correlation Analysis ===")
numeric_columns <- students[c("Math", "English", "Science")]
correlation_matrix <- cor(numeric_columns)
print(correlation_matrix)

Data Visualization

# Basic plots
print("=== Basic Plots (Data Visualization) ===")

# These calls read the `students` data frame, including its Evaluation column.
# Each high-level call below starts a new plot on the active graphics device.

# Scatter plot of English against Math (pch = 16 draws filled circles)
plot(
  x = students$Math,
  y = students$English,
  main = "Math vs English Scatter Plot",
  xlab = "Math Score",
  ylab = "English Score",
  col = "blue",
  pch = 16
)

# Overlay the least-squares line on the scatter plot
abline(lm(English ~ Math, data = students), col = "red", lwd = 2)

# Histogram of Math scores
hist(
  students$Math,
  main = "Distribution of Math Scores",
  xlab = "Score",
  ylab = "Frequency",
  col = "lightblue",
  breaks = 20
)

# Box plot of Math scores, one box per Gender level
boxplot(
  Math ~ Gender,
  data = students,
  main = "Math Score Distribution by Gender",
  xlab = "Gender",
  ylab = "Math Score",
  col = c("pink", "lightblue")
)

# Bar plot of the number of students at each evaluation level
barplot(
  table(students$Evaluation),
  main = "Number of Students by Evaluation",
  xlab = "Evaluation",
  ylab = "Count",
  col = rainbow(5)
)

# Advanced visualization with ggplot2
# install.packages("ggplot2")  # Only needed once
library(ggplot2)

# Scatter plot (ggplot2): points coloured by Gender, with one straight-line
# fit per Gender and no confidence band
p1 <- ggplot(students, aes(Math, English, color = Gender)) +
  geom_point(size = 2, alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Math vs English Relationship by Gender",
    x = "Math Score",
    y = "English Score"
  ) +
  theme_minimal()

# Histogram (ggplot2): average scores, one panel per Gender
p2 <- ggplot(students, aes(Average, fill = Gender)) +
  geom_histogram(bins = 20, alpha = 0.7, position = "identity") +
  facet_wrap(~Gender) +
  labs(
    title = "Average Score Distribution by Gender",
    x = "Average Score",
    y = "Frequency"
  ) +
  theme_minimal()

# Box plot (ggplot2): average score per Grade, with the individual
# observations jittered on top
p3 <- ggplot(students, aes(Grade, Average, fill = Grade)) +
  geom_boxplot(alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.5) +
  labs(
    title = "Average Score Distribution by Grade",
    x = "Grade",
    y = "Average Score"
  ) +
  theme_minimal()

# The plots are only built above, not drawn. To draw them, run:
# print(p1)
# print(p2)
# print(p3)

print("Graph creation code executed. Graphs will be displayed in actual R console.")

Statistical Analysis

# Descriptive statistics
print("=== Descriptive Statistics ===")

# Collect a set of summary measures for one numeric vector.
#   x: numeric vector; missing values are ignored by every measure
# Returns a named list: mean, median, mode, sd, variance, minimum, maximum,
# q1, q3, skewness and kurtosis (the last two come from the moments package).
desc_stats <- function(x) {
  # Mode: tabulate the values, sort by count (largest first), take the first
  # value's label and convert it back to a number
  counts <- sort(table(x), decreasing = TRUE)
  most_common <- as.numeric(names(counts)[1])

  list(
    mean = mean(x, na.rm = TRUE),
    median = median(x, na.rm = TRUE),
    mode = most_common,
    sd = sd(x, na.rm = TRUE),
    variance = var(x, na.rm = TRUE),
    minimum = min(x, na.rm = TRUE),
    maximum = max(x, na.rm = TRUE),
    q1 = quantile(x, probs = 0.25, na.rm = TRUE),
    q3 = quantile(x, probs = 0.75, na.rm = TRUE),
    skewness = moments::skewness(x, na.rm = TRUE),
    kurtosis = moments::kurtosis(x, na.rm = TRUE)
  )
}

# Descriptive statistics for the Math column
# install.packages("moments")  # Only needed once
library(moments)
math_stats <- desc_stats(x = students$Math)
print("Math descriptive statistics:")
print(math_stats)

# Hypothesis testing
print("=== Hypothesis Testing ===")

# 1. One-sample t-test: does the Math mean differ from 75?
t_test_one <- t.test(x = students$Math, mu = 75)
print("One-sample t-test (Math mean vs 75):")
print(t_test_one)

# 2. Two-sample t-test: do the Math means differ between genders?
male_math <- students$Math[students$Gender == "Male"]
female_math <- students$Math[students$Gender == "Female"]

t_test_two <- t.test(x = male_math, y = female_math)
print("Two-sample t-test (Gender difference in Math):")
print(t_test_two)

# 3. Paired t-test: Math and English are compared student by student
paired_t_test <- t.test(
  students$Math,
  students$English,
  paired = TRUE
)
print("Paired t-test (Math vs English):")
print(paired_t_test)

# 4. Chi-square test of independence on the Gender x Evaluation counts
chi_square_test <- chisq.test(table(students$Gender, students$Evaluation))
print("Chi-square test (Gender and Evaluation independence):")
print(chi_square_test)

# 5. One-way ANOVA: does the average score differ between grades?
anova_result <- aov(Average ~ Grade, data = students)
print("ANOVA (Average scores between grades):")
print(summary(anova_result))

# Post-hoc test (Tukey HSD): pairwise grade comparisons on the ANOVA fit
tukey_result <- TukeyHSD(anova_result)
print("Tukey HSD post-hoc test:")
print(tukey_result)

# Correlation tests
print("=== Correlation Analysis ===")

# Pearson correlation (the default method of cor.test)
cor_test_math_english <- cor.test(x = students$Math, y = students$English)
print("Math and English correlation test:")
print(cor_test_math_english)

# Spearman rank correlation
cor_test_spearman <- cor.test(
  x = students$Math,
  y = students$English,
  method = "spearman"
)
print("Math and English Spearman correlation:")
print(cor_test_spearman)

# Normality tests
print("=== Normality Tests ===")

# Shapiro-Wilk test
shapiro_math <- shapiro.test(students$Math)
print("Math normality test (Shapiro-Wilk):")
print(shapiro_math)

# Kolmogorov-Smirnov test against a normal distribution whose mean and sd
# are taken from the sample itself
ks_math <- ks.test(
  students$Math,
  "pnorm",
  mean = mean(students$Math),
  sd = sd(students$Math)
)
print("Math normality test (Kolmogorov-Smirnov):")
print(ks_math)

# Regression analysis
print("=== Regression Analysis ===")

# Simple linear regression: English explained by Math
simple_regression <- lm(English ~ Math, data = students)
print("Simple linear regression (English ~ Math):")
print(summary(simple_regression))

# Multiple linear regression: Average explained by the three subjects
multiple_regression <- lm(Average ~ Math + English + Science, data = students)
print("Multiple linear regression (Average ~ Math + English + Science):")
print(summary(multiple_regression))

# Regression diagnostics: draw the diagnostic plots in a 2 x 2 grid.
# par() returns the previous settings, so they are saved and put back
# afterwards instead of assuming the layout was 1 x 1 before.
old_par <- par(mfrow = c(2, 2))
plot(simple_regression)
par(old_par)

print("Statistical analysis completed.")

Package Management and Data I/O

# Package management
print("=== Package Management ===")

# Check installed packages: name and version of each one.
# drop = FALSE keeps the result a matrix, so nrow() still works if only a
# single package is installed.
installed_packages <- installed.packages()[, c("Package", "Version"), drop = FALSE]
print(paste("Number of installed packages:", nrow(installed_packages)))

# Install packages (examples)
# install.packages("dplyr")
# install.packages("ggplot2")
# install.packages("readr")

# Load packages
library(utils)  # Base package

# Check loaded packages: show the search path. print() is explicit so the
# result also appears when the script is run through source().
print(search())

# Data input/output
print("=== Data Input/Output ===")

# CSV file read/write
# Write the data frame without a row-name column
write.csv(
  x = students,
  file = "students_data.csv",
  row.names = FALSE,
  fileEncoding = "UTF-8"
)
print("Saved to CSV file: students_data.csv")

# To read the CSV back:
# loaded_data <- read.csv("students_data.csv", fileEncoding = "UTF-8")
# print("Loaded from CSV file")

# Excel file read/write (using openxlsx package)
# install.packages("openxlsx")  # Only needed once
# library(openxlsx)
# write.xlsx(students, "students_data.xlsx")
# loaded_excel <- read.xlsx("students_data.xlsx")

# RData file (R native format): the object is stored under its own name
save(list = "students", file = "students_data.RData")
print("Saved to RData file: students_data.RData")

# load("students_data.RData")  # Load data

# Text file read/write: each vector element becomes one line
writeLines(
  text = c("This is a test file", "Second line of data", "Third line of data"),
  con = "test.txt"
)
text_data <- readLines(con = "test.txt", encoding = "UTF-8")
print("Text file contents:")
print(text_data)

# JSON file read/write (jsonlite package)
# install.packages("jsonlite")  # Only needed once
# library(jsonlite)

# Nested list used as the input for the JSON conversion example:
# a label, five numeric values, and a metadata sub-list
sample_list <- list(
  name = "Sample Data",
  values = as.numeric(seq_len(5)),
  metadata = list(
    created = Sys.Date(),  # the date on which the script is run
    author = "R script"
  )
)

# json_string <- toJSON(sample_list, pretty = TRUE, auto_unbox = TRUE)
# writeLines(json_string, "sample_data.json")
# loaded_json <- fromJSON("sample_data.json")

# Load data from URL (example: CSV)
# url_data <- read.csv("https://example.com/data.csv")

# Database connection (example: SQLite)
# install.packages("RSQLite")  # Only needed once
# library(RSQLite)
# 
# # Connect to database
# con <- dbConnect(SQLite(), "sample.db")
# 
# # Write data to database
# dbWriteTable(con, "students", students, overwrite = TRUE)
# 
# # Get data with SQL query
# query_result <- dbGetQuery(con, "SELECT * FROM students WHERE Math > 80")
# 
# # Close connection
# dbDisconnect(con)

# Report the current working directory
print(paste("Current working directory:", getwd()))

# Names of the entries in the current directory
files_in_dir <- list.files(path = ".")
print("Files in current directory:")
print(files_in_dir)

# Check whether the two data files are present
print(paste("students_data.csv exists:", file.exists("students_data.csv")))
print(paste("students_data.RData exists:", file.exists("students_data.RData")))

# Report the CSV size in bytes, but only when the file is present
if (file.exists("students_data.csv")) {
  file_size <- file.info("students_data.csv")$size
  print(paste("students_data.csv size:", file_size, "bytes"))
}

print("Package management and data I/O examples completed.")

Versions

Version Release Date Major Features
R 4.4 2024-04 Performance improvements, Enhanced object.size()
R 4.3 2023-04 Improved graphics engine, New shorthand syntax
R 4.2 2022-04 Graphics improvements, Enhanced pipe operator
R 4.1 2021-05 Native pipe operator, Improved graphics
R 4.0 2020-04 Reference counting, Raw strings
R 3.6 2019-04 Serialization improvements, Better random number generation

Reference Links

Official Documentation

Learning Resources

Packages and Tools

  • RStudio - R development environment IDE
  • Tidyverse - Collection of data science packages
  • Shiny - Interactive web app creation
  • ggplot2 - Data visualization package