R Tutorial
An introduction to R
Introduction
This tutorial will introduce the reader to R, a free, open-source statistical computing environment often used with RStudio, an integrated development environment for R.
Download
- Download R at https://www.r-project.org/
- Download RStudio at https://rstudio.com/products/rstudio/download/
Calculator
R can be used as a super awesome calculator
# 5 + 3 = 8
5 + 3
## [1] 8
# 24 / (1 + 2) = 8
24 / (1 + 2)
## [1] 8
# 2 * 2 * 2 = 8
2^3
## [1] 8
# 8 * 8 = 64
sqrt(64)
## [1] 8
# -log10(0.05 / 5000000) = 8
-log10(0.05 / 5000000)
## [1] 8
Functions
R has many useful built-in functions
1:10
## [1] 1 2 3 4 5 6 7 8 9 10
as.character(1:10)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
rep(1:2, times = 5)
## [1] 1 2 1 2 1 2 1 2 1 2
rep(1:5, times = 2)
## [1] 1 2 3 4 5 1 2 3 4 5
rep(1:5, each = 2)
## [1] 1 1 2 2 3 3 4 4 5 5
rep(1:5, length.out = 7)
## [1] 1 2 3 4 5 1 2
seq(5, 50, by = 5)
## [1] 5 10 15 20 25 30 35 40 45 50
seq(5, 50, length.out = 5)
## [1] 5.00 16.25 27.50 38.75 50.00
# NOTE: 1:10 has 10 elements but 20:30 has 11, so paste() silently recycles the
# shorter vector; the 11th result therefore pairs 1 with 30.
paste(1:10, 20:30, sep = "-")
## [1] "1-20" "2-21" "3-22" "4-23" "5-24" "6-25" "7-26" "8-27" "9-28" "10-29" "1-30"
paste(1:10, collapse = "-")
## [1] "1-2-3-4-5-6-7-8-9-10"
paste0("x", 1:10)
## [1] "x1" "x2" "x3" "x4" "x5" "x6" "x7" "x8" "x9" "x10"
min(1:10)
## [1] 1
max(1:10)
## [1] 10
range(1:10)
## [1] 1 10
mean(1:10)
## [1] 5.5
sd(1:10)
## [1] 3.02765
Custom Functions
Users can also create their own functions
# Express `x` as a percentage of the total `x + y`.
#
# x, y: numeric values.
# Returns a character string such as "10 %" (paste() inserts the space
# between the number and the percent sign).
customFunction1 <- function(x, y) {
  z <- 100 * x / (x + y)
  paste(z, "%")
}
customFunction1(x = 10, y = 90)
## [1] "10 %"
# Print the values one standard deviation below and above the mean of `x`.
#
# x: a numeric vector.
# Prints "Min = <mean - sd>" and "Max = <mean + sd>". The value of the final
# print() call (the "Max = ..." string) is the function's return value.
customFunction2 <- function(x) {
  mymin <- mean(x - sd(x)) # equivalent to mean(x) - sd(x)
  mymax <- mean(x) + sd(x)
  print(paste("Min =", mymin))
  print(paste("Max =", mymax))
}
customFunction2(x = 1:10)
## [1] "Min = 2.47234964590251"
## [1] "Max = 8.52765035409749"
for loops and if/else statements
xx <- NULL # creates an empty object
# Fill xx one element at a time with the first ten multiples of 3
for (i in 1:10) {
  xx[i] <- i * 3
}
xx
## [1] 3 6 9 12 15 18 21 24 27 30
xx %% 2 # gives the remainder when divided by 2
## [1] 1 0 1 0 1 0 1 0 1 0
# Report, element by element, whether each value is even or odd.
# seq_along() is used instead of 1:length() so an empty vector loops zero times.
# At top level `else` must sit on the same line as the closing brace of `if`.
for (i in seq_along(xx)) {
  if ((xx[i] %% 2) == 0) {
    print(paste(xx[i], "is Even"))
  } else {
    print(paste(xx[i], "is Odd"))
  }
}
## [1] "3 is Odd"
## [1] "6 is Even"
## [1] "9 is Odd"
## [1] "12 is Even"
## [1] "15 is Odd"
## [1] "18 is Even"
## [1] "21 is Odd"
## [1] "24 is Even"
## [1] "27 is Odd"
## [1] "30 is Even"
# or
ifelse(xx %% 2 == 0, "Even", "Odd")
## [1] "Odd" "Even" "Odd" "Even" "Odd" "Even" "Odd" "Even" "Odd" "Even"
paste(xx, ifelse(xx %% 2 == 0, "is Even", "is Odd"))
## [1] "3 is Odd" "6 is Even" "9 is Odd" "12 is Even" "15 is Odd" "18 is Even" "21 is Odd" "24 is Even" "27 is Odd" "30 is Even"
Objects
Information can be stored in user defined objects, in multiple forms:
- c(): a string (vector) of values
- matrix(): a two dimensional matrix in one format
- data.frame(): a two dimensional matrix where each column can be a different format
- list(): a collection of objects, each of which can be a different type
A string…
# The same ten values created two ways: with the `:` sequence operator ...
xc <- 1:10
xc
## [1] 1 2 3 4 5 6 7 8 9 10
# ... and by combining individual values with c()
xc <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
xc
## [1] 1 2 3 4 5 6 7 8 9 10
A matrix…
# 10 x 10 matrix filled row by row (byrow = TRUE)
xm <- matrix(1:100, nrow = 10, ncol = 10, byrow = TRUE)
xm
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 2 3 4 5 6 7 8 9 10
## [2,] 11 12 13 14 15 16 17 18 19 20
## [3,] 21 22 23 24 25 26 27 28 29 30
## [4,] 31 32 33 34 35 36 37 38 39 40
## [5,] 41 42 43 44 45 46 47 48 49 50
## [6,] 51 52 53 54 55 56 57 58 59 60
## [7,] 61 62 63 64 65 66 67 68 69 70
## [8,] 71 72 73 74 75 76 77 78 79 80
## [9,] 81 82 83 84 85 86 87 88 89 90
## [10,] 91 92 93 94 95 96 97 98 99 100
# 10 x 10 matrix filled column by column (byrow = FALSE, the default)
xm <- matrix(1:100, nrow = 10, ncol = 10, byrow = FALSE)
xm
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 11 21 31 41 51 61 71 81 91
## [2,] 2 12 22 32 42 52 62 72 82 92
## [3,] 3 13 23 33 43 53 63 73 83 93
## [4,] 4 14 24 34 44 54 64 74 84 94
## [5,] 5 15 25 35 45 55 65 75 85 95
## [6,] 6 16 26 36 46 56 66 76 86 96
## [7,] 7 17 27 37 47 57 67 77 87 97
## [8,] 8 18 28 38 48 58 68 78 88 98
## [9,] 9 19 29 39 49 59 69 79 89 99
## [10,] 10 20 30 40 50 60 70 80 90 100
A data frame…
# Example data frame: ten rows, with character, integer, numeric and logical
# columns built from the functions introduced earlier.
xd <- data.frame(
  x1 = c("aa", "bb", "cc", "dd", "ee",
         "ff", "gg", "hh", "ii", "jj"),
  x2 = 1:10,
  x3 = c(1, 1, 1, 1, 1, 2, 2, 2, 3, 3),
  x4 = rep(c(1, 2), times = 5),
  x5 = rep(1:5, times = 2),
  x6 = rep(1:5, each = 2),
  x7 = seq(5, 50, by = 5),
  x8 = log10(1:10),
  x9 = (1:10)^3,
  x10 = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE)
)
xd
## x1 x2 x3 x4 x5 x6 x7 x8 x9 x10
## 1 aa 1 1 1 1 1 5 0.0000000 1 TRUE
## 2 bb 2 1 2 2 1 10 0.3010300 8 TRUE
## 3 cc 3 1 1 3 2 15 0.4771213 27 TRUE
## 4 dd 4 1 2 4 2 20 0.6020600 64 FALSE
## 5 ee 5 1 1 5 3 25 0.6989700 125 FALSE
## 6 ff 6 2 2 1 3 30 0.7781513 216 TRUE
## 7 gg 7 2 1 2 4 35 0.8450980 343 TRUE
## 8 hh 8 2 2 3 4 40 0.9030900 512 FALSE
## 9 ii 9 3 1 4 5 45 0.9542425 729 FALSE
## 10 jj 10 3 2 5 5 50 1.0000000 1000 FALSE
A list…
# Bundle the vector, the matrix and the data frame into one (unnamed) list;
# [[i]] extracts the i-th element itself.
xl <- list(xc, xm, xd)
xl[[1]]
## [1] 1 2 3 4 5 6 7 8 9 10
xl[[2]]
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 11 21 31 41 51 61 71 81 91
## [2,] 2 12 22 32 42 52 62 72 82 92
## [3,] 3 13 23 33 43 53 63 73 83 93
## [4,] 4 14 24 34 44 54 64 74 84 94
## [5,] 5 15 25 35 45 55 65 75 85 95
## [6,] 6 16 26 36 46 56 66 76 86 96
## [7,] 7 17 27 37 47 57 67 77 87 97
## [8,] 8 18 28 38 48 58 68 78 88 98
## [9,] 9 19 29 39 49 59 69 79 89 99
## [10,] 10 20 30 40 50 60 70 80 90 100
xl[[3]] # 3rd object in the list (the data frame)
## x1 x2 x3 x4 x5 x6 x7 x8 x9 x10
## 1 aa 1 1 1 1 1 5 0.0000000 1 TRUE
## 2 bb 2 1 2 2 1 10 0.3010300 8 TRUE
## 3 cc 3 1 1 3 2 15 0.4771213 27 TRUE
## 4 dd 4 1 2 4 2 20 0.6020600 64 FALSE
## 5 ee 5 1 1 5 3 25 0.6989700 125 FALSE
## 6 ff 6 2 2 1 3 30 0.7781513 216 TRUE
## 7 gg 7 2 1 2 4 35 0.8450980 343 TRUE
## 8 hh 8 2 2 3 4 40 0.9030900 512 FALSE
## 9 ii 9 3 1 4 5 45 0.9542425 729 FALSE
## 10 jj 10 3 2 5 5 50 1.0000000 1000 FALSE
Selecting Data
xc[5] # 5th element in xc
## [1] 5
xd$x3[5] # 5th element in col "x3"
## [1] 1
xd[5, "x3"] # row 5, col "x3"
## [1] 1
xd$x3 # all of col "x3"
## [1] 1 1 1 1 1 2 2 2 3 3
xd[, "x3"] # all rows, col "x3"
## [1] 1 1 1 1 1 2 2 2 3 3
xd[3, ] # row 3, all cols
## x1 x2 x3 x4 x5 x6 x7 x8 x9 x10
## 3 cc 3 1 1 3 2 15 0.4771213 27 TRUE
xd[c(2, 4), c("x4", "x5")] # rows 2 & 4, cols "x4" & "x5"
## x4 x5
## 2 2 2
## 4 2 4
xl[[3]]$x1 # 3rd object in the list, col "x1"
## [1] "aa" "bb" "cc" "dd" "ee" "ff" "gg" "hh" "ii" "jj"
regexpr
# Split "Item N (detail N)" strings into an Item part and a Detail part.
xx <- data.frame(Name = c("Item 1 (detail 1)",
                          "Item 20 (detail 20)",
                          "Item 300 (detail 300)"),
                 Item = NA,
                 Detail = NA)
# regexpr() gives the position of the first match; +1 / -1 keep only the text
# strictly between the parentheses.
xx$Detail <- substr(xx$Name, regexpr("\\(", xx$Name) + 1, regexpr("\\)", xx$Name) - 1)
# -2 drops both the "(" and the space in front of it.
xx$Item <- substr(xx$Name, 1, regexpr("\\(", xx$Name) - 2)
xx
## Name Item Detail
## 1 Item 1 (detail 1) Item 1 detail 1
## 2 Item 20 (detail 20) Item 20 detail 20
## 3 Item 300 (detail 300) Item 300 detail 300
Data Formats
Data can also be saved in many formats:
- numeric
- integer
- character
- factor
- logical
# Convert column x3 between formats and print it after each conversion
xd$x3 <- as.character(xd$x3)
xd$x3
## [1] "1" "1" "1" "1" "1" "2" "2" "2" "3" "3"
xd$x3 <- as.numeric(xd$x3)
xd$x3
## [1] 1 1 1 1 1 2 2 2 3 3
xd$x3 <- as.factor(xd$x3)
xd$x3
## [1] 1 1 1 1 1 2 2 2 3 3
## Levels: 1 2 3
# levels = sets the order of the factor levels explicitly
xd$x3 <- factor(xd$x3, levels = c("3", "2", "1"))
xd$x3
## [1] 1 1 1 1 1 2 2 2 3 3
## Levels: 3 2 1
xd$x10
## [1] TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
as.numeric(xd$x10) # TRUE = 1, FALSE = 0
## [1] 1 1 1 0 0 1 1 0 0 0
sum(xd$x10)
## [1] 5
The internal structure of an object can be checked with str()
str(xc) # c()
## num [1:10] 1 2 3 4 5 6 7 8 9 10
str(xm) # matrix()
## int [1:10, 1:10] 1 2 3 4 5 6 7 8 9 10 ...
str(xd) # data.frame()
## 'data.frame': 10 obs. of 10 variables:
## $ x1 : chr "aa" "bb" "cc" "dd" ...
## $ x2 : int 1 2 3 4 5 6 7 8 9 10
## $ x3 : Factor w/ 3 levels "3","2","1": 3 3 3 3 3 2 2 2 1 1
## $ x4 : num 1 2 1 2 1 2 1 2 1 2
## $ x5 : int 1 2 3 4 5 1 2 3 4 5
## $ x6 : int 1 1 2 2 3 3 4 4 5 5
## $ x7 : num 5 10 15 20 25 30 35 40 45 50
## $ x8 : num 0 0.301 0.477 0.602 0.699 ...
## $ x9 : num 1 8 27 64 125 216 343 512 729 1000
## $ x10: logi TRUE TRUE TRUE FALSE FALSE TRUE ...
str(xl) # list()
## List of 3
## $ : num [1:10] 1 2 3 4 5 6 7 8 9 10
## $ : int [1:10, 1:10] 1 2 3 4 5 6 7 8 9 10 ...
## $ :'data.frame': 10 obs. of 10 variables:
## ..$ x1 : chr [1:10] "aa" "bb" "cc" "dd" ...
## ..$ x2 : int [1:10] 1 2 3 4 5 6 7 8 9 10
## ..$ x3 : num [1:10] 1 1 1 1 1 2 2 2 3 3
## ..$ x4 : num [1:10] 1 2 1 2 1 2 1 2 1 2
## ..$ x5 : int [1:10] 1 2 3 4 5 1 2 3 4 5
## ..$ x6 : int [1:10] 1 1 2 2 3 3 4 4 5 5
## ..$ x7 : num [1:10] 5 10 15 20 25 30 35 40 45 50
## ..$ x8 : num [1:10] 0 0.301 0.477 0.602 0.699 ...
## ..$ x9 : num [1:10] 1 8 27 64 125 216 343 512 729 1000
## ..$ x10: logi [1:10] TRUE TRUE TRUE FALSE FALSE TRUE ...
Packages
Additional libraries can be installed and loaded for use.
install.packages("scales")
library(scales)
# Map the values 1..10 onto the range 1..30 with scales::rescale()
xx <- data.frame(Values = 1:10)
xx$Rescaled <- rescale(x = xx$Values, to = c(1, 30))
xx
## Values Rescaled
## 1 1 1.000000
## 2 2 4.222222
## 3 3 7.444444
## 4 4 10.666667
## 5 5 13.888889
## 6 6 17.111111
## 7 7 20.333333
## 8 8 23.555556
## 9 9 26.777778
## 10 10 30.000000
Functions from a library can also be used without loading the library, by prefixing the function with the package name and ::
scales::rescale(1:10, to = c(1, 30)) # package::function, no library() call needed
## [1] 1.000000 4.222222 7.444444 10.666667 13.888889 17.111111 20.333333 23.555556 26.777778 30.000000
Data Wrangling
R for Data Science - https://r4ds.had.co.nz/
# Build a small data frame, then add two columns derived from existing ones
xx <- data.frame(Group = c("X", "X", "Y", "Y", "Y", "X", "X", "X", "Y", "Y"),
                 Data1 = 1:10,
                 Data2 = seq(10, 100, by = 10))
xx$NewData1 <- xx$Data1 + xx$Data2
xx$NewData2 <- xx$Data1 * 1000
xx
## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
## 5 Y 5 50 55 5000
## 6 X 6 60 66 6000
## 7 X 7 70 77 7000
## 8 X 8 80 88 8000
## 9 Y 9 90 99 9000
## 10 Y 10 100 110 10000
xx$Data1 < 5 # which are less than 5
## [1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
# Use the logical vector to keep only the matching rows (all columns)
xx[xx$Data1 < 5, ]
## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
# Rows where Group is "X", restricted to three named columns
xx[xx$Group == "X", c("Group", "Data2", "NewData1")]
## Group Data2 NewData1
## 1 X 10 11
## 2 X 20 22
## 6 X 60 66
## 7 X 70 77
## 8 X 80 88
Data wrangling with tidyverse and pipes (%>%)
library(tidyverse) # install.packages("tidyverse")
# Same kind of table as before, built in one pipeline: mutate() adds columns
# and can refer to columns created earlier in the same call.
xx <- data.frame(Group = c("X", "X", "Y", "Y", "Y", "Y", "Y", "X", "X", "X")) %>%
  mutate(Data1 = 1:10,
         Data2 = seq(10, 100, by = 10),
         NewData1 = Data1 + Data2,
         NewData2 = Data1 * 1000)
xx
## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
## 5 Y 5 50 55 5000
## 6 Y 6 60 66 6000
## 7 Y 7 70 77 7000
## 8 X 8 80 88 8000
## 9 X 9 90 99 9000
## 10 X 10 100 110 10000
filter(xx, Data1 < 5)
## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
# Piped form of filter(xx, Data1 < 5)
xx %>% filter(Data1 < 5)
## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
# Keep Group "X" rows, then keep three columns, renaming Data2 on the way
xx %>% filter(Group == "X") %>%
  select(Group, NewColName = Data2, NewData1)
## Group NewColName NewData1
## 1 X 10 11
## 2 X 20 22
## 3 X 80 88
## 4 X 90 99
## 5 X 100 110
# Per-group mean and standard deviation of Data2 and NewData2
xs <- xx %>%
  group_by(Group) %>%
  summarise(Data2_mean = mean(Data2),
            Data2_sd = sd(Data2),
            NewData2_mean = mean(NewData2),
            NewData2_sd = sd(NewData2))
xs
## # A tibble: 2 × 5
## Group Data2_mean Data2_sd NewData2_mean NewData2_sd
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 X 60 41.8 6000 4183.
## 2 Y 50 15.8 5000 1581.
xx %>% left_join(xs, by = "Group") # attach the group summaries to every row
## Group Data1 Data2 NewData1 NewData2 Data2_mean Data2_sd NewData2_mean NewData2_sd
## 1 X 1 10 11 1000 60 41.83300 6000 4183.300
## 2 X 2 20 22 2000 60 41.83300 6000 4183.300
## 3 Y 3 30 33 3000 50 15.81139 5000 1581.139
## 4 Y 4 40 44 4000 50 15.81139 5000 1581.139
## 5 Y 5 50 55 5000 50 15.81139 5000 1581.139
## 6 Y 6 60 66 6000 50 15.81139 5000 1581.139
## 7 Y 7 70 77 7000 50 15.81139 5000 1581.139
## 8 X 8 80 88 8000 60 41.83300 6000 4183.300
## 9 X 9 90 99 9000 60 41.83300 6000 4183.300
## 10 X 10 100 110 10000 60 41.83300 6000 4183.300
Read/Write data
# Read a CSV file into a data frame, and write a data frame back out
# (row.names = FALSE leaves the row names out of the file)
xx <- read.csv("data_r_tutorial.csv")
write.csv(xx, "data_r_tutorial.csv", row.names = FALSE)
For Excel workbooks, the readxl package can be used to read in sheets of data.
library(readxl) # install.packages("readxl")
xx <- read_xlsx("data_r_tutorial.xlsx", sheet = "Data") # read the sheet named "Data"
Tidy Data
- Tutorial 1 - https://cran.r-project.org/web/packages/tidyr/vignettes/tidy-data.html
- Tutorial 2 - https://r4ds.had.co.nz/tidy-data.html
# Mean DTF for every Name x Location combination, rounded to one decimal
# and sorted by Location
yy <- xx %>%
  group_by(Name, Location) %>%
  summarise(Mean_DTF = round(mean(DTF), 1)) %>%
  arrange(Location)
yy
## # A tibble: 9 × 3
## # Groups: Name [3]
## Name Location Mean_DTF
## <chr> <chr> <dbl>
## 1 CDC Maxim AGL Jessore, Bangladesh 86.7
## 2 ILL 618 AGL Jessore, Bangladesh 79.3
## 3 Laird AGL Jessore, Bangladesh 76.8
## 4 CDC Maxim AGL Metaponto, Italy 134.
## 5 ILL 618 AGL Metaponto, Italy 138.
## 6 Laird AGL Metaponto, Italy 137.
## 7 CDC Maxim AGL Saskatoon, Canada 52.5
## 8 ILL 618 AGL Saskatoon, Canada 47
## 9 Laird AGL Saskatoon, Canada 56.8
# Long -> wide: one column per Location, filled with Mean_DTF
yy <- yy %>% spread(key = Location, value = Mean_DTF)
yy
## # A tibble: 3 × 4
## # Groups: Name [3]
## Name `Jessore, Bangladesh` `Metaponto, Italy` `Saskatoon, Canada`
## <chr> <dbl> <dbl> <dbl>
## 1 CDC Maxim AGL 86.7 134. 52.5
## 2 ILL 618 AGL 79.3 138. 47
## 3 Laird AGL 76.8 137. 56.8
# Wide -> long: stack columns 2 to 4 into a TraitName / Value pair
yy <- yy %>% gather(key = TraitName, value = Value, 2:4)
yy
## # A tibble: 9 × 3
## # Groups: Name [3]
## Name TraitName Value
## <chr> <chr> <dbl>
## 1 CDC Maxim AGL Jessore, Bangladesh 86.7
## 2 ILL 618 AGL Jessore, Bangladesh 79.3
## 3 Laird AGL Jessore, Bangladesh 76.8
## 4 CDC Maxim AGL Metaponto, Italy 134.
## 5 ILL 618 AGL Metaponto, Italy 138.
## 6 Laird AGL Metaponto, Italy 137.
## 7 CDC Maxim AGL Saskatoon, Canada 52.5
## 8 ILL 618 AGL Saskatoon, Canada 47
## 9 Laird AGL Saskatoon, Canada 56.8
# Long -> wide again, this time with one column per Name
yy <- yy %>% spread(key = Name, value = Value)
yy
## # A tibble: 3 × 4
## TraitName `CDC Maxim AGL` `ILL 618 AGL` `Laird AGL`
## <chr> <dbl> <dbl> <dbl>
## 1 Jessore, Bangladesh 86.7 79.3 76.8
## 2 Metaponto, Italy 134. 138. 137.
## 3 Saskatoon, Canada 52.5 47 56.8
Base Plotting
We will start with some basic plotting using the base function plot()
# A basic scatter plot
plot(x = xd$x8, y = xd$x9)
# Adjust color and shape of the points
plot(x = xd$x8, y = xd$x9, col = "darkred", pch = 0)
plot(x = xd$x8, y = xd$x9, col = xd$x4, pch = xd$x4)
# Adjust plot type: `type` takes a one-character code, "l" = lines
plot(x = xd$x8, y = xd$x9, type = "l")
# Adjust linetype
plot(x = xd$x8, y = xd$x9, type = "l", lty = 2)
# Plot lines and points: "b" = both
plot(x = xd$x8, y = xd$x9, type = "b")
Now let's create some random and normally distributed data to make some more complicated plots
# 100 random uniformly distributed numbers ranging from 0 - 100
ru <- runif(100, min = 0, max = 100) # values differ on every run
ru
## [1] 36.713370 12.708927 64.593882 60.016758 6.018877 22.514402 14.497875 80.617892 30.089694 29.445550 69.021319 94.508453 99.166544
## [14] 99.718593 31.955360 84.382868 78.771562 76.975368 8.563842 89.050655 44.322733 43.907508 47.126816 6.773618 2.366291 38.850588
## [27] 21.292387 54.753323 26.024200 40.746421 6.273575 12.405929 90.074422 20.024734 66.412224 78.574282 69.218055 25.282017 92.036508
## [40] 28.835408 52.141443 74.808394 27.674051 25.454106 20.340752 30.569162 9.996933 34.195606 26.219619 21.641569 46.115057 59.185439
## [53] 1.687442 77.523809 15.539648 47.961384 84.037762 80.855153 12.256730 59.690272 4.600601 90.210478 10.655451 94.972127 23.624386
## [66] 60.882676 98.253052 82.754852 60.145701 67.211364 63.927229 74.540320 5.001009 5.839298 34.838936 99.802504 62.968561 83.655646
## [79] 60.450237 16.712156 66.580458 26.312234 98.031754 17.886389 50.148150 8.315372 91.181195 94.594669 36.671397 41.898868 7.594261
## [92] 38.975553 90.224314 11.174149 20.461812 86.055434 47.274666 86.911898 46.977524 37.380648
plot(x = ru)
order(ru)
## [1] 53 25 61 73 74 5 31 24 91 86 19 47 63 94 59 32 2 7 55 80 84 34 45 95 27 50 6 65 38 44 29 49
## [33] 82 43 40 10 9 46 15 48 75 89 1 100 26 92 30 90 22 21 51 99 23 97 56 85 41 28 52 60 4 69 79 66
## [65] 77 71 3 35 81 70 11 37 72 42 18 54 36 17 8 58 68 78 57 16 96 98 20 33 62 93 87 39 12 88 64 83
## [97] 67 13 14 76
# order() returns the indices that sort ru; indexing with them sorts the vector
ru <- ru[order(ru)]
ru
## [1] 1.687442 2.366291 4.600601 5.001009 5.839298 6.018877 6.273575 6.773618 7.594261 8.315372 8.563842 9.996933 10.655451
## [14] 11.174149 12.256730 12.405929 12.708927 14.497875 15.539648 16.712156 17.886389 20.024734 20.340752 20.461812 21.292387 21.641569
## [27] 22.514402 23.624386 25.282017 25.454106 26.024200 26.219619 26.312234 27.674051 28.835408 29.445550 30.089694 30.569162 31.955360
## [40] 34.195606 34.838936 36.671397 36.713370 37.380648 38.850588 38.975553 40.746421 41.898868 43.907508 44.322733 46.115057 46.977524
## [53] 47.126816 47.274666 47.961384 50.148150 52.141443 54.753323 59.185439 59.690272 60.016758 60.145701 60.450237 60.882676 62.968561
## [66] 63.927229 64.593882 66.412224 66.580458 67.211364 69.021319 69.218055 74.540320 74.808394 76.975368 77.523809 78.574282 78.771562
## [79] 80.617892 80.855153 82.754852 83.655646 84.037762 84.382868 86.055434 86.911898 89.050655 90.074422 90.210478 90.224314 91.181195
## [92] 92.036508 94.508453 94.594669 94.972127 98.031754 98.253052 99.166544 99.718593 99.802504
plot(x = ru)
# 100 normally distributed numbers with a mean of 50 and sd of 10
nd <- rnorm(100, mean = 50, sd = 10) # values differ on every run
nd
## [1] 44.91661 63.67980 57.50962 51.83689 49.25579 59.71923 32.27452 74.30934 46.98457 58.74374 47.92591 43.07540 47.31120 51.73675
## [15] 48.86188 46.98587 21.56761 58.86579 41.03213 51.47155 42.92441 48.08652 35.86348 47.11022 45.11191 47.14221 72.79779 48.74447
## [29] 34.86390 50.81252 53.89033 58.79210 50.25232 32.12140 57.68880 41.17402 39.36577 53.45131 57.25047 42.95447 53.82031 44.03344
## [43] 49.99686 48.66700 49.91018 47.46088 58.73114 41.44890 59.52578 49.05605 48.27219 54.24526 52.44277 49.61997 69.05433 62.92324
## [57] 36.08792 41.68678 48.46883 42.87258 40.43297 38.95474 49.58654 63.82488 47.80932 52.46844 40.29214 48.79846 57.69669 49.46121
## [71] 51.99955 45.34316 43.00138 47.56617 52.19814 47.57256 55.95786 44.38301 40.17620 41.70377 49.28846 49.50620 39.26434 69.55933
## [85] 52.03961 53.93256 57.82432 58.23487 30.02380 61.37119 63.68746 79.19164 57.48677 57.30516 53.05346 60.22641 50.07753 50.00611
## [99] 56.43118 41.15529
# Sort nd in increasing order using the index vector from order()
nd <- nd[order(nd)]
nd
## [1] 21.56761 30.02380 32.12140 32.27452 34.86390 35.86348 36.08792 38.95474 39.26434 39.36577 40.17620 40.29214 40.43297 41.03213
## [15] 41.15529 41.17402 41.44890 41.68678 41.70377 42.87258 42.92441 42.95447 43.00138 43.07540 44.03344 44.38301 44.91661 45.11191
## [29] 45.34316 46.98457 46.98587 47.11022 47.14221 47.31120 47.46088 47.56617 47.57256 47.80932 47.92591 48.08652 48.27219 48.46883
## [43] 48.66700 48.74447 48.79846 48.86188 49.05605 49.25579 49.28846 49.46121 49.50620 49.58654 49.61997 49.91018 49.99686 50.00611
## [57] 50.07753 50.25232 50.81252 51.47155 51.73675 51.83689 51.99955 52.03961 52.19814 52.44277 52.46844 53.05346 53.45131 53.82031
## [71] 53.89033 53.93256 54.24526 55.95786 56.43118 57.25047 57.30516 57.48677 57.50962 57.68880 57.69669 57.82432 58.23487 58.73114
## [85] 58.74374 58.79210 58.86579 59.52578 59.71923 60.22641 61.37119 62.92324 63.67980 63.68746 63.82488 69.05433 69.55933 72.79779
## [99] 74.30934 79.19164
plot(x = nd)
hist(x = nd)
hist(nd, breaks = 20, col = "darkgreen")
plot(x = density(nd))
boxplot(x = nd)
boxplot(x = nd, horizontal = TRUE) # same box plot, drawn horizontally
ggplot2
Let's be honest, the base plots are ugly! The ggplot2 package gives the user the ability to create better, more visually appealing plots. Additional packages such as ggbeeswarm and ggrepel also contain useful functions that add to the functionality of ggplot2.
- ggplot2 - https://ggplot2.tidyverse.org/
- Tutorial 1 - http://r-statistics.co/ggplot2-Tutorial-With-R.html
- Tutorial 2 - https://www.statsandr.com/blog/graphics-in-r-with-ggplot2/
- The R Graph Gallery - https://www.r-graph-gallery.com/ggplot2-package.html
library(ggplot2)
# Base plot object: data and axis mappings only; layers are added with `+`
mp <- ggplot(xd, aes(x = x8, y = x9))
mp + geom_point()
mp + geom_point(aes(color = x3, shape = x3), size = 4)
mp + geom_line(size = 2)
mp + geom_line(aes(color = x3), size = 2)
mp + geom_smooth(method = "loess")
mp + geom_smooth(method = "lm")
# Two groups of 50 normally distributed values; only the first row of each
# group carries a text label (the rest are NA).
xx <- data.frame(data = c(rnorm(50, mean = 40, sd = 10),
                          rnorm(50, mean = 60, sd = 5)),
                 group = factor(rep(1:2, each = 50)),
                 label = c("Label1", rep(NA, 49), "Label2", rep(NA, 49)))
# Distribution plots: histograms and densities filled by group
mp <- ggplot(xx, aes(x = data, fill = group))
mp + geom_histogram(color = "black")
mp + geom_histogram(color = "black", position = "dodge")
mp1 <- mp + geom_histogram(color = "black") + facet_grid(group ~ .)
mp1
mp + geom_density(alpha = 0.5)
# Group comparison plots: group on the x axis, values on the y axis
mp <- ggplot(xx, aes(x = group, y = data, fill = group))
mp + geom_boxplot(color = "black")
mp + geom_boxplot() + geom_point()
mp + geom_violin() + geom_boxplot(width = 0.1, fill = "white")
library(ggbeeswarm)
mp + geom_quasirandom()
mp + geom_quasirandom(aes(shape = group))
mp2 <- mp + geom_violin() +
  geom_boxplot(width = 0.1, fill = "white") +
  geom_beeswarm(alpha = 0.5)
library(ggrepel)
mp2 + geom_text_repel(aes(label = label), nudge_x = 0.4)
library(ggpubr)
# Place the two saved plots side by side with one shared legend
ggarrange(mp1, mp2, ncol = 2, widths = c(2, 1),
          common.legend = TRUE, legend = "bottom")
Statistics
- Handbook of Biological Statistics - http://biostathandbook.com/
- R Companion for ^ - https://rcompanion.org/rcompanion/a_02.html
# Prep data
# Factor levels in the order they should appear in tables and plots
lev_Loc <- c("Saskatoon, Canada", "Jessore, Bangladesh", "Metaponto, Italy")
lev_Name <- c("ILL 618 AGL", "CDC Maxim AGL", "Laird AGL")
dd <- read_xlsx("data_r_tutorial.xlsx", sheet = "Data") %>%
  mutate(Location = factor(Location, levels = lev_Loc),
         Name = factor(Name, levels = lev_Name))
# Mean DTF for every Name x Location combination
xx <- dd %>%
  group_by(Name, Location) %>%
  summarise(Mean_DTF = mean(DTF))
xx %>% spread(Location, Mean_DTF)
## # A tibble: 3 × 4
## # Groups: Name [3]
## Name `Saskatoon, Canada` `Jessore, Bangladesh` `Metaponto, Italy`
## <fct> <dbl> <dbl> <dbl>
## 1 ILL 618 AGL 47 79.3 138.
## 2 CDC Maxim AGL 52.5 86.7 134.
## 3 Laird AGL 56.8 76.8 137.
# Plot
# mp1: every observation, with points dodged sideways by Name
mp1 <- ggplot(dd, aes(x = Location, y = DTF, color = Name, shape = Name)) +
  geom_point(size = 2, alpha = 0.7, position = position_dodge(width = 0.5))
# mp2: the per-Name means, joined by a line across locations
mp2 <- ggplot(xx, aes(x = Location, y = Mean_DTF,
                      color = Name, group = Name, shape = Name)) +
  geom_point(size = 2.5, alpha = 0.7) +
  geom_line(size = 1, alpha = 0.7) +
  theme(legend.position = "top")
ggarrange(mp1, mp2, ncol = 2, common.legend = TRUE, legend = "top")
At first glance, it is clear there are differences between genotypes and between locations, as well as genotype x environment (GxE) interactions. Now let's do a few statistical tests.
summary(aov(DTF ~ Name * Location, data = dd))
## Df Sum Sq Mean Sq F value Pr(>F)
## Name 2 88 44 3.476 0.0395 *
## Location 2 65863 32931 2598.336 < 2e-16 ***
## Name:Location 4 560 140 11.044 2.52e-06 ***
## Residuals 45 570 13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
As expected, an ANOVA shows statistical significance for genotype (p-value = 0.0395), location (p-value < 2e-16) and GxE interactions (p-value = 2.52e-06). However, all this tells us is that at least one genotype is different from the rest, at least one location is different from the others, and that there are GxE interactions. If we want to be more specific, we would need to do some multiple comparison tests.
If we only have two things to compare, we could do a t-test.
# Keep two locations and spread them into one DTF column each, so the two
# columns can be handed to t.test() as x and y.
xx <- dd %>%
  filter(Location %in% c("Saskatoon, Canada", "Jessore, Bangladesh")) %>%
  spread(Location, DTF)
t.test(x = xx$`Saskatoon, Canada`, y = xx$`Jessore, Bangladesh`)
##
## Welch Two Sample t-test
##
## data: xx$`Saskatoon, Canada` and xx$`Jessore, Bangladesh`
## t = -17.521, df = 32.701, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -32.18265 -25.48402
## sample estimates:
## mean of x mean of y
## 52.11111 80.94444
DTF in Saskatoon, Canada is significantly different (p-value < 2.2e-16) from DTF in Jessore, Bangladesh.
# Within one location, compare two genotypes: one DTF column per Name
xx <- dd %>%
  filter(Name %in% c("ILL 618 AGL", "Laird AGL"),
         Location == "Metaponto, Italy") %>%
  spread(Name, DTF)
t.test(x = xx$`ILL 618 AGL`, y = xx$`Laird AGL`)
##
## Welch Two Sample t-test
##
## data: xx$`ILL 618 AGL` and xx$`Laird AGL`
## t = 0.38008, df = 8.0564, p-value = 0.7137
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.059739 7.059739
## sample estimates:
## mean of x mean of y
## 137.8333 136.8333
DTF of ILL 618 AGL and Laird AGL is not significantly different (p-value = 0.7137) in Metaponto, Italy.
pch Plot
# Reference chart of the 26 plotting symbols (pch 0 to 25), laid out on a
# grid 6 columns wide; length.out = 26 trims the grid to the number of symbols.
xx <- data.frame(x = rep(1:6, times = 5, length.out = 26),
                 y = rep(5:1, each = 6, length.out = 26),
                 pch = 0:25)
mp <- ggplot(xx, aes(x = x, y = y, shape = as.factor(pch))) +
  geom_point(color = "darkred", fill = "darkblue", size = 5) +
  geom_text(aes(label = pch), nudge_x = -0.25) + # symbol number to the left
  scale_shape_manual(values = xx$pch) + # draw each row with its own pch
  scale_x_continuous(breaks = 6:1) +
  scale_y_continuous(breaks = 6:1) +
  theme_void() +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5),
        axis.text = element_blank(),
        axis.ticks = element_blank()) +
  labs(title = "Plot symbols in R (pch)",
       subtitle = "color = \"darkred\", fill = \"darkblue\"",
       x = NULL, y = NULL)
ggsave("pch.png", mp, width = 4.5, height = 3, bg = "white")
R Markdown
Tutorials on how to create an R markdown document like this one can be found here: