dblogr/

R Tutorial

An introduction to R


Introduction

This tutorial is will introduce the reader to , a free, open-source statistical computing environment often used with RStudio, a integrated development environment for .

R Project Logo
R Project Logo

Calculator

can be used as a super awesome calculator

# 5 + 3 = 8
5 + 3 
## [1] 8
# 24 / (1 + 2) = 8
24 / (1 + 2) 
## [1] 8
# 2 * 2 * 2 = 8
2^3 
## [1] 8
# 8 * 8 = 64
sqrt(64) 
## [1] 8
# -log10(0.05 / 5000000) = 8
-log10(0.05 / 5000000) 
## [1] 8

Functions

has many useful built in functions

1:10
##  [1]  1  2  3  4  5  6  7  8  9 10
as.character(1:10)
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"
rep(1:2, times = 5)
##  [1] 1 2 1 2 1 2 1 2 1 2
rep(1:5, times = 2)
##  [1] 1 2 3 4 5 1 2 3 4 5
rep(1:5, each = 2)
##  [1] 1 1 2 2 3 3 4 4 5 5
rep(1:5, length.out = 7)
## [1] 1 2 3 4 5 1 2
seq(5, 50, by = 5)
##  [1]  5 10 15 20 25 30 35 40 45 50
seq(5, 50, length.out = 5)
## [1]  5.00 16.25 27.50 38.75 50.00
paste(1:10, 20:30, sep = "-")
##  [1] "1-20"  "2-21"  "3-22"  "4-23"  "5-24"  "6-25"  "7-26"  "8-27"  "9-28"  "10-29" "1-30"
paste(1:10, collapse = "-")
## [1] "1-2-3-4-5-6-7-8-9-10"
paste0("x", 1:10)
##  [1] "x1"  "x2"  "x3"  "x4"  "x5"  "x6"  "x7"  "x8"  "x9"  "x10"
min(1:10)
## [1] 1
max(1:10)
## [1] 10
range(1:10)
## [1]  1 10
mean(1:10)
## [1] 5.5
sd(1:10)
## [1] 3.02765

Custom Functions

Users can also create their own functions

customFunction1 <- function(x, y) {
  z <- 100 * x / (x + y)
  paste(z, "%")
}
customFunction1(x = 10, y = 90)
## [1] "10 %"
customFunction2 <- function(x) {
  mymin <- mean(x - sd(x))
  mymax <- mean(x) + sd(x)
  print(paste("Min =", mymin))
  print(paste("Max =", mymax))
}
customFunction2(x = 1:10)
## [1] "Min = 2.47234964590251"
## [1] "Max = 8.52765035409749"

for loops and if else statements

xx <- NULL #creates and empty object
for(i in 1:10) {
  xx[i] <- i*3
}
xx
##  [1]  3  6  9 12 15 18 21 24 27 30
xx %% 2 #gives the remainder when divided by 2
##  [1] 1 0 1 0 1 0 1 0 1 0
for(i in 1:length(xx)) {
  if((xx[i] %% 2) == 0) {
    print(paste(xx[i],"is Even"))
  } else { 
      print(paste(xx[i],"is Odd")) 
    }
}
## [1] "3 is Odd"
## [1] "6 is Even"
## [1] "9 is Odd"
## [1] "12 is Even"
## [1] "15 is Odd"
## [1] "18 is Even"
## [1] "21 is Odd"
## [1] "24 is Even"
## [1] "27 is Odd"
## [1] "30 is Even"
# or
ifelse(xx %% 2 == 0, "Even", "Odd")
##  [1] "Odd"  "Even" "Odd"  "Even" "Odd"  "Even" "Odd"  "Even" "Odd"  "Even"
paste(xx, ifelse(xx %% 2 == 0, "is Even", "is Odd"))
##  [1] "3 is Odd"   "6 is Even"  "9 is Odd"   "12 is Even" "15 is Odd"  "18 is Even" "21 is Odd"  "24 is Even" "27 is Odd" 
## [10] "30 is Even"

Objects

Information can be stored in user defined objects, in multiple forms:

  • c(): a string of values
  • matrix(): a two dimensional matrix in one format
  • data.frame(): a two dimensional matrix where each column can be a different format
  • list():

A string…

xc <- 1:10
xc
##  [1]  1  2  3  4  5  6  7  8  9 10
xc <- c(1,2,3,4,5,6,7,8,9,10)
xc
##  [1]  1  2  3  4  5  6  7  8  9 10

A matrix…

xm <- matrix(1:100, nrow = 10, ncol = 10, byrow = T)
xm
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]    1    2    3    4    5    6    7    8    9    10
##  [2,]   11   12   13   14   15   16   17   18   19    20
##  [3,]   21   22   23   24   25   26   27   28   29    30
##  [4,]   31   32   33   34   35   36   37   38   39    40
##  [5,]   41   42   43   44   45   46   47   48   49    50
##  [6,]   51   52   53   54   55   56   57   58   59    60
##  [7,]   61   62   63   64   65   66   67   68   69    70
##  [8,]   71   72   73   74   75   76   77   78   79    80
##  [9,]   81   82   83   84   85   86   87   88   89    90
## [10,]   91   92   93   94   95   96   97   98   99   100
xm <- matrix(1:100, nrow = 10, ncol = 10, byrow = F)
xm
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]    1   11   21   31   41   51   61   71   81    91
##  [2,]    2   12   22   32   42   52   62   72   82    92
##  [3,]    3   13   23   33   43   53   63   73   83    93
##  [4,]    4   14   24   34   44   54   64   74   84    94
##  [5,]    5   15   25   35   45   55   65   75   85    95
##  [6,]    6   16   26   36   46   56   66   76   86    96
##  [7,]    7   17   27   37   47   57   67   77   87    97
##  [8,]    8   18   28   38   48   58   68   78   88    98
##  [9,]    9   19   29   39   49   59   69   79   89    99
## [10,]   10   20   30   40   50   60   70   80   90   100

A data frame…

xd <- data.frame(
  x1 = c("aa","bb","cc","dd","ee",
         "ff","gg","hh","ii","jj"),
  x2 = 1:10,
  x3 = c(1,1,1,1,1,2,2,2,3,3),
  x4 = rep(c(1,2), times = 5),
  x5 = rep(1:5, times = 2),
  x6 = rep(1:5, each = 2),
  x7 = seq(5, 50, by = 5),
  x8 = log10(1:10),
  x9 = (1:10)^3,
  x10 = c(T,T,T,F,F,T,T,F,F,F)
)
xd
##    x1 x2 x3 x4 x5 x6 x7        x8   x9   x10
## 1  aa  1  1  1  1  1  5 0.0000000    1  TRUE
## 2  bb  2  1  2  2  1 10 0.3010300    8  TRUE
## 3  cc  3  1  1  3  2 15 0.4771213   27  TRUE
## 4  dd  4  1  2  4  2 20 0.6020600   64 FALSE
## 5  ee  5  1  1  5  3 25 0.6989700  125 FALSE
## 6  ff  6  2  2  1  3 30 0.7781513  216  TRUE
## 7  gg  7  2  1  2  4 35 0.8450980  343  TRUE
## 8  hh  8  2  2  3  4 40 0.9030900  512 FALSE
## 9  ii  9  3  1  4  5 45 0.9542425  729 FALSE
## 10 jj 10  3  2  5  5 50 1.0000000 1000 FALSE

A list…

xl <- list(xc, xm, xd)
xl[[1]]
##  [1]  1  2  3  4  5  6  7  8  9 10
xl[[2]]
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]    1   11   21   31   41   51   61   71   81    91
##  [2,]    2   12   22   32   42   52   62   72   82    92
##  [3,]    3   13   23   33   43   53   63   73   83    93
##  [4,]    4   14   24   34   44   54   64   74   84    94
##  [5,]    5   15   25   35   45   55   65   75   85    95
##  [6,]    6   16   26   36   46   56   66   76   86    96
##  [7,]    7   17   27   37   47   57   67   77   87    97
##  [8,]    8   18   28   38   48   58   68   78   88    98
##  [9,]    9   19   29   39   49   59   69   79   89    99
## [10,]   10   20   30   40   50   60   70   80   90   100
xl[[3]]
##    x1 x2 x3 x4 x5 x6 x7        x8   x9   x10
## 1  aa  1  1  1  1  1  5 0.0000000    1  TRUE
## 2  bb  2  1  2  2  1 10 0.3010300    8  TRUE
## 3  cc  3  1  1  3  2 15 0.4771213   27  TRUE
## 4  dd  4  1  2  4  2 20 0.6020600   64 FALSE
## 5  ee  5  1  1  5  3 25 0.6989700  125 FALSE
## 6  ff  6  2  2  1  3 30 0.7781513  216  TRUE
## 7  gg  7  2  1  2  4 35 0.8450980  343  TRUE
## 8  hh  8  2  2  3  4 40 0.9030900  512 FALSE
## 9  ii  9  3  1  4  5 45 0.9542425  729 FALSE
## 10 jj 10  3  2  5  5 50 1.0000000 1000 FALSE

Selecting Data

xc[5] # 5th element in xc
## [1] 5
xd$x3[5] # 5th element in col "x3"
## [1] 1
xd[5,"x3"] # row 5, col "x3"
## [1] 1
xd$x3 # all of col "x3"
##  [1] 1 1 1 1 1 2 2 2 3 3
xd[,"x3"] # all rows, col "x3"
##  [1] 1 1 1 1 1 2 2 2 3 3
xd[3,] # row 3, all cols
##   x1 x2 x3 x4 x5 x6 x7        x8 x9  x10
## 3 cc  3  1  1  3  2 15 0.4771213 27 TRUE
xd[c(2,4),c("x4","x5")] # rows 2 & 4, cols "x4" & "x5"
##   x4 x5
## 2  2  2
## 4  2  4
xl[[3]]$x1 # 3rd object in the list, col "x1
##  [1] "aa" "bb" "cc" "dd" "ee" "ff" "gg" "hh" "ii" "jj"

regexpr

xx <- data.frame(Name = c("Item 1 (detail 1)",
                          "Item 20 (detail 20)",
                          "Item 300 (detail 300)"),
                 Item = NA,
                 Detail = NA)
xx$Detail <- substr(xx$Name, regexpr("\\(", xx$Name)+1, regexpr("\\)", xx$Name)-1)
xx$Item <- substr(xx$Name, 1, regexpr("\\(", xx$Name)-2)
xx
##                    Name     Item     Detail
## 1     Item 1 (detail 1)   Item 1   detail 1
## 2   Item 20 (detail 20)  Item 20  detail 20
## 3 Item 300 (detail 300) Item 300 detail 300

Data Formats

Data can also be saved in many formats:

  • numeric
  • integer
  • character
  • factor
  • logical
xd$x3 <- as.character(xd$x3)
xd$x3
##  [1] "1" "1" "1" "1" "1" "2" "2" "2" "3" "3"
xd$x3 <- as.numeric(xd$x3)
xd$x3
##  [1] 1 1 1 1 1 2 2 2 3 3
xd$x3 <- as.factor(xd$x3)
xd$x3
##  [1] 1 1 1 1 1 2 2 2 3 3
## Levels: 1 2 3
xd$x3 <- factor(xd$x3, levels = c("3","2","1"))
xd$x3
##  [1] 1 1 1 1 1 2 2 2 3 3
## Levels: 3 2 1
xd$x10
##  [1]  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE
as.numeric(xd$x10) # TRUE = 1, FALSE = 0
##  [1] 1 1 1 0 0 1 1 0 0 0
sum(xd$x10)
## [1] 5

Internal structure of an object can be checked with str()

str(xc) # c()
##  num [1:10] 1 2 3 4 5 6 7 8 9 10
str(xm) # matrix()
##  int [1:10, 1:10] 1 2 3 4 5 6 7 8 9 10 ...
str(xd) # data.frame()
## 'data.frame':    10 obs. of  10 variables:
##  $ x1 : chr  "aa" "bb" "cc" "dd" ...
##  $ x2 : int  1 2 3 4 5 6 7 8 9 10
##  $ x3 : Factor w/ 3 levels "3","2","1": 3 3 3 3 3 2 2 2 1 1
##  $ x4 : num  1 2 1 2 1 2 1 2 1 2
##  $ x5 : int  1 2 3 4 5 1 2 3 4 5
##  $ x6 : int  1 1 2 2 3 3 4 4 5 5
##  $ x7 : num  5 10 15 20 25 30 35 40 45 50
##  $ x8 : num  0 0.301 0.477 0.602 0.699 ...
##  $ x9 : num  1 8 27 64 125 216 343 512 729 1000
##  $ x10: logi  TRUE TRUE TRUE FALSE FALSE TRUE ...
str(xl) # list()
## List of 3
##  $ : num [1:10] 1 2 3 4 5 6 7 8 9 10
##  $ : int [1:10, 1:10] 1 2 3 4 5 6 7 8 9 10 ...
##  $ :'data.frame':    10 obs. of  10 variables:
##   ..$ x1 : chr [1:10] "aa" "bb" "cc" "dd" ...
##   ..$ x2 : int [1:10] 1 2 3 4 5 6 7 8 9 10
##   ..$ x3 : num [1:10] 1 1 1 1 1 2 2 2 3 3
##   ..$ x4 : num [1:10] 1 2 1 2 1 2 1 2 1 2
##   ..$ x5 : int [1:10] 1 2 3 4 5 1 2 3 4 5
##   ..$ x6 : int [1:10] 1 1 2 2 3 3 4 4 5 5
##   ..$ x7 : num [1:10] 5 10 15 20 25 30 35 40 45 50
##   ..$ x8 : num [1:10] 0 0.301 0.477 0.602 0.699 ...
##   ..$ x9 : num [1:10] 1 8 27 64 125 216 343 512 729 1000
##   ..$ x10: logi [1:10] TRUE TRUE TRUE FALSE FALSE TRUE ...

Packages

Additional libraries can be installed and loaded for use.

install.packages("scales")
library(scales)
xx <- data.frame(Values = 1:10)
xx$Rescaled <- rescale(x = xx$Values, to = c(1,30))
xx
##    Values  Rescaled
## 1       1  1.000000
## 2       2  4.222222
## 3       3  7.444444
## 4       4 10.666667
## 5       5 13.888889
## 6       6 17.111111
## 7       7 20.333333
## 8       8 23.555556
## 9       9 26.777778
## 10     10 30.000000

libraries can also be used without having to load them

scales::rescale(1:10, to = c(1,30))
##  [1]  1.000000  4.222222  7.444444 10.666667 13.888889 17.111111 20.333333 23.555556 26.777778 30.000000

Data Wrangling

R for Data Science - https://r4ds.had.co.nz/

xx <- data.frame(Group = c("X","X","Y","Y","Y","X","X","X","Y","Y"),
                 Data1 = 1:10, 
                 Data2 = seq(10, 100, by = 10))
xx$NewData1 <- xx$Data1 + xx$Data2
xx$NewData2 <- xx$Data1 * 1000
xx
##    Group Data1 Data2 NewData1 NewData2
## 1      X     1    10       11     1000
## 2      X     2    20       22     2000
## 3      Y     3    30       33     3000
## 4      Y     4    40       44     4000
## 5      Y     5    50       55     5000
## 6      X     6    60       66     6000
## 7      X     7    70       77     7000
## 8      X     8    80       88     8000
## 9      Y     9    90       99     9000
## 10     Y    10   100      110    10000
xx$Data1 < 5 # which are less than 5
##  [1]  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
xx[xx$Data1 < 5,]
##   Group Data1 Data2 NewData1 NewData2
## 1     X     1    10       11     1000
## 2     X     2    20       22     2000
## 3     Y     3    30       33     3000
## 4     Y     4    40       44     4000
xx[xx$Group == "X", c("Group","Data2","NewData1")]
##   Group Data2 NewData1
## 1     X    10       11
## 2     X    20       22
## 6     X    60       66
## 7     X    70       77
## 8     X    80       88

Data wrangling with tidyverse and pipes (%>%)

library(tidyverse) # install.packages("tidyverse")
xx <- data.frame(Group = c("X","X","Y","Y","Y","Y","Y","X","X","X")) %>%
  mutate(Data1 = 1:10, 
         Data2 = seq(10, 100, by = 10),
         NewData1 = Data1 + Data2,
         NewData2 = Data1 * 1000)
xx
##    Group Data1 Data2 NewData1 NewData2
## 1      X     1    10       11     1000
## 2      X     2    20       22     2000
## 3      Y     3    30       33     3000
## 4      Y     4    40       44     4000
## 5      Y     5    50       55     5000
## 6      Y     6    60       66     6000
## 7      Y     7    70       77     7000
## 8      X     8    80       88     8000
## 9      X     9    90       99     9000
## 10     X    10   100      110    10000
filter(xx, Data1 < 5)
##   Group Data1 Data2 NewData1 NewData2
## 1     X     1    10       11     1000
## 2     X     2    20       22     2000
## 3     Y     3    30       33     3000
## 4     Y     4    40       44     4000
xx %>% filter(Data1 < 5)
##   Group Data1 Data2 NewData1 NewData2
## 1     X     1    10       11     1000
## 2     X     2    20       22     2000
## 3     Y     3    30       33     3000
## 4     Y     4    40       44     4000
xx %>% filter(Group == "X") %>% 
  select(Group, NewColName=Data2, NewData1)
##   Group NewColName NewData1
## 1     X         10       11
## 2     X         20       22
## 3     X         80       88
## 4     X         90       99
## 5     X        100      110
xs <- xx %>% 
  group_by(Group) %>% 
  summarise(Data2_mean = mean(Data2),
            Data2_sd = sd(Data2),
            NewData2_mean = mean(NewData2),
            NewData2_sd = sd(NewData2))
xs
## # A tibble: 2 × 5
##   Group Data2_mean Data2_sd NewData2_mean NewData2_sd
##   <chr>      <dbl>    <dbl>         <dbl>       <dbl>
## 1 X             60     41.8          6000       4183.
## 2 Y             50     15.8          5000       1581.
xx %>% left_join(xs, by = "Group")
##    Group Data1 Data2 NewData1 NewData2 Data2_mean Data2_sd NewData2_mean NewData2_sd
## 1      X     1    10       11     1000         60 41.83300          6000    4183.300
## 2      X     2    20       22     2000         60 41.83300          6000    4183.300
## 3      Y     3    30       33     3000         50 15.81139          5000    1581.139
## 4      Y     4    40       44     4000         50 15.81139          5000    1581.139
## 5      Y     5    50       55     5000         50 15.81139          5000    1581.139
## 6      Y     6    60       66     6000         50 15.81139          5000    1581.139
## 7      Y     7    70       77     7000         50 15.81139          5000    1581.139
## 8      X     8    80       88     8000         60 41.83300          6000    4183.300
## 9      X     9    90       99     9000         60 41.83300          6000    4183.300
## 10     X    10   100      110    10000         60 41.83300          6000    4183.300

Read/Write data

xx <- read.csv("data_r_tutorial.csv")
write.csv(xx, "data_r_tutorial.csv", row.names = F)

For excel sheets, the package readxl can be used to read in sheets of data.

library(readxl) # install.packages("readxl")
xx <- read_xlsx("data_r_tutorial.xlsx", sheet = "Data")

Tidy Data

yy <- xx %>%
  group_by(Name, Location) %>%
  summarise(Mean_DTF = round(mean(DTF),1)) %>% 
  arrange(Location)
yy
## # A tibble: 9 × 3
## # Groups:   Name [3]
##   Name          Location            Mean_DTF
##   <chr>         <chr>                  <dbl>
## 1 CDC Maxim AGL Jessore, Bangladesh     86.7
## 2 ILL 618 AGL   Jessore, Bangladesh     79.3
## 3 Laird AGL     Jessore, Bangladesh     76.8
## 4 CDC Maxim AGL Metaponto, Italy       134. 
## 5 ILL 618 AGL   Metaponto, Italy       138. 
## 6 Laird AGL     Metaponto, Italy       137. 
## 7 CDC Maxim AGL Saskatoon, Canada       52.5
## 8 ILL 618 AGL   Saskatoon, Canada       47  
## 9 Laird AGL     Saskatoon, Canada       56.8
yy <- yy %>% spread(key = Location, value = Mean_DTF)
yy
## # A tibble: 3 × 4
## # Groups:   Name [3]
##   Name          `Jessore, Bangladesh` `Metaponto, Italy` `Saskatoon, Canada`
##   <chr>                         <dbl>              <dbl>               <dbl>
## 1 CDC Maxim AGL                  86.7               134.                52.5
## 2 ILL 618 AGL                    79.3               138.                47  
## 3 Laird AGL                      76.8               137.                56.8
yy <- yy %>% gather(key = TraitName, value = Value, 2:4)
yy
## # A tibble: 9 × 3
## # Groups:   Name [3]
##   Name          TraitName           Value
##   <chr>         <chr>               <dbl>
## 1 CDC Maxim AGL Jessore, Bangladesh  86.7
## 2 ILL 618 AGL   Jessore, Bangladesh  79.3
## 3 Laird AGL     Jessore, Bangladesh  76.8
## 4 CDC Maxim AGL Metaponto, Italy    134. 
## 5 ILL 618 AGL   Metaponto, Italy    138. 
## 6 Laird AGL     Metaponto, Italy    137. 
## 7 CDC Maxim AGL Saskatoon, Canada    52.5
## 8 ILL 618 AGL   Saskatoon, Canada    47  
## 9 Laird AGL     Saskatoon, Canada    56.8
yy <- yy %>% spread(key = Name, value = Value)
yy
## # A tibble: 3 × 4
##   TraitName           `CDC Maxim AGL` `ILL 618 AGL` `Laird AGL`
##   <chr>                         <dbl>         <dbl>       <dbl>
## 1 Jessore, Bangladesh            86.7          79.3        76.8
## 2 Metaponto, Italy              134.          138.        137. 
## 3 Saskatoon, Canada              52.5          47          56.8

Base Plotting

We will start with some basic plotting using the base function plot()

# A basic scatter plot
plot(x = xd$x8, y = xd$x9)

# Adjust color and shape of the points
plot(x = xd$x8, y = xd$x9, col = "darkred", pch = 0)

plot(x = xd$x8, y = xd$x9, col = xd$x4, pch = xd$x4)

# Adjust plot type 
plot(x = xd$x8, y = xd$x9, type = "line")

# Adjust linetype
plot(x = xd$x8, y = xd$x9, type = "line", lty = 2)

# Plot lines and points
plot(x = xd$x8, y = xd$x9, type = "both")

Now lets create some random and normally distributed data to make some more complicated plots

# 100 random uniformly distributed numbers ranging from 0 - 100
ru <- runif(100, min = 0, max = 100)
ru
##   [1] 30.6611610 37.7581528 44.6369519 92.2754464 30.1273778 83.3922241 31.3836700 20.3099027 78.3595440 26.3225375
##  [11] 94.9645460 31.7579420 92.2288819 28.0058549 32.9899976 63.8690058 11.5976153 49.5974155 21.7748875 96.9399880
##  [21] 33.3648912 74.1665106 29.7167699 28.8683205 72.0192496 82.8169597 62.7604530 71.2424416 51.8882331 59.0100412
##  [31] 31.7534832 76.0787503 37.7000552 21.2366523 83.4972339 20.8601917 72.0566853 93.7418408 85.3899123  4.7170781
##  [41] 89.1204730  4.3780108 93.9815697 23.0411222 44.5718199 85.7918534 25.1936670 77.0035673  2.0012858 44.2895074
##  [51] 20.5014186 94.2575307 12.2359721  8.1484995 67.5579701 95.3943237 91.3148499 26.2643423 99.2416064 11.5258160
##  [61] 38.2328979 47.8353369  0.3093203 50.6883806 33.3141798  8.2922413 76.6554664 31.7425363  9.4769845 92.5003056
##  [71] 54.1254316 46.3039483 65.6800678 82.5013874 36.2226413 10.8434889 72.7475125 95.1926798 62.9788998 69.1037814
##  [81] 72.2492234  7.2369264 59.2928890 93.9160067 87.9955525  0.9145673 62.5705160 37.1808984 53.6025011 41.0382366
##  [91] 49.9874846 40.7637917 77.0942763 89.7443141 56.0984569 93.7367888 40.9559245 39.1081787 85.2435287 84.5483092
plot(x = ru)

order(ru)
##   [1]  63  86  49  42  40  82  54  66  69  76  60  17  53   8  51  36  34  19  44  47  58  10  14  24  23   5   1   7  68
##  [30]  31  12  15  65  21  75  88  33   2  61  98  92  97  90  50  45   3  72  62  18  91  64  29  89  71  95  30  83  87
##  [59]  27  79  16  73  55  80  28  25  37  81  77  22  32  67  48  93   9  74  26   6  35 100  99  39  46  85  41  94  57
##  [88]  13   4  70  96  38  84  43  52  11  78  56  20  59
ru<- ru[order(ru)]
ru
##   [1]  0.3093203  0.9145673  2.0012858  4.3780108  4.7170781  7.2369264  8.1484995  8.2922413  9.4769845 10.8434889
##  [11] 11.5258160 11.5976153 12.2359721 20.3099027 20.5014186 20.8601917 21.2366523 21.7748875 23.0411222 25.1936670
##  [21] 26.2643423 26.3225375 28.0058549 28.8683205 29.7167699 30.1273778 30.6611610 31.3836700 31.7425363 31.7534832
##  [31] 31.7579420 32.9899976 33.3141798 33.3648912 36.2226413 37.1808984 37.7000552 37.7581528 38.2328979 39.1081787
##  [41] 40.7637917 40.9559245 41.0382366 44.2895074 44.5718199 44.6369519 46.3039483 47.8353369 49.5974155 49.9874846
##  [51] 50.6883806 51.8882331 53.6025011 54.1254316 56.0984569 59.0100412 59.2928890 62.5705160 62.7604530 62.9788998
##  [61] 63.8690058 65.6800678 67.5579701 69.1037814 71.2424416 72.0192496 72.0566853 72.2492234 72.7475125 74.1665106
##  [71] 76.0787503 76.6554664 77.0035673 77.0942763 78.3595440 82.5013874 82.8169597 83.3922241 83.4972339 84.5483092
##  [81] 85.2435287 85.3899123 85.7918534 87.9955525 89.1204730 89.7443141 91.3148499 92.2288819 92.2754464 92.5003056
##  [91] 93.7367888 93.7418408 93.9160067 93.9815697 94.2575307 94.9645460 95.1926798 95.3943237 96.9399880 99.2416064
plot(x = ru)

# 100 normally distributed numbers with a mean of 50 and sd of 10
nd <- rnorm(100, mean = 50, sd = 10)
nd
##   [1] 54.45525 49.03749 58.21386 44.97842 41.59408 45.71510 37.70555 49.26733 50.15641 54.05202 64.89148 70.97971
##  [13] 56.59788 48.55293 43.30324 48.19839 55.54799 52.95595 61.44560 56.07902 52.19002 54.71211 43.03501 40.71543
##  [25] 28.59788 43.27259 32.12516 43.28700 39.12524 50.22703 34.09889 53.60533 57.85443 51.12432 44.52551 44.64574
##  [37] 50.55251 36.23684 62.75045 61.24054 68.41510 45.24892 58.42543 39.22376 41.05827 42.17714 38.31808 53.42938
##  [49] 49.55842 58.73995 39.76204 62.64747 27.54293 69.31948 58.21910 34.19830 45.22325 49.08144 44.60484 37.74953
##  [61] 57.70110 39.79260 56.36047 51.05901 68.88025 39.92626 60.89881 47.35415 51.22348 59.81139 62.90206 42.48141
##  [73] 32.60398 55.64562 65.57340 55.96418 56.21406 53.37021 52.29871 56.65959 45.43434 52.07080 23.10998 47.87532
##  [85] 58.55255 50.05128 50.82405 40.89086 59.29257 50.38877 66.40796 65.32602 42.76152 57.08165 49.90578 40.01788
##  [97] 29.86540 43.61912 57.28409 53.98526
nd <- nd[order(nd)]
nd
##   [1] 23.10998 27.54293 28.59788 29.86540 32.12516 32.60398 34.09889 34.19830 36.23684 37.70555 37.74953 38.31808
##  [13] 39.12524 39.22376 39.76204 39.79260 39.92626 40.01788 40.71543 40.89086 41.05827 41.59408 42.17714 42.48141
##  [25] 42.76152 43.03501 43.27259 43.28700 43.30324 43.61912 44.52551 44.60484 44.64574 44.97842 45.22325 45.24892
##  [37] 45.43434 45.71510 47.35415 47.87532 48.19839 48.55293 49.03749 49.08144 49.26733 49.55842 49.90578 50.05128
##  [49] 50.15641 50.22703 50.38877 50.55251 50.82405 51.05901 51.12432 51.22348 52.07080 52.19002 52.29871 52.95595
##  [61] 53.37021 53.42938 53.60533 53.98526 54.05202 54.45525 54.71211 55.54799 55.64562 55.96418 56.07902 56.21406
##  [73] 56.36047 56.59788 56.65959 57.08165 57.28409 57.70110 57.85443 58.21386 58.21910 58.42543 58.55255 58.73995
##  [85] 59.29257 59.81139 60.89881 61.24054 61.44560 62.64747 62.75045 62.90206 64.89148 65.32602 65.57340 66.40796
##  [97] 68.41510 68.88025 69.31948 70.97971
plot(x = nd)

hist(x = nd)

hist(nd, breaks = 20, col = "darkgreen")

plot(x = density(nd))

boxplot(x = nd)

boxplot(x = nd, horizontal = T)


ggplot2

Lets be honest, the base plots are ugly! The ggplot2 package gives the user to create a better, more visually appealing plots. Additional packages such as ggbeeswarm and ggrepel also contain useful functions to add to the functionality of ggplot2.

library(ggplot2)
mp <- ggplot(xd, aes(x = x8, y = x9))
mp + geom_point()

mp + geom_point(aes(color = x3, shape = x3), size = 4)

mp + geom_line(size = 2)

mp + geom_line(aes(color = x3), size = 2)

mp + geom_smooth(method = "loess")

mp + geom_smooth(method = "lm")

xx <- data.frame(data = c(rnorm(50, mean = 40, sd = 10),
                          rnorm(50, mean = 60, sd = 5)),
                 group = factor(rep(1:2, each = 50)),
                 label = c("Label1", rep(NA, 49), "Label2", rep(NA, 49)))
mp <- ggplot(xx, aes(x = data, fill = group))
mp + geom_histogram(color = "black")

mp + geom_histogram(color = "black", position = "dodge")

mp1 <- mp + geom_histogram(color = "black") + facet_grid(group~.)
mp1

mp + geom_density(alpha = 0.5)

mp <- ggplot(xx, aes(x = group, y = data, fill = group))
mp + geom_boxplot(color = "black")

mp + geom_boxplot() + geom_point()

mp + geom_violin() + geom_boxplot(width = 0.1, fill = "white")

library(ggbeeswarm)
mp + geom_quasirandom()

mp + geom_quasirandom(aes(shape = group))

mp2 <- mp + geom_violin() + 
  geom_boxplot(width = 0.1, fill = "white") +
  geom_beeswarm(alpha = 0.5)
library(ggrepel)
mp2 + geom_text_repel(aes(label = label), nudge_x = 0.4)

library(ggpubr)
ggarrange(mp1, mp2, ncol = 2, widths = c(2,1),
          common.legend = T, legend = "bottom")


Statistics

# Prep data
lev_Loc  <- c("Saskatoon, Canada", "Jessore, Bangladesh", "Metaponto, Italy")
lev_Name <- c("ILL 618 AGL", "CDC Maxim AGL", "Laird AGL")
dd <- read_xlsx("data_r_tutorial.xlsx", sheet = "Data") %>%
  mutate(Location = factor(Location, levels = lev_Loc),
         Name = factor(Name, levels = lev_Name))
xx <- dd %>%
  group_by(Name, Location) %>%
  summarise(Mean_DTF = mean(DTF))
xx %>% spread(Location, Mean_DTF)
## # A tibble: 3 × 4
## # Groups:   Name [3]
##   Name          `Saskatoon, Canada` `Jessore, Bangladesh` `Metaponto, Italy`
##   <fct>                       <dbl>                 <dbl>              <dbl>
## 1 ILL 618 AGL                  47                    79.3               138.
## 2 CDC Maxim AGL                52.5                  86.7               134.
## 3 Laird AGL                    56.8                  76.8               137.
# Plot
mp1 <- ggplot(dd, aes(x = Location, y = DTF, color = Name, shape = Name)) +
  geom_point(size = 2, alpha = 0.7, position = position_dodge(width=0.5))
mp2 <- ggplot(xx, aes(x = Location, y = Mean_DTF, 
                      color = Name, group = Name, shape = Name)) +
  geom_point(size = 2.5, alpha = 0.7) + 
  geom_line(size = 1, alpha = 0.7) +
  theme(legend.position = "top")
ggarrange(mp1, mp2, ncol = 2, common.legend = T, legend = "top")

From first glace, it is clear there are differences between genotypes, locations, and genotype x environment (GxE) interactions. Now let’s do a few statistical tests.

summary(aov(DTF ~ Name * Location, data = dd))
##               Df Sum Sq Mean Sq  F value   Pr(>F)    
## Name           2     88      44    3.476   0.0395 *  
## Location       2  65863   32931 2598.336  < 2e-16 ***
## Name:Location  4    560     140   11.044 2.52e-06 ***
## Residuals     45    570      13                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

As expected, an ANOVA shows statistical significance for genotype (p-value = 0.0395), Location (p-value < 2e-16) and GxE interactions (p-value < 2.52e-06). However, all this tells us is that one genotype is different from the rest, one location is different from the others and that there is GxE interactions. If we want to be more specific, would need to do some multiple comparison tests.

If we only have two things to compare, we could do a t-test.

xx <- dd %>% 
  filter(Location %in% c("Saskatoon, Canada", "Jessore, Bangladesh")) %>%
  spread(Location, DTF)
t.test(x = xx$`Saskatoon, Canada`, y = xx$`Jessore, Bangladesh`)
## 
##  Welch Two Sample t-test
## 
## data:  xx$`Saskatoon, Canada` and xx$`Jessore, Bangladesh`
## t = -17.521, df = 32.701, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -32.18265 -25.48402
## sample estimates:
## mean of x mean of y 
##  52.11111  80.94444

DTF in Saskatoon, Canada is significantly different (p-value < 2.2e-16) from DTF in Jessore, Bangladesh.

xx <- dd %>% 
  filter(Name %in% c("ILL 618 AGL", "Laird AGL"),
         Location == "Metaponto, Italy") %>%
  spread(Name, DTF)
t.test(x = xx$`ILL 618 AGL`, y = xx$`Laird AGL`)
## 
##  Welch Two Sample t-test
## 
## data:  xx$`ILL 618 AGL` and xx$`Laird AGL`
## t = 0.38008, df = 8.0564, p-value = 0.7137
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -5.059739  7.059739
## sample estimates:
## mean of x mean of y 
##  137.8333  136.8333

DTF between ILL 618 AGL and Laird AGL are not significantly different (p-value = 0.7137) in Metaponto, Italy.


pch Plot

xx <- data.frame(x = rep(1:6, times = 5, length.out = 26),
                 y = rep(5:1, each = 6, length.out = 26),
                 pch = 0:25)
mp <- ggplot(xx, aes(x = x, y = y, shape = as.factor(pch))) +
  geom_point(color = "darkred", fill = "darkblue", size = 5) +
  geom_text(aes(label = pch), nudge_x = -0.25) +
  scale_shape_manual(values = xx$pch) +
  scale_x_continuous(breaks = 6:1) +
  scale_y_continuous(breaks = 6:1) +
  theme_void() +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5),
        axis.text = element_blank(),
        axis.ticks = element_blank()) +
  labs(title = "Plot symbols in R (pch)",
       subtitle = "color = \"darkred\", fill = \"darkblue\"",
       x = NULL, y = NULL)
ggsave("pch.png", mp, width = 4.5, height = 3, bg = "white")


R Markdown

Tutorials on how to create an R markdown document like this one can be found here:



dblogr/


© Derek Michael Wright