dblogr.com/

R Tutorial

An introduction to R


Introduction

This tutorial is will introduce the reader to , a free, open-source statistical computing environment often used with RStudio, a integrated development environment for .

R Project Logo
R Project Logo

Calculator

can be used as a super awesome calculator

# 5 + 3 = 8
5 + 3 
## [1] 8
# 24 / (1 + 2) = 8
24 / (1 + 2) 
## [1] 8
# 2 * 2 * 2 = 8
2^3 
## [1] 8
# 8 * 8 = 64
sqrt(64) 
## [1] 8
# -log10(0.05 / 5000000) = 8
-log10(0.05 / 5000000) 
## [1] 8

Functions

has many useful built in functions

1:10
##  [1]  1  2  3  4  5  6  7  8  9 10
as.character(1:10)
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"
rep(1:2, times = 5)
##  [1] 1 2 1 2 1 2 1 2 1 2
rep(1:5, times = 2)
##  [1] 1 2 3 4 5 1 2 3 4 5
rep(1:5, each = 2)
##  [1] 1 1 2 2 3 3 4 4 5 5
rep(1:5, length.out = 7)
## [1] 1 2 3 4 5 1 2
seq(5, 50, by = 5)
##  [1]  5 10 15 20 25 30 35 40 45 50
seq(5, 50, length.out = 5)
## [1]  5.00 16.25 27.50 38.75 50.00
paste(1:10, 20:30, sep = "-")
##  [1] "1-20"  "2-21"  "3-22"  "4-23"  "5-24"  "6-25"  "7-26"  "8-27"  "9-28"  "10-29" "1-30"
paste(1:10, collapse = "-")
## [1] "1-2-3-4-5-6-7-8-9-10"
paste0("x", 1:10)
##  [1] "x1"  "x2"  "x3"  "x4"  "x5"  "x6"  "x7"  "x8"  "x9"  "x10"
min(1:10)
## [1] 1
max(1:10)
## [1] 10
range(1:10)
## [1]  1 10
mean(1:10)
## [1] 5.5
sd(1:10)
## [1] 3.02765

Custom Functions

Users can also create their own functions

customFunction1 <- function(x, y) {
  z <- 100 * x / (x + y)
  paste(z, "%")
}
customFunction1(x = 10, y = 90)
## [1] "10 %"
customFunction2 <- function(x) {
  mymin <- mean(x - sd(x))
  mymax <- mean(x) + sd(x)
  print(paste("Min =", mymin))
  print(paste("Max =", mymax))
}
customFunction2(x = 1:10)
## [1] "Min = 2.47234964590251"
## [1] "Max = 8.52765035409749"

for loops and if else statements

xx <- NULL #creates and empty object
for(i in 1:10) {
  xx[i] <- i*3
}
xx
##  [1]  3  6  9 12 15 18 21 24 27 30
xx %% 2 #gives the remainder when divided by 2
##  [1] 1 0 1 0 1 0 1 0 1 0
for(i in 1:length(xx)) {
  if((xx[i] %% 2) == 0) {
    print(paste(xx[i],"is Even"))
  } else { 
      print(paste(xx[i],"is Odd")) 
    }
}
## [1] "3 is Odd"
## [1] "6 is Even"
## [1] "9 is Odd"
## [1] "12 is Even"
## [1] "15 is Odd"
## [1] "18 is Even"
## [1] "21 is Odd"
## [1] "24 is Even"
## [1] "27 is Odd"
## [1] "30 is Even"
# or
ifelse(xx %% 2 == 0, "Even", "Odd")
##  [1] "Odd"  "Even" "Odd"  "Even" "Odd"  "Even" "Odd"  "Even" "Odd"  "Even"
paste(xx, ifelse(xx %% 2 == 0, "is Even", "is Odd"))
##  [1] "3 is Odd"   "6 is Even"  "9 is Odd"   "12 is Even" "15 is Odd"  "18 is Even" "21 is Odd"  "24 is Even" "27 is Odd"  "30 is Even"

Objects

Information can be stored in user defined objects, in multiple forms:

  • c(): a string of values
  • matrix(): a two dimensional matrix in one format
  • data.frame(): a two dimensional matrix where each column can be a different format
  • list():

A string…

xc <- 1:10
xc
##  [1]  1  2  3  4  5  6  7  8  9 10
xc <- c(1,2,3,4,5,6,7,8,9,10)
xc
##  [1]  1  2  3  4  5  6  7  8  9 10

A matrix…

xm <- matrix(1:100, nrow = 10, ncol = 10, byrow = T)
xm
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]    1    2    3    4    5    6    7    8    9    10
##  [2,]   11   12   13   14   15   16   17   18   19    20
##  [3,]   21   22   23   24   25   26   27   28   29    30
##  [4,]   31   32   33   34   35   36   37   38   39    40
##  [5,]   41   42   43   44   45   46   47   48   49    50
##  [6,]   51   52   53   54   55   56   57   58   59    60
##  [7,]   61   62   63   64   65   66   67   68   69    70
##  [8,]   71   72   73   74   75   76   77   78   79    80
##  [9,]   81   82   83   84   85   86   87   88   89    90
## [10,]   91   92   93   94   95   96   97   98   99   100
xm <- matrix(1:100, nrow = 10, ncol = 10, byrow = F)
xm
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]    1   11   21   31   41   51   61   71   81    91
##  [2,]    2   12   22   32   42   52   62   72   82    92
##  [3,]    3   13   23   33   43   53   63   73   83    93
##  [4,]    4   14   24   34   44   54   64   74   84    94
##  [5,]    5   15   25   35   45   55   65   75   85    95
##  [6,]    6   16   26   36   46   56   66   76   86    96
##  [7,]    7   17   27   37   47   57   67   77   87    97
##  [8,]    8   18   28   38   48   58   68   78   88    98
##  [9,]    9   19   29   39   49   59   69   79   89    99
## [10,]   10   20   30   40   50   60   70   80   90   100

A data frame…

xd <- data.frame(
  x1 = c("aa","bb","cc","dd","ee",
         "ff","gg","hh","ii","jj"),
  x2 = 1:10,
  x3 = c(1,1,1,1,1,2,2,2,3,3),
  x4 = rep(c(1,2), times = 5),
  x5 = rep(1:5, times = 2),
  x6 = rep(1:5, each = 2),
  x7 = seq(5, 50, by = 5),
  x8 = log10(1:10),
  x9 = (1:10)^3,
  x10 = c(T,T,T,F,F,T,T,F,F,F)
)
xd
##    x1 x2 x3 x4 x5 x6 x7        x8   x9   x10
## 1  aa  1  1  1  1  1  5 0.0000000    1  TRUE
## 2  bb  2  1  2  2  1 10 0.3010300    8  TRUE
## 3  cc  3  1  1  3  2 15 0.4771213   27  TRUE
## 4  dd  4  1  2  4  2 20 0.6020600   64 FALSE
## 5  ee  5  1  1  5  3 25 0.6989700  125 FALSE
## 6  ff  6  2  2  1  3 30 0.7781513  216  TRUE
## 7  gg  7  2  1  2  4 35 0.8450980  343  TRUE
## 8  hh  8  2  2  3  4 40 0.9030900  512 FALSE
## 9  ii  9  3  1  4  5 45 0.9542425  729 FALSE
## 10 jj 10  3  2  5  5 50 1.0000000 1000 FALSE

A list…

xl <- list(xc, xm, xd)
xl[[1]]
##  [1]  1  2  3  4  5  6  7  8  9 10
xl[[2]]
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]    1   11   21   31   41   51   61   71   81    91
##  [2,]    2   12   22   32   42   52   62   72   82    92
##  [3,]    3   13   23   33   43   53   63   73   83    93
##  [4,]    4   14   24   34   44   54   64   74   84    94
##  [5,]    5   15   25   35   45   55   65   75   85    95
##  [6,]    6   16   26   36   46   56   66   76   86    96
##  [7,]    7   17   27   37   47   57   67   77   87    97
##  [8,]    8   18   28   38   48   58   68   78   88    98
##  [9,]    9   19   29   39   49   59   69   79   89    99
## [10,]   10   20   30   40   50   60   70   80   90   100
xl[[3]]
##    x1 x2 x3 x4 x5 x6 x7        x8   x9   x10
## 1  aa  1  1  1  1  1  5 0.0000000    1  TRUE
## 2  bb  2  1  2  2  1 10 0.3010300    8  TRUE
## 3  cc  3  1  1  3  2 15 0.4771213   27  TRUE
## 4  dd  4  1  2  4  2 20 0.6020600   64 FALSE
## 5  ee  5  1  1  5  3 25 0.6989700  125 FALSE
## 6  ff  6  2  2  1  3 30 0.7781513  216  TRUE
## 7  gg  7  2  1  2  4 35 0.8450980  343  TRUE
## 8  hh  8  2  2  3  4 40 0.9030900  512 FALSE
## 9  ii  9  3  1  4  5 45 0.9542425  729 FALSE
## 10 jj 10  3  2  5  5 50 1.0000000 1000 FALSE

Selecting Data

xc[5] # 5th element in xc
## [1] 5
xd$x3[5] # 5th element in col "x3"
## [1] 1
xd[5,"x3"] # row 5, col "x3"
## [1] 1
xd$x3 # all of col "x3"
##  [1] 1 1 1 1 1 2 2 2 3 3
xd[,"x3"] # all rows, col "x3"
##  [1] 1 1 1 1 1 2 2 2 3 3
xd[3,] # row 3, all cols
##   x1 x2 x3 x4 x5 x6 x7        x8 x9  x10
## 3 cc  3  1  1  3  2 15 0.4771213 27 TRUE
xd[c(2,4),c("x4","x5")] # rows 2 & 4, cols "x4" & "x5"
##   x4 x5
## 2  2  2
## 4  2  4
xl[[3]]$x1 # 3rd object in the list, col "x1
##  [1] "aa" "bb" "cc" "dd" "ee" "ff" "gg" "hh" "ii" "jj"

regexpr

xx <- data.frame(Name = c("Item 1 (detail 1)",
                          "Item 20 (detail 20)",
                          "Item 300 (detail 300)"),
                 Item = NA,
                 Detail = NA)
xx$Detail <- substr(xx$Name, regexpr("\\(", xx$Name)+1, regexpr("\\)", xx$Name)-1)
xx$Item <- substr(xx$Name, 1, regexpr("\\(", xx$Name)-2)
xx
##                    Name     Item     Detail
## 1     Item 1 (detail 1)   Item 1   detail 1
## 2   Item 20 (detail 20)  Item 20  detail 20
## 3 Item 300 (detail 300) Item 300 detail 300

Data Formats

Data can also be saved in many formats:

  • numeric
  • integer
  • character
  • factor
  • logical
xd$x3 <- as.character(xd$x3)
xd$x3
##  [1] "1" "1" "1" "1" "1" "2" "2" "2" "3" "3"
xd$x3 <- as.numeric(xd$x3)
xd$x3
##  [1] 1 1 1 1 1 2 2 2 3 3
xd$x3 <- as.factor(xd$x3)
xd$x3
##  [1] 1 1 1 1 1 2 2 2 3 3
## Levels: 1 2 3
xd$x3 <- factor(xd$x3, levels = c("3","2","1"))
xd$x3
##  [1] 1 1 1 1 1 2 2 2 3 3
## Levels: 3 2 1
xd$x10
##  [1]  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE
as.numeric(xd$x10) # TRUE = 1, FALSE = 0
##  [1] 1 1 1 0 0 1 1 0 0 0
sum(xd$x10)
## [1] 5

Internal structure of an object can be checked with str()

str(xc) # c()
##  num [1:10] 1 2 3 4 5 6 7 8 9 10
str(xm) # matrix()
##  int [1:10, 1:10] 1 2 3 4 5 6 7 8 9 10 ...
str(xd) # data.frame()
## 'data.frame':    10 obs. of  10 variables:
##  $ x1 : chr  "aa" "bb" "cc" "dd" ...
##  $ x2 : int  1 2 3 4 5 6 7 8 9 10
##  $ x3 : Factor w/ 3 levels "3","2","1": 3 3 3 3 3 2 2 2 1 1
##  $ x4 : num  1 2 1 2 1 2 1 2 1 2
##  $ x5 : int  1 2 3 4 5 1 2 3 4 5
##  $ x6 : int  1 1 2 2 3 3 4 4 5 5
##  $ x7 : num  5 10 15 20 25 30 35 40 45 50
##  $ x8 : num  0 0.301 0.477 0.602 0.699 ...
##  $ x9 : num  1 8 27 64 125 216 343 512 729 1000
##  $ x10: logi  TRUE TRUE TRUE FALSE FALSE TRUE ...
str(xl) # list()
## List of 3
##  $ : num [1:10] 1 2 3 4 5 6 7 8 9 10
##  $ : int [1:10, 1:10] 1 2 3 4 5 6 7 8 9 10 ...
##  $ :'data.frame':    10 obs. of  10 variables:
##   ..$ x1 : chr [1:10] "aa" "bb" "cc" "dd" ...
##   ..$ x2 : int [1:10] 1 2 3 4 5 6 7 8 9 10
##   ..$ x3 : num [1:10] 1 1 1 1 1 2 2 2 3 3
##   ..$ x4 : num [1:10] 1 2 1 2 1 2 1 2 1 2
##   ..$ x5 : int [1:10] 1 2 3 4 5 1 2 3 4 5
##   ..$ x6 : int [1:10] 1 1 2 2 3 3 4 4 5 5
##   ..$ x7 : num [1:10] 5 10 15 20 25 30 35 40 45 50
##   ..$ x8 : num [1:10] 0 0.301 0.477 0.602 0.699 ...
##   ..$ x9 : num [1:10] 1 8 27 64 125 216 343 512 729 1000
##   ..$ x10: logi [1:10] TRUE TRUE TRUE FALSE FALSE TRUE ...

Packages

Additional libraries can be installed and loaded for use.

install.packages("scales")
library(scales)
xx <- data.frame(Values = 1:10)
xx$Rescaled <- rescale(x = xx$Values, to = c(1,30))
xx
##    Values  Rescaled
## 1       1  1.000000
## 2       2  4.222222
## 3       3  7.444444
## 4       4 10.666667
## 5       5 13.888889
## 6       6 17.111111
## 7       7 20.333333
## 8       8 23.555556
## 9       9 26.777778
## 10     10 30.000000

libraries can also be used without having to load them

scales::rescale(1:10, to = c(1,30))
##  [1]  1.000000  4.222222  7.444444 10.666667 13.888889 17.111111 20.333333 23.555556 26.777778 30.000000

Data Wrangling

R for Data Science - https://r4ds.had.co.nz/

xx <- data.frame(Group = c("X","X","Y","Y","Y","X","X","X","Y","Y"),
                 Data1 = 1:10, 
                 Data2 = seq(10, 100, by = 10))
xx$NewData1 <- xx$Data1 + xx$Data2
xx$NewData2 <- xx$Data1 * 1000
xx
##    Group Data1 Data2 NewData1 NewData2
## 1      X     1    10       11     1000
## 2      X     2    20       22     2000
## 3      Y     3    30       33     3000
## 4      Y     4    40       44     4000
## 5      Y     5    50       55     5000
## 6      X     6    60       66     6000
## 7      X     7    70       77     7000
## 8      X     8    80       88     8000
## 9      Y     9    90       99     9000
## 10     Y    10   100      110    10000
xx$Data1 < 5 # which are less than 5
##  [1]  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
xx[xx$Data1 < 5,]
##   Group Data1 Data2 NewData1 NewData2
## 1     X     1    10       11     1000
## 2     X     2    20       22     2000
## 3     Y     3    30       33     3000
## 4     Y     4    40       44     4000
xx[xx$Group == "X", c("Group","Data2","NewData1")]
##   Group Data2 NewData1
## 1     X    10       11
## 2     X    20       22
## 6     X    60       66
## 7     X    70       77
## 8     X    80       88

Data wrangling with tidyverse and pipes (%>%)

library(tidyverse) # install.packages("tidyverse")
xx <- data.frame(Group = c("X","X","Y","Y","Y","Y","Y","X","X","X")) %>%
  mutate(Data1 = 1:10, 
         Data2 = seq(10, 100, by = 10),
         NewData1 = Data1 + Data2,
         NewData2 = Data1 * 1000)
xx
##    Group Data1 Data2 NewData1 NewData2
## 1      X     1    10       11     1000
## 2      X     2    20       22     2000
## 3      Y     3    30       33     3000
## 4      Y     4    40       44     4000
## 5      Y     5    50       55     5000
## 6      Y     6    60       66     6000
## 7      Y     7    70       77     7000
## 8      X     8    80       88     8000
## 9      X     9    90       99     9000
## 10     X    10   100      110    10000
filter(xx, Data1 < 5)
##   Group Data1 Data2 NewData1 NewData2
## 1     X     1    10       11     1000
## 2     X     2    20       22     2000
## 3     Y     3    30       33     3000
## 4     Y     4    40       44     4000
xx %>% filter(Data1 < 5)
##   Group Data1 Data2 NewData1 NewData2
## 1     X     1    10       11     1000
## 2     X     2    20       22     2000
## 3     Y     3    30       33     3000
## 4     Y     4    40       44     4000
xx %>% filter(Group == "X") %>% 
  select(Group, NewColName=Data2, NewData1)
##   Group NewColName NewData1
## 1     X         10       11
## 2     X         20       22
## 3     X         80       88
## 4     X         90       99
## 5     X        100      110
xs <- xx %>% 
  group_by(Group) %>% 
  summarise(Data2_mean = mean(Data2),
            Data2_sd = sd(Data2),
            NewData2_mean = mean(NewData2),
            NewData2_sd = sd(NewData2))
xs
## # A tibble: 2 × 5
##   Group Data2_mean Data2_sd NewData2_mean NewData2_sd
##   <chr>      <dbl>    <dbl>         <dbl>       <dbl>
## 1 X             60     41.8          6000       4183.
## 2 Y             50     15.8          5000       1581.
xx %>% left_join(xs, by = "Group")
##    Group Data1 Data2 NewData1 NewData2 Data2_mean Data2_sd NewData2_mean NewData2_sd
## 1      X     1    10       11     1000         60 41.83300          6000    4183.300
## 2      X     2    20       22     2000         60 41.83300          6000    4183.300
## 3      Y     3    30       33     3000         50 15.81139          5000    1581.139
## 4      Y     4    40       44     4000         50 15.81139          5000    1581.139
## 5      Y     5    50       55     5000         50 15.81139          5000    1581.139
## 6      Y     6    60       66     6000         50 15.81139          5000    1581.139
## 7      Y     7    70       77     7000         50 15.81139          5000    1581.139
## 8      X     8    80       88     8000         60 41.83300          6000    4183.300
## 9      X     9    90       99     9000         60 41.83300          6000    4183.300
## 10     X    10   100      110    10000         60 41.83300          6000    4183.300

Read/Write data

xx <- read.csv("data_r_tutorial.csv")
write.csv(xx, "data_r_tutorial.csv", row.names = F)

For excel sheets, the package readxl can be used to read in sheets of data.

library(readxl) # install.packages("readxl")
xx <- read_xlsx("data_r_tutorial.xlsx", sheet = "Data")

Tidy Data

yy <- xx %>%
  group_by(Name, Location) %>%
  summarise(Mean_DTF = round(mean(DTF),1)) %>% 
  arrange(Location)
yy
## # A tibble: 9 × 3
## # Groups:   Name [3]
##   Name          Location            Mean_DTF
##   <chr>         <chr>                  <dbl>
## 1 CDC Maxim AGL Jessore, Bangladesh     86.7
## 2 ILL 618 AGL   Jessore, Bangladesh     79.3
## 3 Laird AGL     Jessore, Bangladesh     76.8
## 4 CDC Maxim AGL Metaponto, Italy       134. 
## 5 ILL 618 AGL   Metaponto, Italy       138. 
## 6 Laird AGL     Metaponto, Italy       137. 
## 7 CDC Maxim AGL Saskatoon, Canada       52.5
## 8 ILL 618 AGL   Saskatoon, Canada       47  
## 9 Laird AGL     Saskatoon, Canada       56.8
yy <- yy %>% spread(key = Location, value = Mean_DTF)
yy
## # A tibble: 3 × 4
## # Groups:   Name [3]
##   Name          `Jessore, Bangladesh` `Metaponto, Italy` `Saskatoon, Canada`
##   <chr>                         <dbl>              <dbl>               <dbl>
## 1 CDC Maxim AGL                  86.7               134.                52.5
## 2 ILL 618 AGL                    79.3               138.                47  
## 3 Laird AGL                      76.8               137.                56.8
yy <- yy %>% gather(key = TraitName, value = Value, 2:4)
yy
## # A tibble: 9 × 3
## # Groups:   Name [3]
##   Name          TraitName           Value
##   <chr>         <chr>               <dbl>
## 1 CDC Maxim AGL Jessore, Bangladesh  86.7
## 2 ILL 618 AGL   Jessore, Bangladesh  79.3
## 3 Laird AGL     Jessore, Bangladesh  76.8
## 4 CDC Maxim AGL Metaponto, Italy    134. 
## 5 ILL 618 AGL   Metaponto, Italy    138. 
## 6 Laird AGL     Metaponto, Italy    137. 
## 7 CDC Maxim AGL Saskatoon, Canada    52.5
## 8 ILL 618 AGL   Saskatoon, Canada    47  
## 9 Laird AGL     Saskatoon, Canada    56.8
yy <- yy %>% spread(key = Name, value = Value)
yy
## # A tibble: 3 × 4
##   TraitName           `CDC Maxim AGL` `ILL 618 AGL` `Laird AGL`
##   <chr>                         <dbl>         <dbl>       <dbl>
## 1 Jessore, Bangladesh            86.7          79.3        76.8
## 2 Metaponto, Italy              134.          138.        137. 
## 3 Saskatoon, Canada              52.5          47          56.8

Base Plotting

We will start with some basic plotting using the base function plot()

# A basic scatter plot
plot(x = xd$x8, y = xd$x9)

# Adjust color and shape of the points
plot(x = xd$x8, y = xd$x9, col = "darkred", pch = 0)

plot(x = xd$x8, y = xd$x9, col = xd$x4, pch = xd$x4)

# Adjust plot type 
plot(x = xd$x8, y = xd$x9, type = "line")

# Adjust linetype
plot(x = xd$x8, y = xd$x9, type = "line", lty = 2)

# Plot lines and points
plot(x = xd$x8, y = xd$x9, type = "both")

Now lets create some random and normally distributed data to make some more complicated plots

# 100 random uniformly distributed numbers ranging from 0 - 100
ru <- runif(100, min = 0, max = 100)
ru
##   [1] 88.879050 74.804051  9.222429 92.962485 73.755413 75.987976 90.991146 58.819090 55.604855 29.982284 17.827302 22.312105 13.290178 57.734984
##  [15] 93.525960 43.401278 18.343924 39.402966 99.919277 15.611971 91.399652 99.532382 53.305153 64.669080 57.336445 99.818284 87.775754 26.502441
##  [29] 14.954005 28.430863 10.599146 19.174557 16.042749 78.639427 60.658386 38.460773 55.731751 33.968191 58.631358 92.066694 34.511771 58.245941
##  [43] 18.857746  6.816765 23.201109 84.191163 32.581872 72.794493 83.089383 61.596890 37.522353 69.481788  6.424845  1.251876  4.524658 37.319633
##  [57] 38.608016 49.973830 32.402709 80.363676 32.775312 57.935965  3.226721 81.579147 83.323537 57.552321 14.358912 92.752618 87.822170 21.331402
##  [71] 33.235553 94.519633 26.239475 31.101516  2.859721 30.048752 37.823260 22.483869 72.717898 64.895911 41.932409 47.786928 72.082850 25.493414
##  [85] 81.585791 16.755659 96.527837 13.238543 97.334134 49.112904 99.068328 84.195672  6.538147 82.839428 60.793273 89.023976  9.788489 57.615773
##  [99] 51.951928 13.804465
plot(x = ru)

order(ru)
##   [1]  54  75  63  55  53  93  44   3  97  31  88  13 100  67  29  20  33  86  11  17  43  32  70  12  78  45  84  73  28  30  10  76  74  59  47  61
##  [37]  71  38  41  56  51  77  36  57  18  81  16  82  90  58  99  23   9  37  25  66  98  14  62  42  39   8  35  95  50  24  80  52  83  79  48   5
##  [73]   2   6  34  60  64  85  94  49  65  46  92  27  69   1  96   7  21  40  68   4  15  72  87  89  91  22  26  19
ru<- ru[order(ru)]
ru
##   [1]  1.251876  2.859721  3.226721  4.524658  6.424845  6.538147  6.816765  9.222429  9.788489 10.599146 13.238543 13.290178 13.804465 14.358912
##  [15] 14.954005 15.611971 16.042749 16.755659 17.827302 18.343924 18.857746 19.174557 21.331402 22.312105 22.483869 23.201109 25.493414 26.239475
##  [29] 26.502441 28.430863 29.982284 30.048752 31.101516 32.402709 32.581872 32.775312 33.235553 33.968191 34.511771 37.319633 37.522353 37.823260
##  [43] 38.460773 38.608016 39.402966 41.932409 43.401278 47.786928 49.112904 49.973830 51.951928 53.305153 55.604855 55.731751 57.336445 57.552321
##  [57] 57.615773 57.734984 57.935965 58.245941 58.631358 58.819090 60.658386 60.793273 61.596890 64.669080 64.895911 69.481788 72.082850 72.717898
##  [71] 72.794493 73.755413 74.804051 75.987976 78.639427 80.363676 81.579147 81.585791 82.839428 83.089383 83.323537 84.191163 84.195672 87.775754
##  [85] 87.822170 88.879050 89.023976 90.991146 91.399652 92.066694 92.752618 92.962485 93.525960 94.519633 96.527837 97.334134 99.068328 99.532382
##  [99] 99.818284 99.919277
plot(x = ru)

# 100 normally distributed numbers with a mean of 50 and sd of 10
nd <- rnorm(100, mean = 50, sd = 10)
nd
##   [1] 54.49035 49.44809 62.21885 65.20615 65.28011 41.27130 43.88087 64.11959 54.45105 36.68999 77.90538 47.80680 34.19055 47.96877 54.51858 57.94266
##  [17] 50.70187 43.03111 64.72154 59.43974 56.61839 57.07286 53.58821 55.22224 46.98327 51.40799 63.60447 46.14905 49.04199 46.24494 76.52737 52.50987
##  [33] 34.91560 45.96542 36.43664 34.02696 48.57096 46.31293 52.27108 45.95917 54.35162 36.16218 50.10473 47.57341 49.38077 59.23931 30.69929 65.46464
##  [49] 49.70954 66.96907 50.12928 48.38812 57.13398 61.76045 46.98061 59.85512 48.83170 36.71813 42.64924 51.72391 54.42429 50.70854 58.10625 53.33037
##  [65] 38.11532 57.00958 58.25164 56.00100 49.92247 67.75222 65.73637 46.80587 36.68390 47.56226 74.18483 70.88867 40.22199 54.87410 44.76316 52.23608
##  [81] 62.55888 57.58060 52.34181 57.25953 63.51743 45.58025 45.56503 34.78257 40.17578 23.84253 40.45918 54.72745 52.52391 41.82776 44.51720 54.13895
##  [97] 39.28781 39.51687 61.06368 43.04546
nd <- nd[order(nd)]
nd
##   [1] 23.84253 30.69929 34.02696 34.19055 34.78257 34.91560 36.16218 36.43664 36.68390 36.68999 36.71813 38.11532 39.28781 39.51687 40.17578 40.22199
##  [17] 40.45918 41.27130 41.82776 42.64924 43.03111 43.04546 43.88087 44.51720 44.76316 45.56503 45.58025 45.95917 45.96542 46.14905 46.24494 46.31293
##  [33] 46.80587 46.98061 46.98327 47.56226 47.57341 47.80680 47.96877 48.38812 48.57096 48.83170 49.04199 49.38077 49.44809 49.70954 49.92247 50.10473
##  [49] 50.12928 50.70187 50.70854 51.40799 51.72391 52.23608 52.27108 52.34181 52.50987 52.52391 53.33037 53.58821 54.13895 54.35162 54.42429 54.45105
##  [65] 54.49035 54.51858 54.72745 54.87410 55.22224 56.00100 56.61839 57.00958 57.07286 57.13398 57.25953 57.58060 57.94266 58.10625 58.25164 59.23931
##  [81] 59.43974 59.85512 61.06368 61.76045 62.21885 62.55888 63.51743 63.60447 64.11959 64.72154 65.20615 65.28011 65.46464 65.73637 66.96907 67.75222
##  [97] 70.88867 74.18483 76.52737 77.90538
plot(x = nd)

hist(x = nd)

hist(nd, breaks = 20, col = "darkgreen")

plot(x = density(nd))

boxplot(x = nd)

boxplot(x = nd, horizontal = T)


ggplot2

Lets be honest, the base plots are ugly! The ggplot2 package gives the user to create a better, more visually appealing plots. Additional packages such as ggbeeswarm and ggrepel also contain useful functions to add to the functionality of ggplot2.

library(ggplot2)
mp <- ggplot(xd, aes(x = x8, y = x9))
mp + geom_point()

mp + geom_point(aes(color = x3, shape = x3), size = 4)

mp + geom_line(size = 2)

mp + geom_line(aes(color = x3), size = 2)

mp + geom_smooth(method = "loess")

mp + geom_smooth(method = "lm")

xx <- data.frame(data = c(rnorm(50, mean = 40, sd = 10),
                          rnorm(50, mean = 60, sd = 5)),
                 group = factor(rep(1:2, each = 50)),
                 label = c("Label1", rep(NA, 49), "Label2", rep(NA, 49)))
mp <- ggplot(xx, aes(x = data, fill = group))
mp + geom_histogram(color = "black")

mp + geom_histogram(color = "black", position = "dodge")

mp1 <- mp + geom_histogram(color = "black") + facet_grid(group~.)
mp1

mp + geom_density(alpha = 0.5)

mp <- ggplot(xx, aes(x = group, y = data, fill = group))
mp + geom_boxplot(color = "black")

mp + geom_boxplot() + geom_point()

mp + geom_violin() + geom_boxplot(width = 0.1, fill = "white")

library(ggbeeswarm)
mp + geom_quasirandom()

mp + geom_quasirandom(aes(shape = group))

mp2 <- mp + geom_violin() + 
  geom_boxplot(width = 0.1, fill = "white") +
  geom_beeswarm(alpha = 0.5)
library(ggrepel)
mp2 + geom_text_repel(aes(label = label), nudge_x = 0.4)

library(ggpubr)
ggarrange(mp1, mp2, ncol = 2, widths = c(2,1),
          common.legend = T, legend = "bottom")


Statistics

# Prep data
lev_Loc  <- c("Saskatoon, Canada", "Jessore, Bangladesh", "Metaponto, Italy")
lev_Name <- c("ILL 618 AGL", "CDC Maxim AGL", "Laird AGL")
dd <- read_xlsx("data_r_tutorial.xlsx", sheet = "Data") %>%
  mutate(Location = factor(Location, levels = lev_Loc),
         Name = factor(Name, levels = lev_Name))
xx <- dd %>%
  group_by(Name, Location) %>%
  summarise(Mean_DTF = mean(DTF))
xx %>% spread(Location, Mean_DTF)
## # A tibble: 3 × 4
## # Groups:   Name [3]
##   Name          `Saskatoon, Canada` `Jessore, Bangladesh` `Metaponto, Italy`
##   <fct>                       <dbl>                 <dbl>              <dbl>
## 1 ILL 618 AGL                  47                    79.3               138.
## 2 CDC Maxim AGL                52.5                  86.7               134.
## 3 Laird AGL                    56.8                  76.8               137.
# Plot
mp1 <- ggplot(dd, aes(x = Location, y = DTF, color = Name, shape = Name)) +
  geom_point(size = 2, alpha = 0.7, position = position_dodge(width=0.5))
mp2 <- ggplot(xx, aes(x = Location, y = Mean_DTF, 
                      color = Name, group = Name, shape = Name)) +
  geom_point(size = 2.5, alpha = 0.7) + 
  geom_line(size = 1, alpha = 0.7) +
  theme(legend.position = "top")
ggarrange(mp1, mp2, ncol = 2, common.legend = T, legend = "top")

From first glace, it is clear there are differences between genotypes, locations, and genotype x environment (GxE) interactions. Now let’s do a few statistical tests.

summary(aov(DTF ~ Name * Location, data = dd))
##               Df Sum Sq Mean Sq  F value   Pr(>F)    
## Name           2     88      44    3.476   0.0395 *  
## Location       2  65863   32932 2598.336  < 2e-16 ***
## Name:Location  4    560     140   11.044 2.52e-06 ***
## Residuals     45    570      13                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

As expected, an ANOVA shows statistical significance for genotype (p-value = 0.0395), Location (p-value < 2e-16) and GxE interactions (p-value < 2.52e-06). However, all this tells us is that one genotype is different from the rest, one location is different from the others and that there is GxE interactions. If we want to be more specific, would need to do some multiple comparison tests.

If we only have two things to compare, we could do a t-test.

xx <- dd %>% 
  filter(Location %in% c("Saskatoon, Canada", "Jessore, Bangladesh")) %>%
  spread(Location, DTF)
t.test(x = xx$`Saskatoon, Canada`, y = xx$`Jessore, Bangladesh`)
## 
##  Welch Two Sample t-test
## 
## data:  xx$`Saskatoon, Canada` and xx$`Jessore, Bangladesh`
## t = -17.521, df = 32.701, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -32.18265 -25.48402
## sample estimates:
## mean of x mean of y 
##  52.11111  80.94444

DTF in Saskatoon, Canada is significantly different (p-value < 2.2e-16) from DTF in Jessore, Bangladesh.

xx <- dd %>% 
  filter(Name %in% c("ILL 618 AGL", "Laird AGL"),
         Location == "Metaponto, Italy") %>%
  spread(Name, DTF)
t.test(x = xx$`ILL 618 AGL`, y = xx$`Laird AGL`)
## 
##  Welch Two Sample t-test
## 
## data:  xx$`ILL 618 AGL` and xx$`Laird AGL`
## t = 0.38008, df = 8.0564, p-value = 0.7137
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -5.059739  7.059739
## sample estimates:
## mean of x mean of y 
##  137.8333  136.8333

DTF between ILL 618 AGL and Laird AGL are not significantly different (p-value = 0.7137) in Metaponto, Italy.


pch Plot

xx <- data.frame(x = rep(1:6, times = 5, length.out = 26),
                 y = rep(5:1, each = 6, length.out = 26),
                 pch = 0:25)
mp <- ggplot(xx, aes(x = x, y = y, shape = as.factor(pch))) +
  geom_point(color = "darkred", fill = "darkblue", size = 5) +
  geom_text(aes(label = pch), nudge_x = -0.25) +
  scale_shape_manual(values = xx$pch) +
  scale_x_continuous(breaks = 6:1) +
  scale_y_continuous(breaks = 6:1) +
  theme_void() +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5),
        axis.text = element_blank(),
        axis.ticks = element_blank()) +
  labs(title = "Plot symbols in R (pch)",
       subtitle = "color = \"darkred\", fill = \"darkblue\"",
       x = NULL, y = NULL)
ggsave("pch.png", mp, width = 4.5, height = 3, bg = "white")


R Markdown

Tutorials on how to create an R markdown document like this one can be found here:



dblogr.com/


© Derek Michael Wright