dblogr/

R Tutorial

An introduction to R


Introduction

This tutorial is will introduce the reader to , a free, open-source statistical computing environment often used with RStudio, a integrated development environment for .

R Project Logo
R Project Logo

Calculator

can be used as a super awesome calculator

# 5 + 3 = 8
5 + 3 
## [1] 8
# 24 / (1 + 2) = 8
24 / (1 + 2) 
## [1] 8
# 2 * 2 * 2 = 8
2^3 
## [1] 8
# 8 * 8 = 64
sqrt(64) 
## [1] 8
# -log10(0.05 / 5000000) = 8
-log10(0.05 / 5000000) 
## [1] 8

Functions

has many useful built in functions

1:10
##  [1]  1  2  3  4  5  6  7  8  9 10
as.character(1:10)
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"
rep(1:2, times = 5)
##  [1] 1 2 1 2 1 2 1 2 1 2
rep(1:5, times = 2)
##  [1] 1 2 3 4 5 1 2 3 4 5
rep(1:5, each = 2)
##  [1] 1 1 2 2 3 3 4 4 5 5
rep(1:5, length.out = 7)
## [1] 1 2 3 4 5 1 2
seq(5, 50, by = 5)
##  [1]  5 10 15 20 25 30 35 40 45 50
seq(5, 50, length.out = 5)
## [1]  5.00 16.25 27.50 38.75 50.00
paste(1:10, 20:30, sep = "-")
##  [1] "1-20"  "2-21"  "3-22"  "4-23"  "5-24"  "6-25"  "7-26"  "8-27"  "9-28"  "10-29" "1-30"
paste(1:10, collapse = "-")
## [1] "1-2-3-4-5-6-7-8-9-10"
paste0("x", 1:10)
##  [1] "x1"  "x2"  "x3"  "x4"  "x5"  "x6"  "x7"  "x8"  "x9"  "x10"
min(1:10)
## [1] 1
max(1:10)
## [1] 10
range(1:10)
## [1]  1 10
mean(1:10)
## [1] 5.5
sd(1:10)
## [1] 3.02765

Custom Functions

Users can also create their own functions

customFunction1 <- function(x, y) {
  z <- 100 * x / (x + y)
  paste(z, "%")
}
customFunction1(x = 10, y = 90)
## [1] "10 %"
customFunction2 <- function(x) {
  mymin <- mean(x - sd(x))
  mymax <- mean(x) + sd(x)
  print(paste("Min =", mymin))
  print(paste("Max =", mymax))
}
customFunction2(x = 1:10)
## [1] "Min = 2.47234964590251"
## [1] "Max = 8.52765035409749"

for loops and if else statements

xx <- NULL #creates and empty object
for(i in 1:10) {
  xx[i] <- i*3
}
xx
##  [1]  3  6  9 12 15 18 21 24 27 30
xx %% 2 #gives the remainder when divided by 2
##  [1] 1 0 1 0 1 0 1 0 1 0
for(i in 1:length(xx)) {
  if((xx[i] %% 2) == 0) {
    print(paste(xx[i],"is Even"))
  } else { 
      print(paste(xx[i],"is Odd")) 
    }
}
## [1] "3 is Odd"
## [1] "6 is Even"
## [1] "9 is Odd"
## [1] "12 is Even"
## [1] "15 is Odd"
## [1] "18 is Even"
## [1] "21 is Odd"
## [1] "24 is Even"
## [1] "27 is Odd"
## [1] "30 is Even"
# or
ifelse(xx %% 2 == 0, "Even", "Odd")
##  [1] "Odd"  "Even" "Odd"  "Even" "Odd"  "Even" "Odd"  "Even" "Odd"  "Even"
paste(xx, ifelse(xx %% 2 == 0, "is Even", "is Odd"))
##  [1] "3 is Odd"   "6 is Even"  "9 is Odd"   "12 is Even" "15 is Odd"  "18 is Even" "21 is Odd"  "24 is Even" "27 is Odd"  "30 is Even"

Objects

Information can be stored in user defined objects, in multiple forms:

  • c(): a string of values
  • matrix(): a two dimensional matrix in one format
  • data.frame(): a two dimensional matrix where each column can be a different format
  • list():

A string…

xc <- 1:10
xc
##  [1]  1  2  3  4  5  6  7  8  9 10
xc <- c(1,2,3,4,5,6,7,8,9,10)
xc
##  [1]  1  2  3  4  5  6  7  8  9 10

A matrix…

xm <- matrix(1:100, nrow = 10, ncol = 10, byrow = T)
xm
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]    1    2    3    4    5    6    7    8    9    10
##  [2,]   11   12   13   14   15   16   17   18   19    20
##  [3,]   21   22   23   24   25   26   27   28   29    30
##  [4,]   31   32   33   34   35   36   37   38   39    40
##  [5,]   41   42   43   44   45   46   47   48   49    50
##  [6,]   51   52   53   54   55   56   57   58   59    60
##  [7,]   61   62   63   64   65   66   67   68   69    70
##  [8,]   71   72   73   74   75   76   77   78   79    80
##  [9,]   81   82   83   84   85   86   87   88   89    90
## [10,]   91   92   93   94   95   96   97   98   99   100
xm <- matrix(1:100, nrow = 10, ncol = 10, byrow = F)
xm
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]    1   11   21   31   41   51   61   71   81    91
##  [2,]    2   12   22   32   42   52   62   72   82    92
##  [3,]    3   13   23   33   43   53   63   73   83    93
##  [4,]    4   14   24   34   44   54   64   74   84    94
##  [5,]    5   15   25   35   45   55   65   75   85    95
##  [6,]    6   16   26   36   46   56   66   76   86    96
##  [7,]    7   17   27   37   47   57   67   77   87    97
##  [8,]    8   18   28   38   48   58   68   78   88    98
##  [9,]    9   19   29   39   49   59   69   79   89    99
## [10,]   10   20   30   40   50   60   70   80   90   100

A data frame…

xd <- data.frame(
  x1 = c("aa","bb","cc","dd","ee",
         "ff","gg","hh","ii","jj"),
  x2 = 1:10,
  x3 = c(1,1,1,1,1,2,2,2,3,3),
  x4 = rep(c(1,2), times = 5),
  x5 = rep(1:5, times = 2),
  x6 = rep(1:5, each = 2),
  x7 = seq(5, 50, by = 5),
  x8 = log10(1:10),
  x9 = (1:10)^3,
  x10 = c(T,T,T,F,F,T,T,F,F,F)
)
xd
##    x1 x2 x3 x4 x5 x6 x7        x8   x9   x10
## 1  aa  1  1  1  1  1  5 0.0000000    1  TRUE
## 2  bb  2  1  2  2  1 10 0.3010300    8  TRUE
## 3  cc  3  1  1  3  2 15 0.4771213   27  TRUE
## 4  dd  4  1  2  4  2 20 0.6020600   64 FALSE
## 5  ee  5  1  1  5  3 25 0.6989700  125 FALSE
## 6  ff  6  2  2  1  3 30 0.7781513  216  TRUE
## 7  gg  7  2  1  2  4 35 0.8450980  343  TRUE
## 8  hh  8  2  2  3  4 40 0.9030900  512 FALSE
## 9  ii  9  3  1  4  5 45 0.9542425  729 FALSE
## 10 jj 10  3  2  5  5 50 1.0000000 1000 FALSE

A list…

xl <- list(xc, xm, xd)
xl[[1]]
##  [1]  1  2  3  4  5  6  7  8  9 10
xl[[2]]
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]    1   11   21   31   41   51   61   71   81    91
##  [2,]    2   12   22   32   42   52   62   72   82    92
##  [3,]    3   13   23   33   43   53   63   73   83    93
##  [4,]    4   14   24   34   44   54   64   74   84    94
##  [5,]    5   15   25   35   45   55   65   75   85    95
##  [6,]    6   16   26   36   46   56   66   76   86    96
##  [7,]    7   17   27   37   47   57   67   77   87    97
##  [8,]    8   18   28   38   48   58   68   78   88    98
##  [9,]    9   19   29   39   49   59   69   79   89    99
## [10,]   10   20   30   40   50   60   70   80   90   100
xl[[3]]
##    x1 x2 x3 x4 x5 x6 x7        x8   x9   x10
## 1  aa  1  1  1  1  1  5 0.0000000    1  TRUE
## 2  bb  2  1  2  2  1 10 0.3010300    8  TRUE
## 3  cc  3  1  1  3  2 15 0.4771213   27  TRUE
## 4  dd  4  1  2  4  2 20 0.6020600   64 FALSE
## 5  ee  5  1  1  5  3 25 0.6989700  125 FALSE
## 6  ff  6  2  2  1  3 30 0.7781513  216  TRUE
## 7  gg  7  2  1  2  4 35 0.8450980  343  TRUE
## 8  hh  8  2  2  3  4 40 0.9030900  512 FALSE
## 9  ii  9  3  1  4  5 45 0.9542425  729 FALSE
## 10 jj 10  3  2  5  5 50 1.0000000 1000 FALSE

Selecting Data

xc[5] # 5th element in xc
## [1] 5
xd$x3[5] # 5th element in col "x3"
## [1] 1
xd[5,"x3"] # row 5, col "x3"
## [1] 1
xd$x3 # all of col "x3"
##  [1] 1 1 1 1 1 2 2 2 3 3
xd[,"x3"] # all rows, col "x3"
##  [1] 1 1 1 1 1 2 2 2 3 3
xd[3,] # row 3, all cols
##   x1 x2 x3 x4 x5 x6 x7        x8 x9  x10
## 3 cc  3  1  1  3  2 15 0.4771213 27 TRUE
xd[c(2,4),c("x4","x5")] # rows 2 & 4, cols "x4" & "x5"
##   x4 x5
## 2  2  2
## 4  2  4
xl[[3]]$x1 # 3rd object in the list, col "x1
##  [1] "aa" "bb" "cc" "dd" "ee" "ff" "gg" "hh" "ii" "jj"

regexpr

xx <- data.frame(Name = c("Item 1 (detail 1)",
                          "Item 20 (detail 20)",
                          "Item 300 (detail 300)"),
                 Item = NA,
                 Detail = NA)
xx$Detail <- substr(xx$Name, regexpr("\\(", xx$Name)+1, regexpr("\\)", xx$Name)-1)
xx$Item <- substr(xx$Name, 1, regexpr("\\(", xx$Name)-2)
xx
##                    Name     Item     Detail
## 1     Item 1 (detail 1)   Item 1   detail 1
## 2   Item 20 (detail 20)  Item 20  detail 20
## 3 Item 300 (detail 300) Item 300 detail 300

Data Formats

Data can also be saved in many formats:

  • numeric
  • integer
  • character
  • factor
  • logical
xd$x3 <- as.character(xd$x3)
xd$x3
##  [1] "1" "1" "1" "1" "1" "2" "2" "2" "3" "3"
xd$x3 <- as.numeric(xd$x3)
xd$x3
##  [1] 1 1 1 1 1 2 2 2 3 3
xd$x3 <- as.factor(xd$x3)
xd$x3
##  [1] 1 1 1 1 1 2 2 2 3 3
## Levels: 1 2 3
xd$x3 <- factor(xd$x3, levels = c("3","2","1"))
xd$x3
##  [1] 1 1 1 1 1 2 2 2 3 3
## Levels: 3 2 1
xd$x10
##  [1]  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE
as.numeric(xd$x10) # TRUE = 1, FALSE = 0
##  [1] 1 1 1 0 0 1 1 0 0 0
sum(xd$x10)
## [1] 5

Internal structure of an object can be checked with str()

str(xc) # c()
##  num [1:10] 1 2 3 4 5 6 7 8 9 10
str(xm) # matrix()
##  int [1:10, 1:10] 1 2 3 4 5 6 7 8 9 10 ...
str(xd) # data.frame()
## 'data.frame':    10 obs. of  10 variables:
##  $ x1 : chr  "aa" "bb" "cc" "dd" ...
##  $ x2 : int  1 2 3 4 5 6 7 8 9 10
##  $ x3 : Factor w/ 3 levels "3","2","1": 3 3 3 3 3 2 2 2 1 1
##  $ x4 : num  1 2 1 2 1 2 1 2 1 2
##  $ x5 : int  1 2 3 4 5 1 2 3 4 5
##  $ x6 : int  1 1 2 2 3 3 4 4 5 5
##  $ x7 : num  5 10 15 20 25 30 35 40 45 50
##  $ x8 : num  0 0.301 0.477 0.602 0.699 ...
##  $ x9 : num  1 8 27 64 125 216 343 512 729 1000
##  $ x10: logi  TRUE TRUE TRUE FALSE FALSE TRUE ...
str(xl) # list()
## List of 3
##  $ : num [1:10] 1 2 3 4 5 6 7 8 9 10
##  $ : int [1:10, 1:10] 1 2 3 4 5 6 7 8 9 10 ...
##  $ :'data.frame':    10 obs. of  10 variables:
##   ..$ x1 : chr [1:10] "aa" "bb" "cc" "dd" ...
##   ..$ x2 : int [1:10] 1 2 3 4 5 6 7 8 9 10
##   ..$ x3 : num [1:10] 1 1 1 1 1 2 2 2 3 3
##   ..$ x4 : num [1:10] 1 2 1 2 1 2 1 2 1 2
##   ..$ x5 : int [1:10] 1 2 3 4 5 1 2 3 4 5
##   ..$ x6 : int [1:10] 1 1 2 2 3 3 4 4 5 5
##   ..$ x7 : num [1:10] 5 10 15 20 25 30 35 40 45 50
##   ..$ x8 : num [1:10] 0 0.301 0.477 0.602 0.699 ...
##   ..$ x9 : num [1:10] 1 8 27 64 125 216 343 512 729 1000
##   ..$ x10: logi [1:10] TRUE TRUE TRUE FALSE FALSE TRUE ...

Packages

Additional libraries can be installed and loaded for use.

install.packages("scales")
library(scales)
xx <- data.frame(Values = 1:10)
xx$Rescaled <- rescale(x = xx$Values, to = c(1,30))
xx
##    Values  Rescaled
## 1       1  1.000000
## 2       2  4.222222
## 3       3  7.444444
## 4       4 10.666667
## 5       5 13.888889
## 6       6 17.111111
## 7       7 20.333333
## 8       8 23.555556
## 9       9 26.777778
## 10     10 30.000000

libraries can also be used without having to load them

scales::rescale(1:10, to = c(1,30))
##  [1]  1.000000  4.222222  7.444444 10.666667 13.888889 17.111111 20.333333 23.555556 26.777778 30.000000

Data Wrangling

R for Data Science - https://r4ds.had.co.nz/

xx <- data.frame(Group = c("X","X","Y","Y","Y","X","X","X","Y","Y"),
                 Data1 = 1:10, 
                 Data2 = seq(10, 100, by = 10))
xx$NewData1 <- xx$Data1 + xx$Data2
xx$NewData2 <- xx$Data1 * 1000
xx
##    Group Data1 Data2 NewData1 NewData2
## 1      X     1    10       11     1000
## 2      X     2    20       22     2000
## 3      Y     3    30       33     3000
## 4      Y     4    40       44     4000
## 5      Y     5    50       55     5000
## 6      X     6    60       66     6000
## 7      X     7    70       77     7000
## 8      X     8    80       88     8000
## 9      Y     9    90       99     9000
## 10     Y    10   100      110    10000
xx$Data1 < 5 # which are less than 5
##  [1]  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
xx[xx$Data1 < 5,]
##   Group Data1 Data2 NewData1 NewData2
## 1     X     1    10       11     1000
## 2     X     2    20       22     2000
## 3     Y     3    30       33     3000
## 4     Y     4    40       44     4000
xx[xx$Group == "X", c("Group","Data2","NewData1")]
##   Group Data2 NewData1
## 1     X    10       11
## 2     X    20       22
## 6     X    60       66
## 7     X    70       77
## 8     X    80       88

Data wrangling with tidyverse and pipes (%>%)

library(tidyverse) # install.packages("tidyverse")
xx <- data.frame(Group = c("X","X","Y","Y","Y","Y","Y","X","X","X")) %>%
  mutate(Data1 = 1:10, 
         Data2 = seq(10, 100, by = 10),
         NewData1 = Data1 + Data2,
         NewData2 = Data1 * 1000)
xx
##    Group Data1 Data2 NewData1 NewData2
## 1      X     1    10       11     1000
## 2      X     2    20       22     2000
## 3      Y     3    30       33     3000
## 4      Y     4    40       44     4000
## 5      Y     5    50       55     5000
## 6      Y     6    60       66     6000
## 7      Y     7    70       77     7000
## 8      X     8    80       88     8000
## 9      X     9    90       99     9000
## 10     X    10   100      110    10000
filter(xx, Data1 < 5)
##   Group Data1 Data2 NewData1 NewData2
## 1     X     1    10       11     1000
## 2     X     2    20       22     2000
## 3     Y     3    30       33     3000
## 4     Y     4    40       44     4000
xx %>% filter(Data1 < 5)
##   Group Data1 Data2 NewData1 NewData2
## 1     X     1    10       11     1000
## 2     X     2    20       22     2000
## 3     Y     3    30       33     3000
## 4     Y     4    40       44     4000
xx %>% filter(Group == "X") %>% 
  select(Group, NewColName=Data2, NewData1)
##   Group NewColName NewData1
## 1     X         10       11
## 2     X         20       22
## 3     X         80       88
## 4     X         90       99
## 5     X        100      110
xs <- xx %>% 
  group_by(Group) %>% 
  summarise(Data2_mean = mean(Data2),
            Data2_sd = sd(Data2),
            NewData2_mean = mean(NewData2),
            NewData2_sd = sd(NewData2))
xs
## # A tibble: 2 × 5
##   Group Data2_mean Data2_sd NewData2_mean NewData2_sd
##   <chr>      <dbl>    <dbl>         <dbl>       <dbl>
## 1 X             60     41.8          6000       4183.
## 2 Y             50     15.8          5000       1581.
xx %>% left_join(xs, by = "Group")
##    Group Data1 Data2 NewData1 NewData2 Data2_mean Data2_sd NewData2_mean NewData2_sd
## 1      X     1    10       11     1000         60 41.83300          6000    4183.300
## 2      X     2    20       22     2000         60 41.83300          6000    4183.300
## 3      Y     3    30       33     3000         50 15.81139          5000    1581.139
## 4      Y     4    40       44     4000         50 15.81139          5000    1581.139
## 5      Y     5    50       55     5000         50 15.81139          5000    1581.139
## 6      Y     6    60       66     6000         50 15.81139          5000    1581.139
## 7      Y     7    70       77     7000         50 15.81139          5000    1581.139
## 8      X     8    80       88     8000         60 41.83300          6000    4183.300
## 9      X     9    90       99     9000         60 41.83300          6000    4183.300
## 10     X    10   100      110    10000         60 41.83300          6000    4183.300

Read/Write data

xx <- read.csv("data_r_tutorial.csv")
write.csv(xx, "data_r_tutorial.csv", row.names = F)

For excel sheets, the package readxl can be used to read in sheets of data.

library(readxl) # install.packages("readxl")
xx <- read_xlsx("data_r_tutorial.xlsx", sheet = "Data")

Tidy Data

yy <- xx %>%
  group_by(Name, Location) %>%
  summarise(Mean_DTF = round(mean(DTF),1)) %>% 
  arrange(Location)
yy
## # A tibble: 9 × 3
## # Groups:   Name [3]
##   Name          Location            Mean_DTF
##   <chr>         <chr>                  <dbl>
## 1 CDC Maxim AGL Jessore, Bangladesh     86.7
## 2 ILL 618 AGL   Jessore, Bangladesh     79.3
## 3 Laird AGL     Jessore, Bangladesh     76.8
## 4 CDC Maxim AGL Metaponto, Italy       134. 
## 5 ILL 618 AGL   Metaponto, Italy       138. 
## 6 Laird AGL     Metaponto, Italy       137. 
## 7 CDC Maxim AGL Saskatoon, Canada       52.5
## 8 ILL 618 AGL   Saskatoon, Canada       47  
## 9 Laird AGL     Saskatoon, Canada       56.8
yy <- yy %>% spread(key = Location, value = Mean_DTF)
yy
## # A tibble: 3 × 4
## # Groups:   Name [3]
##   Name          `Jessore, Bangladesh` `Metaponto, Italy` `Saskatoon, Canada`
##   <chr>                         <dbl>              <dbl>               <dbl>
## 1 CDC Maxim AGL                  86.7               134.                52.5
## 2 ILL 618 AGL                    79.3               138.                47  
## 3 Laird AGL                      76.8               137.                56.8
yy <- yy %>% gather(key = TraitName, value = Value, 2:4)
yy
## # A tibble: 9 × 3
## # Groups:   Name [3]
##   Name          TraitName           Value
##   <chr>         <chr>               <dbl>
## 1 CDC Maxim AGL Jessore, Bangladesh  86.7
## 2 ILL 618 AGL   Jessore, Bangladesh  79.3
## 3 Laird AGL     Jessore, Bangladesh  76.8
## 4 CDC Maxim AGL Metaponto, Italy    134. 
## 5 ILL 618 AGL   Metaponto, Italy    138. 
## 6 Laird AGL     Metaponto, Italy    137. 
## 7 CDC Maxim AGL Saskatoon, Canada    52.5
## 8 ILL 618 AGL   Saskatoon, Canada    47  
## 9 Laird AGL     Saskatoon, Canada    56.8
yy <- yy %>% spread(key = Name, value = Value)
yy
## # A tibble: 3 × 4
##   TraitName           `CDC Maxim AGL` `ILL 618 AGL` `Laird AGL`
##   <chr>                         <dbl>         <dbl>       <dbl>
## 1 Jessore, Bangladesh            86.7          79.3        76.8
## 2 Metaponto, Italy              134.          138.        137. 
## 3 Saskatoon, Canada              52.5          47          56.8

Base Plotting

We will start with some basic plotting using the base function plot()

# A basic scatter plot
plot(x = xd$x8, y = xd$x9)

# Adjust color and shape of the points
plot(x = xd$x8, y = xd$x9, col = "darkred", pch = 0)

plot(x = xd$x8, y = xd$x9, col = xd$x4, pch = xd$x4)

# Adjust plot type 
plot(x = xd$x8, y = xd$x9, type = "line")

# Adjust linetype
plot(x = xd$x8, y = xd$x9, type = "line", lty = 2)

# Plot lines and points
plot(x = xd$x8, y = xd$x9, type = "both")

Now lets create some random and normally distributed data to make some more complicated plots

# 100 random uniformly distributed numbers ranging from 0 - 100
ru <- runif(100, min = 0, max = 100)
ru
##   [1] 52.4567842 14.4607305 63.0414284 90.0664584 13.2141307 76.3469897 47.3484875  7.9576793 85.7559131  6.3583335 68.1490463 63.7086920 61.0282004  9.3563430
##  [15] 37.9924400 89.0397243 54.2681623 47.6845592 63.8047448 98.5050661 52.0340927 78.6843823 88.9231618 71.3631471 70.6649341 47.8113493  3.2189372 34.1418219
##  [29]  4.0474825 14.5953050 24.2460846 15.8094645 71.9718612 54.2105661 94.4825361 66.6023209 31.8099895 61.8989584 81.5266673  0.7799857 84.5652930 90.6022149
##  [43] 42.1854804 78.7180656 59.4582704 61.3639501 93.4714213 29.5930198 38.3888683 67.8368875  6.2433405 56.9435118 55.5889328 86.2686391  0.3003284  5.9010673
##  [57] 85.9629560 36.4819246 64.6454849 31.5907379 56.5036105 14.0766348  0.1024257 54.1649926 88.8596795 64.3005397 92.5568730 73.0302329 87.1608658 90.8249962
##  [71] 56.2670870 48.4250783 93.8677824  9.6053920 62.9929613 91.2244791 18.4178129 63.5584403 95.0778790 14.5908599 67.8644038 79.1334338 79.6510022 69.2340696
##  [85] 88.7618304 46.3271626 39.6145151 99.8495667 19.5244256 70.3485446 22.0628922 55.4643681 74.0608019 23.2167145 66.2087798 65.2212020 17.9152488 85.0634494
##  [99] 32.6578594 10.9571187
plot(x = ru)

order(ru)
##   [1]  63  55  40  27  29  56  51  10   8  14  74 100   5  62   2  80  30  32  97  77  89  91  94  31  48  60  37  99  28  58  15  49  87  43  86   7  18  26  72
##  [40]  21   1  64  34  17  92  53  71  61  52  45  13  46  38  75   3  78  12  19  66  59  96  95  36  50  81  11  84  90  25  24  33  68  93   6  22  44  82  83
##  [79]  39  41  98   9  57  54  69  85  65  23  16   4  42  70  76  67  47  73  35  79  20  88
ru<- ru[order(ru)]
ru
##   [1]  0.1024257  0.3003284  0.7799857  3.2189372  4.0474825  5.9010673  6.2433405  6.3583335  7.9576793  9.3563430  9.6053920 10.9571187 13.2141307 14.0766348
##  [15] 14.4607305 14.5908599 14.5953050 15.8094645 17.9152488 18.4178129 19.5244256 22.0628922 23.2167145 24.2460846 29.5930198 31.5907379 31.8099895 32.6578594
##  [29] 34.1418219 36.4819246 37.9924400 38.3888683 39.6145151 42.1854804 46.3271626 47.3484875 47.6845592 47.8113493 48.4250783 52.0340927 52.4567842 54.1649926
##  [43] 54.2105661 54.2681623 55.4643681 55.5889328 56.2670870 56.5036105 56.9435118 59.4582704 61.0282004 61.3639501 61.8989584 62.9929613 63.0414284 63.5584403
##  [57] 63.7086920 63.8047448 64.3005397 64.6454849 65.2212020 66.2087798 66.6023209 67.8368875 67.8644038 68.1490463 69.2340696 70.3485446 70.6649341 71.3631471
##  [71] 71.9718612 73.0302329 74.0608019 76.3469897 78.6843823 78.7180656 79.1334338 79.6510022 81.5266673 84.5652930 85.0634494 85.7559131 85.9629560 86.2686391
##  [85] 87.1608658 88.7618304 88.8596795 88.9231618 89.0397243 90.0664584 90.6022149 90.8249962 91.2244791 92.5568730 93.4714213 93.8677824 94.4825361 95.0778790
##  [99] 98.5050661 99.8495667
plot(x = ru)

# 100 normally distributed numbers with a mean of 50 and sd of 10
nd <- rnorm(100, mean = 50, sd = 10)
nd
##   [1] 66.18949 63.41009 41.67724 49.98171 47.35647 41.68363 47.43977 50.07419 53.91462 47.58579 67.50219 49.97480 50.24869 55.32699 55.93569 51.23054 46.75811
##  [18] 44.81280 47.25225 51.13797 48.28200 45.84032 39.37441 72.51446 42.99257 62.67297 47.68889 44.13250 72.70050 56.17772 38.79266 55.89314 45.94121 51.46065
##  [35] 40.85170 56.13606 48.89583 54.04488 54.90023 31.62306 41.31319 61.69971 51.98148 50.49855 40.81442 54.33608 57.89450 58.83683 32.10535 45.35180 40.37400
##  [52] 53.36390 60.67670 60.09875 51.30494 69.40725 57.56831 57.79490 75.04703 48.02566 57.23173 66.93459 62.29333 59.95504 57.11141 45.63096 41.76165 52.32158
##  [69] 49.68994 68.49633 40.63804 48.47787 61.69562 45.67371 32.53627 42.80216 50.97950 43.77436 45.47084 59.74376 58.21021 43.08065 54.39976 50.30703 51.86042
##  [86] 39.74156 59.60286 50.49736 39.36414 68.96489 61.68858 29.85843 38.73917 67.27284 39.75639 73.77903 54.91714 47.86674 34.11580 38.12225
nd <- nd[order(nd)]
nd
##   [1] 29.85843 31.62306 32.10535 32.53627 34.11580 38.12225 38.73917 38.79266 39.36414 39.37441 39.74156 39.75639 40.37400 40.63804 40.81442 40.85170 41.31319
##  [18] 41.67724 41.68363 41.76165 42.80216 42.99257 43.08065 43.77436 44.13250 44.81280 45.35180 45.47084 45.63096 45.67371 45.84032 45.94121 46.75811 47.25225
##  [35] 47.35647 47.43977 47.58579 47.68889 47.86674 48.02566 48.28200 48.47787 48.89583 49.68994 49.97480 49.98171 50.07419 50.24869 50.30703 50.49736 50.49855
##  [52] 50.97950 51.13797 51.23054 51.30494 51.46065 51.86042 51.98148 52.32158 53.36390 53.91462 54.04488 54.33608 54.39976 54.90023 54.91714 55.32699 55.89314
##  [69] 55.93569 56.13606 56.17772 57.11141 57.23173 57.56831 57.79490 57.89450 58.21021 58.83683 59.60286 59.74376 59.95504 60.09875 60.67670 61.68858 61.69562
##  [86] 61.69971 62.29333 62.67297 63.41009 66.18949 66.93459 67.27284 67.50219 68.49633 68.96489 69.40725 72.51446 72.70050 73.77903 75.04703
plot(x = nd)

hist(x = nd)

hist(nd, breaks = 20, col = "darkgreen")

plot(x = density(nd))

boxplot(x = nd)

boxplot(x = nd, horizontal = T)


ggplot2

Lets be honest, the base plots are ugly! The ggplot2 package gives the user to create a better, more visually appealing plots. Additional packages such as ggbeeswarm and ggrepel also contain useful functions to add to the functionality of ggplot2.

library(ggplot2)
mp <- ggplot(xd, aes(x = x8, y = x9))
mp + geom_point()

mp + geom_point(aes(color = x3, shape = x3), size = 4)

mp + geom_line(size = 2)

mp + geom_line(aes(color = x3), size = 2)

mp + geom_smooth(method = "loess")

mp + geom_smooth(method = "lm")

xx <- data.frame(data = c(rnorm(50, mean = 40, sd = 10),
                          rnorm(50, mean = 60, sd = 5)),
                 group = factor(rep(1:2, each = 50)),
                 label = c("Label1", rep(NA, 49), "Label2", rep(NA, 49)))
mp <- ggplot(xx, aes(x = data, fill = group))
mp + geom_histogram(color = "black")

mp + geom_histogram(color = "black", position = "dodge")

mp1 <- mp + geom_histogram(color = "black") + facet_grid(group~.)
mp1

mp + geom_density(alpha = 0.5)

mp <- ggplot(xx, aes(x = group, y = data, fill = group))
mp + geom_boxplot(color = "black")

mp + geom_boxplot() + geom_point()

mp + geom_violin() + geom_boxplot(width = 0.1, fill = "white")

library(ggbeeswarm)
mp + geom_quasirandom()

mp + geom_quasirandom(aes(shape = group))

mp2 <- mp + geom_violin() + 
  geom_boxplot(width = 0.1, fill = "white") +
  geom_beeswarm(alpha = 0.5)
library(ggrepel)
mp2 + geom_text_repel(aes(label = label), nudge_x = 0.4)

library(ggpubr)
ggarrange(mp1, mp2, ncol = 2, widths = c(2,1),
          common.legend = T, legend = "bottom")


Statistics

# Prep data
lev_Loc  <- c("Saskatoon, Canada", "Jessore, Bangladesh", "Metaponto, Italy")
lev_Name <- c("ILL 618 AGL", "CDC Maxim AGL", "Laird AGL")
dd <- read_xlsx("data_r_tutorial.xlsx", sheet = "Data") %>%
  mutate(Location = factor(Location, levels = lev_Loc),
         Name = factor(Name, levels = lev_Name))
xx <- dd %>%
  group_by(Name, Location) %>%
  summarise(Mean_DTF = mean(DTF))
xx %>% spread(Location, Mean_DTF)
## # A tibble: 3 × 4
## # Groups:   Name [3]
##   Name          `Saskatoon, Canada` `Jessore, Bangladesh` `Metaponto, Italy`
##   <fct>                       <dbl>                 <dbl>              <dbl>
## 1 ILL 618 AGL                  47                    79.3               138.
## 2 CDC Maxim AGL                52.5                  86.7               134.
## 3 Laird AGL                    56.8                  76.8               137.
# Plot
mp1 <- ggplot(dd, aes(x = Location, y = DTF, color = Name, shape = Name)) +
  geom_point(size = 2, alpha = 0.7, position = position_dodge(width=0.5))
mp2 <- ggplot(xx, aes(x = Location, y = Mean_DTF, 
                      color = Name, group = Name, shape = Name)) +
  geom_point(size = 2.5, alpha = 0.7) + 
  geom_line(size = 1, alpha = 0.7) +
  theme(legend.position = "top")
ggarrange(mp1, mp2, ncol = 2, common.legend = T, legend = "top")

From first glace, it is clear there are differences between genotypes, locations, and genotype x environment (GxE) interactions. Now let’s do a few statistical tests.

summary(aov(DTF ~ Name * Location, data = dd))
##               Df Sum Sq Mean Sq  F value   Pr(>F)    
## Name           2     88      44    3.476   0.0395 *  
## Location       2  65863   32932 2598.336  < 2e-16 ***
## Name:Location  4    560     140   11.044 2.52e-06 ***
## Residuals     45    570      13                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

As expected, an ANOVA shows statistical significance for genotype (p-value = 0.0395), Location (p-value < 2e-16) and GxE interactions (p-value < 2.52e-06). However, all this tells us is that one genotype is different from the rest, one location is different from the others and that there is GxE interactions. If we want to be more specific, would need to do some multiple comparison tests.

If we only have two things to compare, we could do a t-test.

xx <- dd %>% 
  filter(Location %in% c("Saskatoon, Canada", "Jessore, Bangladesh")) %>%
  spread(Location, DTF)
t.test(x = xx$`Saskatoon, Canada`, y = xx$`Jessore, Bangladesh`)
## 
##  Welch Two Sample t-test
## 
## data:  xx$`Saskatoon, Canada` and xx$`Jessore, Bangladesh`
## t = -17.521, df = 32.701, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -32.18265 -25.48402
## sample estimates:
## mean of x mean of y 
##  52.11111  80.94444

DTF in Saskatoon, Canada is significantly different (p-value < 2.2e-16) from DTF in Jessore, Bangladesh.

xx <- dd %>% 
  filter(Name %in% c("ILL 618 AGL", "Laird AGL"),
         Location == "Metaponto, Italy") %>%
  spread(Name, DTF)
t.test(x = xx$`ILL 618 AGL`, y = xx$`Laird AGL`)
## 
##  Welch Two Sample t-test
## 
## data:  xx$`ILL 618 AGL` and xx$`Laird AGL`
## t = 0.38008, df = 8.0564, p-value = 0.7137
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -5.059739  7.059739
## sample estimates:
## mean of x mean of y 
##  137.8333  136.8333

DTF between ILL 618 AGL and Laird AGL are not significantly different (p-value = 0.7137) in Metaponto, Italy.


pch Plot

xx <- data.frame(x = rep(1:6, times = 5, length.out = 26),
                 y = rep(5:1, each = 6, length.out = 26),
                 pch = 0:25)
mp <- ggplot(xx, aes(x = x, y = y, shape = as.factor(pch))) +
  geom_point(color = "darkred", fill = "darkblue", size = 5) +
  geom_text(aes(label = pch), nudge_x = -0.25) +
  scale_shape_manual(values = xx$pch) +
  scale_x_continuous(breaks = 6:1) +
  scale_y_continuous(breaks = 6:1) +
  theme_void() +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5),
        axis.text = element_blank(),
        axis.ticks = element_blank()) +
  labs(title = "Plot symbols in R (pch)",
       subtitle = "color = \"darkred\", fill = \"darkblue\"",
       x = NULL, y = NULL)
ggsave("pch.png", mp, width = 4.5, height = 3, bg = "white")


R Markdown

Tutorials on how to create an R markdown document like this one can be found here:



dblogr/


© Derek Michael Wright