##
## +-----------------------+
## | |
## | "SPECIES DATA" |
## | LECTURES BY C. WEBB |
## | LECTURE 2 |
## | |
## +-----------------------+
##
## ######################### LECTURE 2: R INTRO #########################
## ### Language structure
## R can be used like a basic calculator, with basic operations:
1
1+1
1/0
1:10
## and functions:
sqrt(2)
rep(1,10)
## ### Getting help
## Help is always available by locating the HTML pages installed with R,
## by typing `help.start()`, or by typing `help(...)` e.g.:
help(cbind)
?cbind
help.start()
## Demonstrations are also available for many packages:
demo(graphics)
## ### Basic Objects
## A symbol (in R, a name containing at least one letter) represents R
## objects. An objects contents is assigned with the `<-` operator
## (though `=` also works):
a <- 1
a # typing just the object’s name prints out the object
a = 1 # alternate syntax
a
a <- 1+1
a
a <- 3 # overwrite it
a
## You can manage objects by listing them and deleting them:
ls()
b <- a
ls()
rm(a)
ls()
## And you can see the command history with:
history()
## R has various data ‘modes’ or data types: logical, character,
## numeric, integer (and lists). When you assign a character to an
## object it then has a different mode from a number:
is.character(b)
c <- d # error, there is no d
c <- "d"
c
is.character(c)
d <- "hello"
d
ls()
## The most basic objects in R are vectors of 0 dimensions and
## up. That is: i) single variables (0 dimensions), ii) a linear
## sequence of variables (aka ‘vector;’ one dimension), then iii) a
## matrix, or array (two dimensions), etc.
length(b)
f <- c(1, 2, 3) # c() means concatenate
f
length(f)
f <- c(1, 2,
4) # commands can span multiple lines
f
c(b, b, 4, 5)
f <- c(f, 25) # objects can appear on both right and left of the `<-`
f
f * 2 # operations on vectors are applied to each element
## String vectors behave similarly:
g <- c( "a" , "b", "z" )
g
i <- "q"
i
g <- c(i, g)
## Note that vector elements can be of only one type, and so, e.g.,
## the integers are ‘coerced’ to character mode:
c(1, "a", 2, "b")
## A _list_ can contain multiple data types, and is itself another
## ‘data mode.’ A non-atomic one. The output of many functions are
## list objects. Parts of the list can be accessed by names (if
## supplied), using a `$` syntax.
list(1, "a", 2, "b")
list(part1=1, part2="a", part3=c(2,1), part4= "b")
list(part1=1, part2="a", part3=c(2,1), part4= "b")$part4
list(part1=1, part2="a", part3=c(2,1), part4= "b")[[3]]
## Elements of a dimensional object are referenced by indices
## (beginning at 1 for the first element)
g
g[2]
g[3]
## Non-specified indexes include all unrestricted elements of the
## object:
g[]
## Dropping elements is done with negative values:
g[-2]
g[-21] # there is no 21st element
g[-1]
## Indexing by a series of indices is also possible:
c(2,3)
g[ c(2,3) ]
## As is indexing by a sequence (2:4 means all integer values between 2 and 4
## inclusive):
g[2:4]
## Subsets of vectors and arrays can be created by including logical
## statements (vectors) as indices.
g != "a" # Logical vector: g elements not equal to "a"
g[ g != "a" ]
g[ (g != "a") ] # parentheses can sometimes help readability
## Matrices or arrays are two dimensional objects.
v1 <- c(1,2,3)
v1
v2 <- c(11,12,13)
v2
cbind(v1, v2) # bind as columns
rbind(v1, v2) # bind as rows
m <- cbind(v1, v2)
m
## For two dimensional objects the first index is the row number, the
## second the column:
m[1,1]
m[1,2]
m[ ,2]
m[2, ]
## Subsetting again: e.g., print all columns of object m, for the rows
## in which v2 is less than 13:
m[,2] < 13
m[ (m[,2] < 13), ]
## **Data frames** are a special form of arrays that can contain columns
## with different types of data (they are actually a form of lists)
v1 # object names can contain numbers
v3 <- c("a", "b", "c")
v3
cbind(v1, v3) # coercion happens again here, but not here:
data.frame(v1, v3)
j.df <- data.frame(v1, v3)
j.df # as objects become more complex, it can
# be helpful to use a suffix (e.g., `.df`)
# to help remember the nature of the object.
## See how the first column is the name of the rows (by default an
## integer), and the first row is the name of the columns (by default
## X and an integer). Column names can be set with:
colnames(j.df) <- c("size","plot")
j.df
j.df[ , 1]
j.df[ 1, ]
j.df$plot
j.df$plot[3]
## An important feature of basic objects is their ‘class,’
## particularly if they are factors or not (factors in the sense of
## experimental independent variables). Factors have levels.
class(j.df$size)
## Nope. However, a character column is also by default a factor, with
## levels:
class(j.df$plot)
is.factor(j.df$plot)
levels(j.df$plot)
## Non-factor vectors can be forced to be factors:
j.df$sizeAsFactor <- as.factor(j.df$size)
j.df
j.df$sizeAsFactor <- NULL # zap it!
# To zap many cols: df[,c("col1","col2")]<-list(NULL)
j.df
## Subsetting data frames is a common activity.
j.df$size > 1
j.df[ (j.df$size > 1) , ]
j.df[ c(2,3) , ]
j.df$plot
j.df$plot != "b"
j.df [ j.df$plot != "b" , ]
## ### Reading in external data
## The import function `read.table` creates a data frame from raw
## text. Most data that you might want to read into R is of a data
## frame (mixed variable types) type. For the following, download the
## file: [mydata.txt](mydata.txt).
read.table("mydata.txt")
read.table("mydata.txt", sep="|")
read.table("mydata.txt", sep="|", header=TRUE)
df2 <- read.table("mydata.txt", sep="|", header=TRUE)
df2
head(df2) # handy! And these:
str(df2)
summary(df2)
attributes(df2)
dim(df2)
## Checking factors (these can also be set in the `read.table` function):
is.factor(df2$moisture)
is.factor(df2$rep)
is.factor(df2$length)
## ### Data manipulation
## At first, one of the most frustrating things about R is how
## difficult it seems to manipulate and reshape data tables. Here are
## a few tips. First, summarize with a table:
table(df2$rep, df2$moisture)
tab1 <- table(df2$rep, df2$moisture)
# A 'table' object must be converted to matrix or DF for most other uses:
as.data.frame(tab1)
as.matrix(tab1)
as.data.frame.matrix(tab1)
## Run a function across some factors:
tapply(df2$length, INDEX=df2$moisture, FUN=mean)
tapply(df2$length, INDEX=list(df2$moisture , df2$rep), FUN=mean)
# Unlike the table function, tapply creates a matrix
## Another way of aggregating:
aggregate(df2$length, by=df2[,c("moisture","rep")], FUN=mean)
## ‘Stacking’ a table:
tab2 <- as.matrix(tab1)
tab2[1,2] <- 0
tab2[3,1] <- 0
tab2
data.frame(expand.grid(dimnames(tab2)), n = as.vector(tab2))
## Joining information from two tables:
df3 <- data.frame(rep=c("a","b","c"), status=c("OK","OK","bust"))
df3
merge(df2, df3, by="rep")
# Alternatively, a version of VLOOKUP():
match(df2$rep, df3$rep)
df2$status <- df3$status[ match(df2$rep, df3$rep) ]
head(df2)
## ---------------------------------------------------------------------
## ## Basic statistics
## ### Data summaries
## Create some random data, and find the mean:
rnorm(n=30, mean=1, sd=1)
mean(rnorm(n=30, mean=1, sd=1))
## See how the sample mean approaches the population mean with larger samples:
mean(rnorm(n=300, mean=1, sd=1))
mean(rnorm(n=3000, mean=1, sd=1))
mean(rnorm(n=30000, mean=1, sd=1))
## Other stats:
my.norm <- rnorm(n=300, mean=1, sd=1)
my.norm
sd(my.norm)
median(my.norm)
summary(my.norm)
## Find the median another way:
sort(my.norm)
sort(my.norm)[150]
sort(my.norm)[151]
median(my.norm) # hmm... not the same. Ahh: median() is interpolated
## Now some **graphics** (a histogram). This will open a new window, a graphics window:
hist(my.norm)
## You can fiddle endlessly with the parameters to make your image
## look good, but here are the basics:
hist(my.norm, main="My histogram", xlab="a continuous variable",
ylab="frequency", sub="subtitle", breaks=25)
## Now an x-y plot. The form is `plot(x,y)`. Alternatively ‘~’ means ‘as
## a function of’.
plot(df2$height ~ df2$fertility)
## Separating the factors. First plot a blank frame:
plot(df2$fertility, df2$height, type="n")
## Then add the points:
points(df2$fertility[df2$moisture=="DRY"],
df2$height[df2$moisture=="DRY"], col = "red")
points(df2$fertility[df2$moisture=="WET"],
df2$height[df2$moisture=="WET"], col = "blue")
## The default graphics device is the screen. To capture a graphic, a
## different device should be specified:
jpeg(filename = "histogram.jpg", width = 480, height = 480,
pointsize = 12, bg = "white")
## Run the plot
hist(my.norm)
## Switch off the device:
dev.off()
## The file is made in the active directory. Other devices include
## **pdf**, ps, png, xfig
## ### Simple tests
## t-test:
x1 <- rnorm(n=100, mean=5, sd=2)
x2 <- rnorm(n=50, mean=6, sd=3)
t.test(x1,x2)
## Alternative formulation:
x <- c(x1,x2)
group <- as.factor( c( rep(1,times=100), rep(2,times=50) ) )
t.test(x ~ group)
plot(x ~ group)
## Note that the two ‘columns’ x and group do not have to belong to the
## same object. They must have the same length though.
## Non-parametric
wilcox.test(x1,x2, paired= FALSE)
## ### Linear models
## Make some data and view it:
x <- rnorm(n=100, mean=5, sd=2)
y <- (x * 0.5) + rnorm(n=100, mean=2, sd=1)
plot(x,y, xlim=c(0,10), ylim=c(0,10))
## `lm()` is the basic least-squares linear model function (i.e. ANOVA
## and regression). The output of lm() is an object:
xy.lm <- lm(y ~ x)
summary(xy.lm)
## Add the regression line:
abline(xy.lm, col="red")
## Now some deviant (heteroscedastic) data of the same type
y <- (x * rnorm(n=100, mean=0.5, sd=0.25)) + 2
xy.lm <- lm(y ~ x)
plot(x,y, xlim=c(0,10), ylim=c(0,10))
abline(xy.lm, col="red")
## Examine the residuals
plot(fitted(xy.lm), resid(xy.lm))
qqnorm(resid(xy.lm))
qqline(resid(xy.lm))
## Test for normality of the residuals:
hist(resid(xy.lm))
shapiro.test(resid(xy.lm))
## But we can still use a non-parametric test:
cor.test(x, y, method = "spearman", alternative="two.sided")
## Now a more complicated model. The formula is `Y ~ X1 + X2 + X1:X1`
## ... two factors and an interaction term:
data.lm <- lm(height ~ fertility + moisture + fertility:moisture, data=data.df)
summary(data.lm)
## The following two sections introduce more complex analyses.
## ### Randomization methods
## Increasingly, modern statistical approaches use randomization and
## re-sampling tests (bootstrapping) to deal with ugly distributions
## and non-normal events. Here is a simple example:
## We define a function which is the mean of the minimum distance between
## the two closed pairs of observations:
meanmindist <- function(x){
z <- as.matrix(dist(x))
z[row(z)==col(z)] <- max(z) # recode the identities
mean( apply(z,1,min) )
}
## We have a universe of possible observations (e.g. body sizes of
## organisms in region)
all <- c(1, 2, 3, 3, 4, 4, 4, 4, 5, 5, 6, 7)
## We have an observed set of 4 organisms, e.g. in a community:
obs <- c(1, 3, 5, 7)
## Question: is the mean difference between organism size more than
## expected by chance?
obs.md <- meanmindist(obs)
## Let’s sample 1000 cases of 4 organisms:
exp.md <- numeric(1000) # create an empty vector of 1000 slots
for (i in 1:1000) {
exp.md[i] <- meanmindist( sample(all, size=4) )
}
## View the result, with observed line in red:
hist(exp.md)
lines(c(obs.md,obs.md),c(0,par("usr")[4]),col="red")
## Get the quantile (sort of ... note the effect of ties):
match(obs.md, sort(exp.md))
## So the observed distances are greater than expected.
## ### Quitting R
## Finally, to leave R, type:
q()
## If you say yes to the saving data, your workspace will be saved as `.RData` and your history as `.Rhistory`.
## ---------------------------------------------------------------------