# Scripts included in the paper 
# Rinde et al. 2014. The influence of physical factors on kelp and sea urchin distribution
# in previously and still grazed areas in the NE Atlantic. Plos One.DOI: 10.1371/journal.pone.0100222


# BRT analysis of the kelp Laminaria hyperborea

setwd("......") 
# set the path to the working directory where you place the data file

library(gdata) # library which support import of excelfiles


data_rinde = read.xls("modelldata.xlsx")       # imports the model data as a dataframe
page(data_rinde, method="print") # to have a look at the data
names(data_rinde)
# [1] "DATO"       "RESTORTARE" "PRESSD"     "longres"    "LAT"        "DEM"        "logswm"     "RADOPT"    
# [9] "SLOPE"      "FORS525"    "spmax5r"    "spmin5r"    "saltmax5r"  "tempmean5 

# RESTORTARE contains presence (1) and absence (0) values for recovered L. hyperborea kelp
# PESSD contains presence (1) and absence (0) values for the sea urchin Strongylocentrotus droebachiensis

dim(data_rinde) # 1220 14 

# open libraries applied to perform the following analysis
library(dismo)
library(raster)
library(stats)
library(utils)
library(sp)
library(rgdal)

# Performs cross-validation optimisation of a boosted regression tree (BRT) model 
# for RESTORTARE and using a family of bernoulli 
# Using 1220 observations and 11 predictors 

lh.11pred.gbm <- gbm.step(data = data_rinde,        
                          gbm.x = c(4:14),                  
                          gbm.y = 2,   # response = presence of kelp recovery; called RESTORTARE                                         
                          learning.rate = 0.007,         
                          tree.complexity = 5,
                          family = "bernoulli",
                          bag.fraction = 0.5)


# ProvideS the following summary of the analysis (will vary between different runs):

# fitting final gbm model with a fixed number of  1450  trees for  RESTORTARE 
# mean total deviance = 0.989 
# mean residual deviance = 0.33 
# estimated cv deviance = 0.591 ; se = 0.027 
# training data correlation = 0.86 
# cv correlation =  0.649 ; se = 0.024 
# training data AUC score = 0.981 
# cv AUC score = 0.908 ; se = 0.007 
# elapsed time -  0.47 minutes

summary(lh.11pred.gbm) 
# provides the relative influence of each predictor:


# var   rel.inf
# logswm         logswm 20.662560
# saltmax5r   saltmax5r 11.673489
# FORS525       FORS525 10.737664
# tempmean5r tempmean5r  9.747619
# LAT               LAT  9.038234
# spmin5r       spmin5r  8.155923
# DEM               DEM  8.075935
# spmax5r       spmax5r  7.306100
# longres       longres  5.817268
# SLOPE           SLOPE  4.905121
# RADOPT         RADOPT  3.880087

gbm.plot(lh.11pred.gbm, plot.layout = c(3,4)) # plots the response curves for each predictor variable


lh.11pred.gbm.fitvalues <- lh.11pred.gbm$fitted  # extracts the fitted values
lh.11pred.gbm.residuals <- lh.11pred.gbm$residuals  # extracts the residuals

write.table(lh.11pred.gbm.fitvalues, file = "lh_BRT.11pred_fit.txt")
write.table(lh.11pred.gbm.residuals, file = "lh_BRT.11pred_resid.txt")


# identify interactioons within the model:
int.modellh <-gbm.interactions(lh.11pred.gbm)
#gbm.interactions - version 2.9 
#Cross tabulating interactions for gbm model with 11 predictors
#1  2  3  4  5  6  7  8  9  10 

# View list of interaction effects
int.modellh$rank.list
# var1.index var1.names var2.index var2.names int.size
# 1          4     logswm          3        DEM    63.70
# 2          7    FORS525          3        DEM    53.35
# 3          7    FORS525          4     logswm    20.46
# 4         11 tempmean5r          1    longres    16.70
# 5         11 tempmean5r          3        DEM    13.41
# 6         11 tempmean5r          9    spmin5r    12.06



# 3D plots of the two most pronounced interactions: 
par(mfrow=c(1,1))
gbm.perspec(lh.11pred.gbm,4,3,smooth="average")

par(mfrow=c(1,1))
gbm.perspec(lh.11pred.gbm,7,3,smooth="average")



# GAM models


library(mgcv)  # library for gam analysis

library(MuMIn)  # library for creating model selection tables


# GAM models with interaction between latitude and depth

lh.k3intdemlat <- gam(RESTORTARE ~ s(DEM, bs="cr", k=3) + s(SLOPE, bs="cr", k=3) + s(RADOPT, bs="cr", k=3) +  s(FORS525, bs="cr", k=3) + s(logswm, bs="cr", k=3) + s(DEM,LAT, k=6) + s(LAT, bs="cr", k=3) + s(longres, bs="cr", k=3) + s(tempmean5r, bs="cr", k=3) + s(saltmax5r, bs="cr", k=3) + s(spmax5r, bs="cs") + s(spmin5r, bs="cr", k=3), family = binomial, method = "ML", data=data_rinde)

# creates a subset of models to be exclued in the model selection procedure
msubset <-  expression(!`s(DEM, LAT, k = 6)` | xor(`s(DEM, bs = "cr", k = 3)`| `s(LAT, bs = "cr", k = 3)`, `s(DEM, LAT, k = 6)`))

options(na.action="na.fail") # needs to be set before performing the call to dredge below

dredge.lh.k3intdemlat <- dredge(lh.k3intdemlat, subset = msubset)        # creates MuMIn selection table 

write.table(dredge.lh.k3intdemlat, file = "dredge.lh.k3intdemlat.txt",sep = ";") # writes the selection table to a text file that can be opened in Excel

top.lh.k3intdemlat.dlt4 <- get.models(dredge.lh.k3intdemlat, delta <4)  # create a subset of models with delta AICc <4

n <- length(top.lh.k3intdemlat.dlt4)  # number of models with delta AICc < 4; 5

# create an average model from this subset of models
avg.lh.k3intdemlat.dlt4 <- model.avg(dredge.lh.k3intdemlat, delta <4, se.fit =T)   

imp.top.lh.k3intdemlat.dlt4 <- importance(top.lh.k3intdemlat.dlt4) # calculates the relative importance of each predictor given this subset of models

write.table(imp.top.lh.k3intdemlat.dlt4, file = "imp.top.lh.k3intdemlat.dlt4.txt",sep = ";")

# The syntax to display partial effects for the best model, with lowest AICc value is:
plot(top.lh.k3intdemlat.dlt4$`3998`,all.terms=T, pages=1,scale=0)   
# this model excludes the interaction between latitude and depth, and the predictors longres and radopt. 

summary(top.lh.k3intdemlat.dlt4$`3998`)  # provides the summary of this model


# Models without interaction between latitude and depth; model selection, and model averaging
lh.k3Xintdemlat <- gam(RESTORTARE ~ s(DEM, bs="cr", k=3) + s(SLOPE, bs="cr", k=3) + s(RADOPT, bs="cr", k=3) +  s(FORS525, bs="cr", k=3) + s(logswm, bs="cr", k=3) + s(LAT, bs="cr", k=3) + s(longres, bs="cr", k=3) + s(tempmean5r, bs="cr", k=3) + s(saltmax5r, bs="cr", k=3) + s(spmax5r, bs="cs") + s(spmin5r, bs="cr", k=3), family = binomial, method = "ML", data=data_rinde)


dredge.lh.k3Xintdemlat <- dredge(lh.k3Xintdemlat)        # creates MuMIn selection table 

write.table(dredge.lh.k3Xintdemlat, file = "dredge.lh.k3Xintdemlat.txt",sep = ";") # writes the selection table to a text file that can be opened in Excel

top.dredge.lh.k3Xintdemlat.dlt4 <- get.models(dredge.lh.k3Xintdemlat, delta <4)  # create a subset of models with delta AICc <4

n <- length(top.dredge.lh.k3Xintdemlat.dlt4)  # number of models with delta AICc < 4; 5

# create an average model from this subset of models
avg.lh.k3Xintdemlat.dlt4 <- model.avg(dredge.lh.k3Xintdemlat, delta <4, se.fit =T)   

imp.top.dredge.lh.k3Xintdemlat.dlt4 <- importance(top.dredge.lh.k3Xintdemlat.dlt4) # calculates the relative importance of each predictor given this subset of models

write.table(imp.top.dredge.lh.k3Xintdemlat.dlt4, file = "imp.top.dredge.lh.k3Xintdemlat.dlt4.txt",sep = ";")

# The syntax to display partial effects for the best model, with lowest AICc value is:
plot(top.dredge.lh.k3Xintdemlat.dlt4$`2000`,all.terms=T, pages=1,scale=0)   
# this model excludes the predictors longres and radopt, and is equal to 3998 above

summary(top.dredge.lh.k3Xintdemlat.dlt4$`2000`)  # provides the summary of this model


# EVALUATION WITH TESTDATA


# Read test data

testdta = read.xls("testdata.xlsx")

page(testdta,method="print")  

names(testdta)
# [1] "DATO"       "RESTORTARE" "PRESSD"     "longres"    "LAT"        "DEM"        "logswm"     "RADOPT"    
# [9] "SLOPE"      "FORS525"    "spmax5r"    "spmin5r"    "saltmax5r"  "tempmean5r"

dim(testdta)      
# [1] 403  14       

head(testdta)


# predicts the response of the BRT model based on the predictorvariables in the test data:

predlh.11pred <-  predict(lh.11pred.gbm, n.trees=lh.11pred.gbm$gbm.call$best.trees, testdta, ext=NULL, progress="windows",  type="response")

# predicts the response of the GAM models based on the predictorvariables in the test data:


avgint <- predict(avg.lh.k3intdemlat.dlt4, newdata = testdta, se.fit = TRUE, type="response")
bestGam <- predict(top.lh.k3intdemlat.dlt4$`3998`, newdata = testdta, se.fit = TRUE, type="response")
avgXint <- predict(avg.lh.k3Xintdemlat.dlt4, newdata = testdta, se.fit = TRUE, type="response")

# creates a table with the predicted probability from each model to the observations in the testdata
predtestdataLH <- data.frame(cbind(bestGam$fit, avgint$fit, avgXint$fit, predlh.11pred)) 

write.table(predtestdataLH, file = "predtestdataLH.txt",sep = ";")


# Evaluation of the four established models
# by use of Freeman og Moison (2008)s R package PresenceAbsence

library(PresenceAbsence)


SPDATA = read.xls("final_lhmodels_tested.xlsx")      

# imports a dataset with modelled probability values for the four Lh models to be tested

page(SPDATA,method="print")
head(SPDATA)

#  SPECIES OBSERVED     mod2000      avgint     avgxint        BRT
#1  L. hyp        0 0.006073224 0.005948949 0.005948977 0.01854721
#2  L. hyp        0 0.421973551 0.424368329 0.424368216 0.10990500
#3  L. hyp        0 0.231286917 0.229956316 0.229953907 0.29268415
#4  L. hyp        1 0.627407475 0.629394275 0.629393730 0.80008745
#5  L. hyp        1 0.581360474 0.580508277 0.580508342 0.91973753
#6  L. hyp        1 0.283543501 0.282951165 0.282951184 0.75134566

#defines some variables:

species <- as.character(unique(SPDATA$SPECIES))     
model.names <- as.character(names(SPDATA)[-c(1, 2)])
N.models <- ncol(SPDATA) - 2
N.sp <- length(species)
N.obs <- length(SPDATA$SPECIES[SPDATA$SPECIES == species[1]])
Obs.prev <- table(SPDATA$SPECIES, SPDATA$OBSERVED)[, 2]/N.obs
Obs.prev <- round(Obs.prev, digits = 2)


#4.2. Accuracy tables
#The function presence.absence.accuracy calculates basic accuracy measures (PCC, sensitivity,
#specifcity, Kappa, and AUC) for presence-absence data, and (optionally) the associated
#standard deviations. The PresenceAbsence package uses the method from DeLong et al.
#(1988) to calculate AUC and its associated standard deviation. By default the function will
#calculate all basic accuracy measures at the threshold of 0.5:

sp <- 1
DATA <- SPDATA[SPDATA$SPECIES == species[sp], ]
accu <- presence.absence.accuracy(DATA)
accu[, -c(1, 2)] <- signif(accu[, -c(1, 2)], digits = 2)
print(paste("Species:", species[sp]))
accu

#    model threshold  PCC sensitivity specificity Kappa  AUC PCC.sd sensitivity.sd specificity.sd Kappa.sd AUC.sd
#1 mod2000       0.5 0.87        0.50        0.97  0.55 0.89  0.017          0.052         0.0090    0.052  0.020
#2  avgint       0.5 0.87        0.50        0.97  0.55 0.89  0.017          0.052         0.0090    0.052  0.020
#3 avgxint       0.5 0.87        0.50        0.97  0.55 0.89  0.017          0.052         0.0090    0.052  0.020
#4     BRT       0.5 0.88        0.57        0.97  0.61 0.93  0.016          0.052         0.0095    0.050  0.016


# calculates optimal cut-off thresholds given different options. The youden index is the one called MaxSens+Spec

sp <- "L. hyp"
DATA <- SPDATA[SPDATA$SPECIES == sp, ]
print(paste("Species:", sp))

optimal.thresholds(DATA, opt.methods = 1:12, req.sens = 0.9, req.spec = 0.9, FPC = 2, FNC = 1)
#
#         Method   mod2000    avgint   avgxint       BRT
#1       Default 0.5000000 0.5000000 0.5000000 0.5000000
#2     Sens=Spec 0.2300000 0.2300000 0.2300000 0.1800000
#3  MaxSens+Spec 0.2400000 0.2300000 0.2300000 0.2700000
#4      MaxKappa 0.3500000 0.3500000 0.3500000 0.3100000
#5        MaxPCC 0.4766667 0.4600000 0.4600000 0.3150000
#6  PredPrev=Obs 0.3500000 0.3500000 0.3500000 0.3200000
#7       ObsPrev 0.2282878 0.2282878 0.2282878 0.2282878
#8      MeanProb 0.2026718 0.2026928 0.2026927 0.1946395
#9    MinROCdist 0.2400000 0.2300000 0.2300000 0.1700000
#10      ReqSens 0.0800000 0.0800000 0.0800000 0.1000000
#11      ReqSpec 0.3400000 0.3400000 0.3400000 0.2500000
#12         Cost 0.4850000 0.4850000 0.4850000 0.6200000



# Calibration plots for the models

par(oma = c(0, 5, 0, 0), mar = c(3, 3, 3, 1), mfrow = c(2,2), cex = 0.7, cex.lab = 1.4, mgp = c(2, 0.5, 0))
sp <- c("L. hyp")

for (mod in 1:4) {
  calibration.plot(DATA, which.model = mod, color = mod +
                     1, main = model.names[mod], xlab = "Pred Prob Occ", ylab = "Obs Occ as Prop of Sites Surveyed")
  if (mod == 1) {
    mtext(sp[1], side = 3, line = 6.3, cex = 1.6)
    mtext(paste("(", Obs.prev[names(Obs.prev) == sp[1]],Obs.prev[1],
                "Prevalence )"), side = 3, line = 4.3, cex = 1.3)
  }
}