# Scripts included in the paper
# Rinde et al. 2014. The influence of physical factors on kelp and sea urchin distribution
# in previously and still grazed areas in the NE Atlantic. PLoS One. DOI: 10.1371/journal.pone.0100222

# BRT analysis of the kelp Laminaria hyperborea

setwd("......")                           # set the path to the working directory where you place the data file
library(gdata)                            # library that supports import of Excel files
data_rinde = read.xls("modelldata.xlsx")  # imports the model data as a data frame
page(data_rinde, method = "print")        # to have a look at the data
names(data_rinde)
# [1] "DATO"       "RESTORTARE" "PRESSD"     "longres"    "LAT"        "DEM"        "logswm"     "RADOPT"
# [9] "SLOPE"      "FORS525"    "spmax5r"    "spmin5r"    "saltmax5r"  "tempmean5r"
# RESTORTARE contains presence (1) and absence (0) values for recovered L. hyperborea kelp
# PRESSD contains presence (1) and absence (0) values for the sea urchin Strongylocentrotus droebachiensis
dim(data_rinde)
# [1] 1220   14

# open the libraries used to perform the following analysis
library(dismo)
library(raster)
library(stats)
library(utils)
library(sp)
library(rgdal)

# Performs cross-validation optimisation of a boosted regression tree (BRT) model
# for RESTORTARE, using a Bernoulli family,
# 1220 observations and 11 predictors
lh.11pred.gbm <- gbm.step(data = data_rinde,
                          gbm.x = c(4:14),
                          gbm.y = 2,        # response = presence of kelp recovery; called RESTORTARE
                          learning.rate = 0.007,
                          tree.complexity = 5,
                          family = "bernoulli",
                          bag.fraction = 0.5)
# Provides the following summary of the analysis (will vary between different runs):
# fitting final gbm model with a fixed number of 1450 trees for RESTORTARE
# mean total deviance = 0.989
# mean residual deviance = 0.33
# estimated cv deviance = 0.591 ; se = 0.027
# training data correlation = 0.86
# cv correlation = 0.649 ; se = 0.024
# training data AUC score = 0.981
# cv AUC score = 0.908 ; se = 0.007
# elapsed time - 0.47 minutes

summary(lh.11pred.gbm) # provides the relative influence of each predictor:
#                   var   rel.inf
# logswm         logswm 20.662560
# saltmax5r   saltmax5r 11.673489
# FORS525       FORS525 10.737664
# tempmean5r tempmean5r  9.747619
# LAT               LAT  9.038234
# spmin5r       spmin5r  8.155923
# DEM               DEM  8.075935
# spmax5r       spmax5r  7.306100
# longres       longres  5.817268
# SLOPE           SLOPE  4.905121
# RADOPT         RADOPT  3.880087

gbm.plot(lh.11pred.gbm, plot.layout = c(3, 4)) # plots the response curves for each predictor variable

lh.11pred.gbm.fitvalues <- lh.11pred.gbm$fitted    # extracts the fitted values
lh.11pred.gbm.residuals <- lh.11pred.gbm$residuals # extracts the residuals
write.table(lh.11pred.gbm.fitvalues, file = "lh_BRT.11pred_fit.txt")
write.table(lh.11pred.gbm.residuals, file = "lh_BRT.11pred_resid.txt")

# identify interactions within the model:
int.modellh <- gbm.interactions(lh.11pred.gbm)
# gbm.interactions - version 2.9
# Cross tabulating interactions for gbm model with 11 predictors
# 1 2 3 4 5 6 7 8 9 10

# View list of interaction effects
int.modellh$rank.list
#   var1.index var1.names var2.index var2.names int.size
# 1          4     logswm          3        DEM    63.70
# 2          7    FORS525          3        DEM    53.35
# 3          7    FORS525          4     logswm    20.46
# 4         11 tempmean5r          1    longres    16.70
# 5         11 tempmean5r          3        DEM    13.41
# 6         11 tempmean5r          9    spmin5r    12.06

# 3D plots of the two most pronounced interactions:
par(mfrow = c(1, 1))
gbm.perspec(lh.11pred.gbm, 4, 3, smooth = "average")
par(mfrow = c(1, 1))
gbm.perspec(lh.11pred.gbm, 7, 3, smooth = "average")
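
# Optional checks, not part of the published workflow: gbm.plot.fits() (dismo) plots the
# fitted BRT values against each predictor, and the relative influence table returned by
# summary() can be written to file in the same way as the fitted values above.
# The output file name below is only an example.
gbm.plot.fits(lh.11pred.gbm)
lh.11pred.gbm.relinf <- summary(lh.11pred.gbm, plotit = FALSE)
write.table(lh.11pred.gbm.relinf, file = "lh_BRT.11pred_relinf.txt")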

# GAM models
library(mgcv)  # library for gam analysis
library(MuMIn) # library for creating model selection tables

# GAM models with interaction between latitude and depth
lh.k3intdemlat <- gam(RESTORTARE ~ s(DEM, bs = "cr", k = 3) + s(SLOPE, bs = "cr", k = 3) + s(RADOPT, bs = "cr", k = 3) +
                        s(FORS525, bs = "cr", k = 3) + s(logswm, bs = "cr", k = 3) + s(DEM, LAT, k = 6) +
                        s(LAT, bs = "cr", k = 3) + s(longres, bs = "cr", k = 3) + s(tempmean5r, bs = "cr", k = 3) +
                        s(saltmax5r, bs = "cr", k = 3) + s(spmax5r, bs = "cs") + s(spmin5r, bs = "cr", k = 3),
                      family = binomial, method = "ML", data = data_rinde)

# defines a subset rule for the model selection procedure, so that the s(DEM, LAT)
# interaction term is never combined with the separate DEM and LAT smooths
msubset <- expression(!`s(DEM, LAT, k = 6)` | xor(`s(DEM, bs = "cr", k = 3)` | `s(LAT, bs = "cr", k = 3)`, `s(DEM, LAT, k = 6)`))
options(na.action = "na.fail") # needs to be set before performing the call to dredge below
dredge.lh.k3intdemlat <- dredge(lh.k3intdemlat, subset = msubset) # creates the MuMIn selection table
write.table(dredge.lh.k3intdemlat, file = "dredge.lh.k3intdemlat.txt", sep = ";") # writes the selection table to a text file that can be opened in Excel
top.lh.k3intdemlat.dlt4 <- get.models(dredge.lh.k3intdemlat, delta < 4) # creates a subset of models with delta AICc < 4
n <- length(top.lh.k3intdemlat.dlt4) # number of models with delta AICc < 4; here 5
# create an average model from this subset of models
avg.lh.k3intdemlat.dlt4 <- model.avg(dredge.lh.k3intdemlat, delta < 4, se.fit = T)
imp.top.lh.k3intdemlat.dlt4 <- importance(top.lh.k3intdemlat.dlt4) # calculates the relative importance of each predictor given this subset of models
write.table(imp.top.lh.k3intdemlat.dlt4, file = "imp.top.lh.k3intdemlat.dlt4.txt", sep = ";")

# The syntax to display partial effects for the best model (the one with the lowest AICc value) is:
plot(top.lh.k3intdemlat.dlt4$`3998`, all.terms = T, pages = 1, scale = 0)
# this model excludes the interaction between latitude and depth, and the predictors longres and RADOPT
summary(top.lh.k3intdemlat.dlt4$`3998`) # provides the summary of this model

# Models without interaction between latitude and depth; model selection and model averaging
lh.k3Xintdemlat <- gam(RESTORTARE ~ s(DEM, bs = "cr", k = 3) + s(SLOPE, bs = "cr", k = 3) + s(RADOPT, bs = "cr", k = 3) +
                         s(FORS525, bs = "cr", k = 3) + s(logswm, bs = "cr", k = 3) + s(LAT, bs = "cr", k = 3) +
                         s(longres, bs = "cr", k = 3) + s(tempmean5r, bs = "cr", k = 3) + s(saltmax5r, bs = "cr", k = 3) +
                         s(spmax5r, bs = "cs") + s(spmin5r, bs = "cr", k = 3),
                       family = binomial, method = "ML", data = data_rinde)
dredge.lh.k3Xintdemlat <- dredge(lh.k3Xintdemlat) # creates the MuMIn selection table
write.table(dredge.lh.k3Xintdemlat, file = "dredge.lh.k3Xintdemlat.txt", sep = ";") # writes the selection table to a text file that can be opened in Excel
top.dredge.lh.k3Xintdemlat.dlt4 <- get.models(dredge.lh.k3Xintdemlat, delta < 4) # creates a subset of models with delta AICc < 4
n <- length(top.dredge.lh.k3Xintdemlat.dlt4) # number of models with delta AICc < 4; here 5
# create an average model from this subset of models
avg.lh.k3Xintdemlat.dlt4 <- model.avg(dredge.lh.k3Xintdemlat, delta < 4, se.fit = T)
imp.top.dredge.lh.k3Xintdemlat.dlt4 <- importance(top.dredge.lh.k3Xintdemlat.dlt4) # calculates the relative importance of each predictor given this subset of models
write.table(imp.top.dredge.lh.k3Xintdemlat.dlt4, file = "imp.top.dredge.lh.k3Xintdemlat.dlt4.txt", sep = ";")

# The syntax to display partial effects for the best model (the one with the lowest AICc value) is:
plot(top.dredge.lh.k3Xintdemlat.dlt4$`2000`, all.terms = T, pages = 1, scale = 0)
# this model excludes the predictors longres and RADOPT, and is identical to model 3998 above
summary(top.dredge.lh.k3Xintdemlat.dlt4$`2000`) # provides the summary of this model
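
# Optional checks, not part of the original script: summary() on the averaging object lists
# the component models and their Akaike weights together with the averaged coefficients,
# and gam.check() (mgcv) gives basic residual diagnostics for the best single model.
summary(avg.lh.k3intdemlat.dlt4)
gam.check(top.lh.k3intdemlat.dlt4$`3998`)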
read.xls("testdata.xlsx") page(testdta,method="print") names(testdta) # [1] "DATO" "RESTORTARE" "PRESSD" "longres" "LAT" "DEM" "logswm" "RADOPT" # [9] "SLOPE" "FORS525" "spmax5r" "spmin5r" "saltmax5r" "tempmean5r" dim(testdta) # [1] 403 14 head(testdta) # predicts the response of the BRT model based on the predictorvariables in the test data: predlh.11pred <- predict(lh.11pred.gbm, n.trees=lh.11pred.gbm$gbm.call$best.trees, testdta, ext=NULL, progress="windows", type="response") # predicts the response of the GAM models based on the predictorvariables in the test data: avgint <- predict(avg.lh.k3intdemlat.dlt4, newdata = testdta, se.fit = TRUE, type="response") bestGam <- predict(top.lh.k3intdemlat.dlt4$`3998`, newdata = testdta, se.fit = TRUE, type="response") avgXint <- predict(avg.lh.k3Xintdemlat.dlt4, newdata = testdta, se.fit = TRUE, type="response") # creates a table with the predicted probability from each model to the observations in the testdata predtestdataLH <- data.frame(cbind(bestGam$fit, avgint$fit, avgXint$fit, predlh.11pred)) write.table(predtestdataLH, file = "predtestdataLH.txt",sep = ";") # Evaluation of the four established models # by use of Freeman og Moison (2008)s R package PresenceAbsence library(PresenceAbsence) SPDATA = read.xls("final_lhmodels_tested.xlsx") # imports a dataset with modelled probability values for the four Lh models to be tested page(SPDATA,method="print") head(SPDATA) # SPECIES OBSERVED mod2000 avgint avgxint BRT #1 L. hyp 0 0.006073224 0.005948949 0.005948977 0.01854721 #2 L. hyp 0 0.421973551 0.424368329 0.424368216 0.10990500 #3 L. hyp 0 0.231286917 0.229956316 0.229953907 0.29268415 #4 L. hyp 1 0.627407475 0.629394275 0.629393730 0.80008745 #5 L. hyp 1 0.581360474 0.580508277 0.580508342 0.91973753 #6 L. hyp 1 0.283543501 0.282951165 0.282951184 0.75134566 #defines some variables: species <- as.character(unique(SPDATA$SPECIES)) model.names <- as.character(names(SPDATA)[-c(1, 2)]) N.models <- ncol(SPDATA) - 2 N.sp <- length(species) N.obs <- length(SPDATA$SPECIES[SPDATA$SPECIES == species[1]]) Obs.prev <- table(SPDATA$SPECIES, SPDATA$OBSERVED)[, 2]/N.obs Obs.prev <- round(Obs.prev, digits = 2) #4.2. Accuracy tables #The function presence.absence.accuracy calculates basic accuracy measures (PCC, sensitivity, #specifcity, Kappa, and AUC) for presence-absence data, and (optionally) the associated #standard deviations. The PresenceAbsence package uses the method from DeLong et al. #(1988) to calculate AUC and its associated standard deviation. By default the function will #calculate all basic accuracy measures at the threshold of 0.5: sp <- 1 DATA <- SPDATA[SPDATA$SPECIES == species[sp], ] accu <- presence.absence.accuracy(DATA) accu[, -c(1, 2)] <- signif(accu[, -c(1, 2)], digits = 2) print(paste("Species:", species[sp])) accu # model threshold PCC sensitivity specificity Kappa AUC PCC.sd sensitivity.sd specificity.sd Kappa.sd AUC.sd #1 mod2000 0.5 0.87 0.50 0.97 0.55 0.89 0.017 0.052 0.0090 0.052 0.020 #2 avgint 0.5 0.87 0.50 0.97 0.55 0.89 0.017 0.052 0.0090 0.052 0.020 #3 avgxint 0.5 0.87 0.50 0.97 0.55 0.89 0.017 0.052 0.0090 0.052 0.020 #4 BRT 0.5 0.88 0.57 0.97 0.61 0.93 0.016 0.052 0.0095 0.050 0.016 # calculates optimal cut-off thresholds given different options. The youden index is the one called MaxSens+Spec sp <- "L. 
hyp" DATA <- SPDATA[SPDATA$SPECIES == sp, ] print(paste("Species:", sp)) optimal.thresholds(DATA, opt.methods = 1:12, req.sens = 0.9, req.spec = 0.9, FPC = 2, FNC = 1) # # Method mod2000 avgint avgxint BRT #1 Default 0.5000000 0.5000000 0.5000000 0.5000000 #2 Sens=Spec 0.2300000 0.2300000 0.2300000 0.1800000 #3 MaxSens+Spec 0.2400000 0.2300000 0.2300000 0.2700000 #4 MaxKappa 0.3500000 0.3500000 0.3500000 0.3100000 #5 MaxPCC 0.4766667 0.4600000 0.4600000 0.3150000 #6 PredPrev=Obs 0.3500000 0.3500000 0.3500000 0.3200000 #7 ObsPrev 0.2282878 0.2282878 0.2282878 0.2282878 #8 MeanProb 0.2026718 0.2026928 0.2026927 0.1946395 #9 MinROCdist 0.2400000 0.2300000 0.2300000 0.1700000 #10 ReqSens 0.0800000 0.0800000 0.0800000 0.1000000 #11 ReqSpec 0.3400000 0.3400000 0.3400000 0.2500000 #12 Cost 0.4850000 0.4850000 0.4850000 0.6200000 # Calibration plots for the models par(oma = c(0, 5, 0, 0), mar = c(3, 3, 3, 1), mfrow = c(2,2), cex = 0.7, cex.lab = 1.4, mgp = c(2, 0.5, 0)) sp <- c("L. hyp") for (mod in 1:4) { calibration.plot(DATA, which.model = mod, color = mod + 1, main = model.names[mod], xlab = "Pred Prob Occ", ylab = "Obs Occ as Prop of Sites Surveyed") if (mod == 1) { mtext(sp[1], side = 3, line = 6.3, cex = 1.6) mtext(paste("(", Obs.prev[names(Obs.prev) == sp[1]],Obs.prev[1], "Prevalence )"), side = 3, line = 4.3, cex = 1.3) } }