Replicated experiments are well known to have the highest power for statistical inference. However, replication has rarely been properly implemented in field studies, and numerous techniques have been devised to alleviate this challenge while avoiding pseudoreplication (Eberhardt and Thomas 1991). Because of the power (lower Type II error rate) of large replicated studies, it would be desirable to develop a method for conducting such studies in the field so as to derive stronger inferences. Higher power is of equal interest to Bayesian analysis as it is to frequentist analysis, but discussion of either is outside the scope of this work beyond noting that both benefit. The purpose of this project is to develop a tool that eases the first step: identifying plausible replicate sites from existing locations. It is possible to deal with uncontrolled variables through large numbers of replicates, but the more uncontrolled and more variable those variables are, the more replicates are needed. This tool uses the large amounts of environmental data now widely available to filter through locations and identify a desired number of replicates that are most similar given a set of variables. The tool is designed to be dynamic, allowing multiple spatial datasets to be imported and the desired variables to be filtered on.
Datasets:
These datasets were chosen to demonstrate the tool because they are readily available and locally focused. The tax parcels data lets us determine property ownership, acreage, value, contact information, and other attributes. A better option than this point file is a shapefile of the parcels, which is available upon request from the county; a shapefile would allow more accurate attribution of other variables such as soil and climate. The brownfields data is included to demonstrate how the tool could identify potential brownfields with variables similar enough to serve as replicates in related studies. Additional data could come from USDA, USGS, NOAA, and NASA, depending on the variables of interest.
Shiny code (in-progress version at https://mitchhitch.shinyapps.io/prjkt/):
##render is reactive
library(shiny)
library(leaflet)
library(dplyr)
library(tools)
library(sf)
library(sp)
library(tidyverse)
library(doParallel)
library(nngeo)#st_nn
library(stringr)
ui <- fluidPage(
#Make into tabs
navbarPage(
"Best Rep",
tabPanel("Step 1",
#Upload file1
h2('Upload your files'),
textOutput("at least one must be a .shp file (.csv with headers and .shp only"),
fileInput('file1', 'This must be a shp'),
#upload file2
fileInput('file2', 'Choose second file'),
#select number of reps desired
numericInput("n_reps","How many replicates would you like?",value=30,min=2,max=300)
),
tabPanel("Step 2",
#Select which columns you care about backend in server
uiOutput("SelectCols1"),
uiOutput("SelectCols2")
),
tabPanel("Step 3",
#Fixed variables within the columns
#Run Button
actionButton("RUN", "Click and walk away")
),
tabPanel("Results",
#leaflet
#Table
leafletOutput("bestmap")
)
)
)
server <- function(input, output,session) {
options(shiny.maxRequestSize=30000*1024^2)#allow very large uploads (roughly 30 GB)
#################funcz
#####This should be run AFTER columns have been trimmed
TransCoord_csv<-function(File1,FileX,FileX_Lon="Longitude",FileX_Lat="Latitude",FileX_crs=4326){
FileX<-st_as_sf(FileX,coords=c(FileX_Lon,FileX_Lat)) %>%
st_set_crs(FileX_crs) %>%
st_transform(st_crs(File1)) %>%
st_crop(st_bbox(File1))
FileX
}
### Join all files into one sf. ONLY run after TransCoord_csv or st_transform for shapefiles.
All_F<-function(File1,File2){
# OtherF<-list(...)
# countF<-length(OtherF)
MainF<-File1
# for(i in 1:countF){
# MainF=st_join(OtherF[[1]],MainF,join=st_nn,k=1,maxdist=500) %>%
# aggregate( by=list(.[,1]),FUN=unique,do_union=TRUE,join = st_intersects)
# }
MainF=st_join(File2,MainF,join=st_nn,k=1,maxdist=500) %>%
aggregate( by=list(.$Program.Facility.Name),FUN=unique,do_union=TRUE,join = st_intersects)
MainF
}
###Runs the iterative ranking
Run.score<-function(shapeobj_trim,number){
foreach(r=1:nrow(shapeobj_trim))%dopar%{
x<-st_drop_geometry(shapeobj_trim)#geometry is sticky in sf, so select(-geometry) would not remove it
x_ret<-shapeobj_trim
for(i in 1:nrow(x)){
x_ret[i,"Score"]<-score.calc(x[r,],x[i,])
}
x_ret<-head(x_ret[order(x_ret$Score),],n=number)
x_ret
}
}
###This scores the similarity of rows
score.calc<-function(rowprime,rowtest){
colscore<-vector(length=length(rowprime))
for(i in 1:ncol(rowprime)){
prime<-rowprime[i]
if(is.list(prime)){
prime<-unlist(prime)
}
test<-rowtest[i]
if(is.list(test)){
test<-unlist(test)
}
if(length(prime)>1||length(test)>1){
colscore[i]<-as.double(length(c(
setdiff(prime,test),
setdiff(test,prime)))
)
}
else if(is.double(prime)||is.numeric(prime)){
colscore[i]<-as.double(abs((prime-test)/(prime+test)))
}
else if(is.factor(prime)||is.character(prime)){
if(prime==test){
colscore[i]<-0
}
else{
colscore[i]<-1
}
}
else
colscore[i]<-NA
}
return(sum(colscore))
}
###Select best sets of sites
Out_best<-function(possible_n){
#Determine min possible value for sets
minimum<-foreach(i=1:length(possible_n),.combine=c)%dopar%{
sum(possible_n[[i]]$Score)
} %>%
min()
#Select sets with min possible value
best<-foreach(i=1:length(possible_n))%dopar%{
if(sum(possible_n[[i]]$Score)==minimum){
return(possible_n[[i]])
}
} %>%
compact()
best
}
##########################
###File1
data1<- reactive({
withProgress(message = 'Loading Files', value = 0, {
n<-4
incProgress(1/n, detail = paste("Uploading", 1))
filein <- input$file1
if (is.null(filein)) {
return()
}
else if (file_ext(filein$datapath)=="csv"){
incProgress(1/n, detail = paste("Reading csv", 2))
data = read.csv(file=filein$datapath)
incProgress(1/n, detail = paste("csv Read", 3))
return(data)
}
else if (!is.null(filein)){
incProgress(1/n, detail = paste("Unzipping", 2))
prevWD <- getwd()
unlink("unzipped", recursive = TRUE)
dir.create("unzipped")
tempdir <- "unzipped"
unzip(filein$datapath, exdir = tempdir)
setwd(tempdir)
if(length(list.dirs(recursive=FALSE))>0){
setwd(list.dirs(recursive = FALSE)[1])#setwd needs a single path; use the first unzipped directory
}
else{
print("in correct wd")
}
shpName <- list.files()[grep(list.files(),pattern="\\.shp$")[1]]#anchor the pattern; "*.shp" is a glob, not a regex
incProgress(1/n, detail = paste("Reading shape file", 3))
shpFile <- st_read(shpName)
setwd(prevWD)#restore the working directory saved above
return(shpFile)
}
else {
return()
}
incProgress(1/n, detail = paste("Read", 4))
})
})
###File2
data2 <- reactive({
withProgress(message = 'Loading Files', value = 0, {
n<-4
incProgress(1/n, detail = paste("Uploading", 1))
filein <- input$file2
if (is.null(filein)) {
return()
}
else if (file_ext(filein$datapath)=="csv"){
incProgress(1/n, detail = paste("Reading csv", 2))
data = read.csv(file=filein$datapath)
incProgress(1/n, detail = paste("csv Read", 3))
return(data)
}
else if (!is.null(filein)){
incProgress(1/n, detail = paste("Unzipping", 2))
prevWD <- getwd()
unlink("unzipped", recursive = TRUE)
dir.create("unzipped")
tempdir <- "unzipped"
unzip(filein$datapath, exdir = tempdir)
setwd(tempdir)
if(length(list.dirs(recursive=FALSE))>0){
setwd(list.dirs(recursive = FALSE)[1])#setwd needs a single path; use the first unzipped directory
}
else{
print("in correct wd")
}
shpName <- list.files()[grep(list.files(),pattern="\\.shp$")[1]]#anchor the pattern; "*.shp" is a glob, not a regex
incProgress(1/n, detail = paste("Reading shape file", 3))
shpFile <- st_read(shpName)
setwd(prevWD)#restore the working directory saved above
return(shpFile)
}
else {
return()
}
incProgress(1/n, detail = paste("Read", 4))
})
})
##This is just a demo table for File1
# output$table_display <- renderTable({
# f <- data1()
# f <- select(f, c(input$columns1)) #subsetting takes place here
# head(f)
# })
###This allows for selecting desired columns
##File1
output$SelectCols1 <- renderUI({
selectInput("columns1", "Select Columns", choices= names(data1()),multiple = TRUE)
})
##File2
output$SelectCols2 <- renderUI({
selectInput("columns2", "Select Columns", choices= names(data2()),multiple = TRUE)
})
RealRun<-eventReactive(input$RUN,{
withProgress(message = 'Running Comparison', value = 0, {
# Number of steps
n <- 10
req(input$columns1)
req(input$columns2)
number<-input$n_reps
incProgress(1/n, detail = paste("Doing part", 1))
registerDoParallel(cores = 6)
incProgress(1/n, detail = paste("Doing part", 2))
incProgress(1/n, detail = paste("Doing part", 3))
data1_trim<-select(data1(), input$columns1)
incProgress(1/n, detail = paste("Doing part", 4))
rm(data1())
data2_trim<-select(data2(), input$columns2)
incProgress(1/n, detail = paste("Doing part", 5))
rm(data2())
data2_tran<-TransCoord_csv(data1_trim,data2_trim)
incProgress(1/n, detail = paste("Doing part", 6))
rm(data2_trim)
data_ALL<-All_F(data1_trim,data2_tran)
incProgress(1/n, detail = paste("Doing part", 7))
rm(data1_trim,data2_tran)
Ranked_list<-Run.score(data_ALL,number)
incProgress(1/n, detail = paste("Doing part", 8))
rm(data_ALL)
best<-Out_best(Ranked_list)
incProgress(1/n, detail = paste("Doing part", 9))
rm(Ranked_list)
first<-st_transform(best[[1]],'+proj=longlat +datum=WGS84')
incProgress(1/n, detail = paste("Doing part", 10))
first
})
})
output$bestmap <- renderLeaflet({
best_set <- RealRun()#the best set of sites returned by the comparison run
leaflet() %>%
addTiles() %>%
addCircleMarkers(data = st_geometry(best_set), popup=best_set$Program.Facility.Name)
})
#observe( on button press run the script from the values in place or set defaults.)
}
shinyApp(ui = ui, server = server)
Core Functions:
Transform the CRS and crop to a shared bounding box.
TransCoord_csv<-function(File1,FileX,FileX_Lon="Longitude",FileX_Lat="Latitude",FileX_crs=4326){
FileX<-st_as_sf(FileX,coords=c(FileX_Lon,FileX_Lat)) %>%
st_set_crs(FileX_crs) %>%
st_transform(st_crs(File1)) %>%
st_crop(st_bbox(File1))
FileX
}
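A minimal usage sketch, assuming a parcel shapefile and a point CSV with Longitude/Latitude columns (the file names here are placeholders, not part of the example datasets):
library(sf)
parcels <- st_read("parcels.shp")#hypothetical parcel polygons
brownfields <- read.csv("brownfields.csv")#hypothetical point records
#Project the points into the parcel CRS and crop them to the parcel extent.
brownfields_sf <- TransCoord_csv(parcels, brownfields)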
Join all uploaded files together.
All_F<-function(File1,...){
OtherF<-list(...)
countF<-length(OtherF)
MainF<-File1
for(i in 1:countF){
MainF=st_join(OtherF[[i]],MainF,join=st_nn,k=1,maxdist=500) %>%#use the i-th file, not always the first
aggregate( by=list(st_drop_geometry(.)[[1]]),FUN=unique,do_union=TRUE,join = st_intersects)#group on the first attribute column
}
MainF#return the accumulated join
}
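Continuing the sketch above (maxdist is in the units of the parcel CRS):
combined <- All_F(parcels, brownfields_sf)#nearest-neighbor join within 500 map units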
Rank all the rows based on each field. This function compares each row with every other row and scores how similar they are; the lower the score, the more similar. Factors and text are currently weighted as 0 if identical and 1 if different.
score.calc<-function(rowprime,rowtest){
colscore<-vector(length=length(rowprime))
for(i in 1:ncol(rowprime)){
prime<-rowprime[i]
if(is.list(prime)){
prime<-unlist(prime)
}
test<-rowtest[i]
if(is.list(test)){
test<-unlist(test)
}
if(length(prime)>1||length(test)>1){
colscore[i]<-as.double(length(c(
setdiff(prime,test),
setdiff(test,prime)))
)
}
else if(is.double(prime)||is.numeric(prime)){
colscore[i]<-as.double(abs((prime-test)/(prime+test)))
}
else if(is.factor(prime)||is.character(prime)){
if(prime==test){
colscore[i]<-0
}
else{
colscore[i]<-1
}
}
else
colscore[i]<-NA
}
return(sum(colscore))
}
Iterate over every possible combination and collect each site's best matches into a list. The best matches are the sites whose scores are lowest when summed together.
Run.score<-function(shapeobj_trim,number){#the number of replicates must be passed in, as in the app
foreach(r=1:nrow(shapeobj_trim))%dopar%{
x<-st_drop_geometry(shapeobj_trim)#geometry is sticky in sf, so select(-geometry) would not remove it
x_ret<-shapeobj_trim
for(i in 1:nrow(x)){
x_ret[i,"Score"]<-score.calc(x[r,],x[i,])
}
x_ret<-head(x_ret[order(x_ret$Score),],n=number)
x_ret
}
}
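Because Run.score uses foreach's %dopar%, a parallel backend must be registered first, as the server does with doParallel. Continuing the sketch:
library(doParallel)
registerDoParallel(cores = 2)#core count is machine-dependent
ranked <- Run.score(combined, number = 30)#one candidate set per starting site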
Select the best site combination (ties acceptable).
Out_best<-function(possible_n){
#Determine min possible value for sets
minimum<-foreach(i=1:length(possible_n),.combine=c)%dopar%{
sum(possible_n[[i]]$Score)
} %>%
min()
#Select sets with min possible value
best<-foreach(i=1:length(possible_n))%dopar%{
if(sum(possible_n[[i]]$Score)==minimum){
return(possible_n[[i]])
}
} %>%
compact()
best
}
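Continuing the sketch, Out_best keeps only the set(s) tied for the lowest total score:
best <- Out_best(ranked)
best_set <- best[[1]]#if several sets tie, any one of them is equally good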
The leaflet map and table show the sites that are most similar based on the selected variable fields. The far-right column of the table gives each site's score relative to the first site; the higher the score, the more different the sites are. Factors and text are evaluated as 0 if they are the same or 1 if they are different.
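The table is not yet wired into the app above; a minimal sketch of how it might be added with the DT package (the output name best_table is an assumption):
#In the ui, inside tabPanel("Results"): DT::dataTableOutput("best_table")
output$best_table <- DT::renderDataTable({
DT::datatable(st_drop_geometry(RealRun()))#site attributes plus the Score column
})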
Although this tool is in its very early stages, it is promising. Its flexibility will allow it to go beyond the example given here; another use under consideration is identifying locations in a greenhouse with the least variability in wind, light, and temperature. A major drawback is that the code, as it stands, is computationally intensive and prohibitive to run on personal computers. A solution would be to optimize the algorithms and/or host the tool on a server.
References:
Eberhardt, L. L., and J. M. Thomas. 1991. Designing Environmental Field Studies. Ecological Monographs 61:53–73.
R Core Team (2019). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/.
Pebesma, E., 2018. Simple Features for R: Standardized Support for Spatial Vector Data. The R Journal 10 (1), 439-446, https://doi.org/10.32614/RJ-2018-009
Pebesma, E.J., R.S. Bivand, 2005. Classes and methods for spatial data in R. R News 5 (2), https://cran.r-project.org/doc/Rnews/.
Hadley Wickham, Romain François, Lionel Henry and Kirill Müller (2019). dplyr: A Grammar of Data Manipulation. R package version 0.8.3. https://CRAN.R-project.org/package=dplyr
Wickham et al., (2019). Welcome to the tidyverse. Journal of Open Source Software, 4(43), 1686, https://doi.org/10.21105/joss.01686
Microsoft Corporation and Steve Weston (2019). doParallel: Foreach Parallel Adaptor for the ‘parallel’ Package. R package version 1.0.15. https://CRAN.R-project.org/package=doParallel
Michael Dorman (2019). nngeo: k-Nearest Neighbor Join for Spatial Data. R package version 0.2.9. https://CRAN.R-project.org/package=nngeo
Joe Cheng, Bhaskar Karambelkar and Yihui Xie (2019). leaflet: Create Interactive Web Maps with the JavaScript ‘Leaflet’ Library. R package version 2.0.3. https://CRAN.R-project.org/package=leaflet
Yihui Xie (2019). knitr: A General-Purpose Package for Dynamic Report Generation in R. R package version 1.26.
Yihui Xie (2015) Dynamic Documents with R and knitr. 2nd edition. Chapman and Hall/CRC. ISBN 978-1498716963
Yihui Xie (2014) knitr: A Comprehensive Tool for Reproducible Research in R. In Victoria Stodden, Friedrich Leisch and Roger D. Peng, editors, Implementing Reproducible Computational Research. Chapman and Hall/CRC. ISBN 978-1466561595
Yihui Xie, Joe Cheng and Xianying Tan (2019). DT: A Wrapper of the JavaScript Library ‘DataTables’. R package version 0.10. https://CRAN.R-project.org/package=DT
Edzer Pebesma (2019). lwgeom: Bindings to Selected ‘liblwgeom’ Functions for Simple Features. R package version 0.1-7. https://CRAN.R-project.org/package=lwgeom
R. Kyle Bocinsky (2019). FedData: Functions to Automate Downloading Geospatial Data Available from Several Federated Data Sources. R package version 2.5.7. https://CRAN.R-project.org/package=FedData
Robert J. Hijmans (2019). raster: Geographic Data Analysis and Modeling. R package version 3.0-7. https://CRAN.R-project.org/package=raster
Hadley Wickham (2011). The Split-Apply-Combine Strategy for Data Analysis. Journal of Statistical Software, 40(1), 1-29. URL http://www.jstatsoft.org/v40/i01/.
Data:
Menne, Matthew J., Imke Durre, Bryant Korzeniewski, Shelley McNeal, Kristy Thomas, Xungang Yin, Steven Anthony, Ron Ray, Russell S. Vose, Byron E. Gleason, and Tamara G. Houston (2012): Global Historical Climatology Network - Daily (GHCN-Daily), Version 3. NOAA National Climatic Data Center. doi:10.7289/V5D21VHZ
Erie County Office Of Geographic Information Services. https://www2.erie.gov/gis (Parcel data available upon request)