improving/speeding up a very large, slow simulation
Hi Ben, I appreciate that the example is reproducible, and I understand why you shared the entire project. At the same time, that is an awful lot of code for volunteers to go through and try to optimize. I would try to think of the structure of your task, see what the individual pieces are---figure out what can be reused and optimized. Look at loops and try to think whether some operation performed on every element of the vector at once could do the same thing. Sometimes the next iteration of a loop depends on the previous, so it is difficult to vectorize, but often that is not the case. Make use of the compiler package, especially cmpfun(). It can give fairly substantial speedups, particularly with for loops in R. Finally if you want 1000 reps, do you actually need to repeat all the steps 1000 times, or could you just simulate 1000x the data? If the latter, do the steps once but make more data. If the former, you ought to be able to parallelize it fairly easily, see the parallel package. Good luck, Josh On Mon, Feb 11, 2013 at 9:22 PM, Benjamin Caldwell
<btcaldwell at berkeley.edu> wrote:
Dear R help;
I'll preface this by saying that the example I've provided below is pretty
long, turgid, and otherwise a deep dive into a series of functions I wrote
for a simulation study. It is, however, reproducible and self-contained.
I'm trying to do my first simulation study that's quite big, and so I'll
say that the output of this simulation as I'd like it to be is 9.375
million rows (9375 combinations of the inputs * 1000). So, first thing,
maybe I should try to cut down on the number of dimensions and do it a
piece at a time. If that's a conclusion people come to here, I'll look into
paring down/doing it a chunk at a time.
I'm hoping, though, that this is an opportunity for me to learn some more
efficient, elegant ways to a. vectorize and b. use the apply functions,
which still seem to elude me when it comes to their use with more complex,
custom functions that have multiple inputs.
If having looked the below over you can give me suggestions to help make
this and future code I write run more quickly/efficiently, I will be
most grateful, as as this is currently written it would take what looks
like days to run a thousand simulations of each possible combination of
variables of interest.
Best
Ben Caldwell
-----------------------------------------------
#unpaired
verification.plots<-rnorm(30, 100, 10)
# writeClipboard(as.character(verification.plots))
unpaired.test<- function(verification.plots, project.n, project.mean,
project.sd, allowed.deviation, project.acres, alpha=.05){
verification.plots <-as.numeric(as.character(verification.plots))
a <- qnorm(alpha/2)
d <- alpha*project.mean
# verifier plot number
n<-length(verification.plots)
verifier.plot.number <- c(1:n)
#all plots (verifier and project)
all.plots.n <- rep(0,n)
for(i in 1:n){
all.plots.n[i] <- project.n + verifier.plot.number[i]
}
#running mean
X_bar <- rep(1,n)
for(i in 1:n){
X_bar[i]<- mean(verification.plots[1:i])
}
#running sd
sd2 <- NULL
for(i in 2:n){
sd2[i]<-sd(verification.plots[1:i])
}
#Tn
Tn<-NULL
for(i in 2:n){
Tn[i] <- project.mean-X_bar[i]
}
#n_Threshold
n_thresh<-NULL
for(i in 2:n){
n_thresh[i] <- (((a^2)/(d^2))*((project.sd^2)+(sd2[i]^2)))
}
#Upper
upper <- NULL
for (i in 2:n){
upper[i] <- Tn[i]-d
}
#Lower
lower<- NULL
for(i in 2:n){
lower[i] <- Tn[i]+d
}
#Success.fail
success.fail <- NULL
success.fail.num <- rep(0,n)
if(abs(verification.plots[1]-project.mean)<(allowed.deviation*project.mean)){
success.fail[1] <- "Pass"
success.fail.num[1] <- 1
} else {
success.fail[1] <- "Fail"
success.fail.num[1] <-0
}
for(i in 2:n){
if(all.plots.n[i]>=n_thresh[i]){
if(upper[i] <= 0){
if(lower[i] >= 0){
success.fail[i]<-"Pass"
success.fail.num[i]<-1
} else {
success.fail[i] <- "Fail"
success.fail.num[i] <-0}
} else {
success.fail[i] <- "Fail"
success.fail.num[i] <- 0
}
} else {
success.fail[i] <- "Inconclusive"
success.fail.num[i] <- 0
}
}
out.unpaired.test <- data.frame(success.fail, success.fail.num)
}
out.unpaired.test<-unpaired.test(verification.plots=verification.plots,
project.n=30, project.mean=100, project.sd=10, allowed.deviation=.05,
project.acres=99)
out.unpaired.test
#
sequential.unpaired<- function(number.strata, project.n, project.mean,
project.sd, verification.plots,allowed.deviation, project.acres,
min.plots="default", alpha=.05){
if(min.plots=="default"){
min.plots<-NULL # minimum passing
if(number.strata == 1 & project.acres <= 100){
min.plots = 8
} else if(number.strata == 1 & project.acres > 100 & project.acres < 500){
min.plots = 12
} else if(number.strata == 1 & project.acres > 500 & project.acres < 5000){
min.plots = 16
} else if(number.strata == 1 & project.acres > 5000 & project.acres <
10000){
min.plots = 20
} else if(number.strata == 1 & project.acres > 10000){
min.plots = 24
} else if(number.strata == 2 & project.acres < 100){
min.plots = 4
} else if(number.strata == 2 & project.acres > 100 & project.acres < 500){
min.plots = 6
} else if(number.strata == 2 & project.acres > 501 & project.acres < 5000){
min.plots = 8
} else if(number.strata == 2 & project.acres > 5001 & project.acres <
10000){
min.plots = 10
} else if(number.strata == 2 & project.acres > 10000){
min.plots = 12
} else if(number.strata == 3 & project.acres < 100){
min.plots = 2
} else if(number.strata == 3 & project.acres > 100 & project.acres <
500){
min.plots = 3
} else if(number.strata == 3 & project.acres > 501 & project.acres < 5000){
min.plots = 4
} else if(number.strata == 3 & project.acres > 5001 & project.acres <
10000){
min.plots = 5
} else if(number.strata == 3 & project.acres > 10000){
min.plots = 6
} else {min.plots=NULL}
} else (min.plots = min.plots)
if(number.strata > 3){print("max three strata")}
if(number.strata > 3){break}
p <- length(verification.plots)
mp <- min.plots
run <- unpaired.test(verification.plots, project.n, project.mean, project.sd,
allowed.deviation, project.acres, alpha=.05)
number.passing.plots <- function(verification.plots, success.fail.num){
n<-length(verification.plots)
number.passing<-rep(0,n)
number.passing[1]<-ifelse(sum(success.fail.num[1]> mp),1,0)
for(i in 2:n){
if(success.fail.num[i] == 1){
v <- i-1
number.passing[i]<-number.passing[v]+1
} else{number.passing[i] <- 0}
}
number.passing
}
number.passing<-number.passing.plots(verification.plots=verification.plots,
success.fail.num=run$success.fail.num)
sucessful.unsucessful<-rep("blank",p)
one.zero <- rep(0,p)
result <- "blank"
resultL <- 0
n <-length(verification.plots)
for(i in 1:n){
if(number.passing[i] >= mp) {
sucessful.unsucessful[i] <- "successful"
one.zero[i] <- 1
} else {
sucessful.unsucessful[i]<- "unsuccessful"
one.zero[i] <- 0
}
}
if(sum(one.zero) > 0 ) {
result <- "successful"
resultL <- 1
}
plots.success <- function(one.zero){
plots.to.success<-NULL
for(i in 1:n){
if(sum(one.zero)==0){plots.to.success= NA
} else if(one.zero[i]==1){plots.to.success<-append(plots.to.success, i)}
}
plots.to.success<-min(plots.to.success)
plots.to.success
}
plots.to.success <- plots.success(one.zero=one.zero)
complete <-data.frame (min.plots, number.passing, sucessful.unsucessful,
one.zero)
collapsed <- data.frame(result, resultL, plots.to.success)
out <- c(as.list(complete),as.list(collapsed))
}
unpaired.out<-sequential.unpaired(number.strata=2,
verification.plots=verification.plots, project.n=15, project.mean=100,
project.sd=10, allowed.deviation=.05, project.acres=99, min.plots="default")
str(unpaired.out)
#unpaired.out[[7]][1]
##
simulation.unpaired <- function(reps, project.acreage.range = c(99,
110,510,5100,11000), project.mean, project.n.min, project.n.max,
project.sd.min, project.sd.max, verification.mean.min,
verification.mean.max, number.verification.plots.min,
number.verification.plots.max, allowed.deviation=.1, alpha=.05, length.out
= 5, min.plots="default") {
number.strata.range<-c(1:3) # set by CAR
project.n.range <- seq(project.n.min, project.n.max, length.out=length.out)
project.sd.range <- seq(project.sd.min, project.sd.max,
length.out=length.out) # assume verification sd is the same as the project
sd to simplify - seems a reasonable simpification
number.verification.plots<- seq(number.verification.plots.min,
number.verification.plots.max, length.out=length.out)
verification.range <- seq(verification.mean.min, verification.mean.max,
length.out=length.out)
permut.grid<-expand.grid(number.strata.range, project.n.range,
project.acreage.range, project.mean, project.sd.range,
number.verification.plots, verification.range, allowed.deviation) # create
a matrix with all combinations of the supplied vectors
#assign names to the colums of the grid of combinations
names.permut<-c("number.strata", "project.n.plots", "project.acreage",
"project.mean", "project.sd", "number.verification.plots",
"verification.mean", "allowed.deviation")
names(permut.grid)<-names.permut # done
combinations<-length(permut.grid[,1])
size <-reps*combinations #need to know the size of the master matrix, which
is the the number of replications * each combination of the supplied factors
# we want a df from which to read all the data into the simulation, and
record the results
permut.col<-ncol(permut.grid)
col.base<-ncol(permut.grid)+2
base <- matrix(nrow=size, ncol=col.base)
base <-data.frame(base)
# supply the names
names.base<-c("number.strata", "project.n.plots", "project.acreage",
"project.mean", "project.sd", "number.verification.plots",
"verification.mean", "allowed.deviation","success.fail", "plots.to.success")
names(base)<-names.base
# need to create index vectors for the base, master df
ends <- seq(reps+1, size+1, by=reps)
begins <- ends-reps
index <- cbind(begins, ends-1)
#done
# next, need to assign the first 6 columns and number of rows = to the
number of reps in the simulation to be the given row in the permut.grid
matrix
pb <- winProgressBar(title="Create base, big loop 1 of 2", label="0% done",
min=0, max=100, initial=0)
for (i in 1:combinations) {
base[index[i,1]:index[i,2],1:permut.col] <- permut.grid[i,]
#progress bar
info <- sprintf("%d%% done", round((i/combinations)*100))
setWinProgressBar(pb, (i/combinations)*100, label=info)
}
close(pb)
# now, simply feed the values replicated the number of times we want to run
the simulation into the sequential.unpaired function, and assign the values
to the appropriate columns
out.index1<-ncol(permut.grid)+1
out.index2<-ncol(permut.grid)+2
#progress bar
pb <- winProgressBar(title="fill values, big loop 2 of 2", label="0% done",
min=0, max=100, initial=0)
for (i in 1:size){
scalar.base <- base[i,]
verification.plots <- rnorm(scalar.base$number.verification.plots,
scalar.base$verification.mean, scalar.base$project.sd)
result<- sequential.unpaired(scalar.base$number.strata,
scalar.base$project.n.plots, scalar.base$project.mean, scalar.base$
project.sd, verification.plots, scalar.base$allowed.deviation,
scalar.base$project.acreage, min.plots='default', alpha)
base[i,out.index1] <- result[[6]][1]
base[i,out.index2] <- result[[7]][1]
info <- sprintf("%d%% done", round((i/size)*100))
setWinProgressBar(pb, (i/size)*100, label=info)
}
close(pb)
#return the results
return(base)
}
# I would like reps to = 1000, but that takes a really long time right now
test5 <- simulation.unpaired(reps=5, project.acreage.range = c(99,
110,510,5100,11000), project.mean=100, project.n.min=10, project.n.max=100,
project.sd.min=.01, project.sd.max=.2, verification.mean.min=100,
verification.mean.max=130, number.verification.plots.min=10,
number.verification.plots.max=50, length.out = 5)
[[alternative HTML version deleted]]
______________________________________________ R-help at r-project.org mailing list https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide http://www.R-project.org/posting-guide.html and provide commented, minimal, self-contained, reproducible code.
-- Joshua Wiley Ph.D. Student, Health Psychology Programmer Analyst II, Statistical Consulting Group University of California, Los Angeles https://joshuawiley.com/