Hello R Help,
I am trying solve an MLE convergence problem: I would like to estimate
four parameters, p1, p2, mu1, mu2, which relate to the probabilities,
P1, P2, P3, of a multinomial (trinomial) distribution. I am using the
mle2() function and feeding it a time series dataset composed of four
columns: time point, number of successes in category 1, number of
successes in category 2, and number of success in category 3. The
column headers are: t, n1, n2, and n3.
The mle2() function converges occasionally, and I need to improve the
rate of convergence when used in a stochastic simulation, with multiple
stochastically generated datasets. When mle2() does not converge, it
returns an error: "Error in optim(par = c(2, 2, 0.001, 0.001), fn =
function (p) : L-BFGS-B needs finite values of 'fn'." I am using the
L-BFGS-B optimization method with a lower box constraint of zero for all
four parameters. While I do not know any theoretical upper limit(s) to
the parameter values, I have not seen any parameter estimates above 2
when using empirical data. It seems that when I start with certain
'true' parameter values, the rate of convergence is quite high, whereas
other "true" parameter values are very difficult to estimate. For
example, the true parameter values p1 = 2, p2 = 2, mu1 = 0.001, mu2 =
0.001 causes convergence problems, but the parameter values p1 = 0.3, p2
= 0.3, mu1 = 0.08, mu2 = 0.08 lead to high convergence rate. I've
chosen these two sets of values because they represent the upper and
lower estimates of parameter values derived from graphical methods.
First, do you have any suggestions on how to improve the rate of
convergence and avoid the "finite values of 'fn'" error? Perhaps it has
to do with the true parameter values being so close to the boundary? If
so, any suggestions on how to estimate parameter values that are near zero?
Here is reproducible and relevant code from my stochastic simulation:
########################################################################
library(bbmle)
library(combinat)
# define multinomial distribution
dmnom2 <- function(x,prob,log=FALSE) {
r <- lgamma(sum(x) + 1) + sum(x * log(prob) - lgamma(x + 1))
if (log) r else exp(r)
}
# vector of time points
tv <- 1:20
# Negative log likelihood function
NLL.func <- function(p1, p2, mu1, mu2, y){
t <- y$tv
n1 <- y$n1
n2 <- y$n2
n3 <- y$n3
P1 <- (p1*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*((-mu2)*(mu2 - p1 + p2) +
mu1*(mu2 + 2*p2)) - mu2*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu2*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu2*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))))
P2 <- (p2*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*(-mu1^2 + 2*mu2*p1 +
mu1*(mu2 - p1 + p2)) - mu1*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu1*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu1*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))))
P3 <- 1 - P1 - P2
p.all <- c(P1, P2, P3)
-sum(dmnom2(c(n1, n2, n3), prob = p.all, log = TRUE))
}
## Generate simulated data
# Model equations as expressions,
P1 <- expression((p1*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*((-mu2)*(mu2 - p1 + p2) +
mu1*(mu2 + 2*p2)) - mu2*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu2*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu2*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))
P2 <- expression((p2*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*(-mu1^2 + 2*mu2*p1 +
mu1*(mu2 - p1 + p2)) - mu1*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu1*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu1*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))
# True parameter values
p1t = 2; p2t = 2; mu1t = 0.001; mu2t = 0.001
# Function to calculate probabilities from 'true' parameter values
psim <- function(x){
params <- list(p1 = p1t, p2 = p2t, mu1 = mu1t, mu2 = mu2t, t = x)
eval.P1 <- eval(P1, params)
eval.P2 <- eval(P2, params)
P3 <- 1 - eval.P1 - eval.P2
c(x, matrix(c(eval.P1, eval.P2, P3), ncol = 3))
}
pdat <- sapply(tv, psim, simplify = TRUE)
Pdat <- as.data.frame(t(pdat))
names(Pdat) <- c("time", "P1", "P2", "P3")
# Generate simulated data set from probabilities
n = rep(20, length(tv))
p = as.matrix(Pdat[,2:4])
y <- as.data.frame(rmultinomial(n,p))
yt <- cbind(tv, y)
names(yt) <- c("tv", "n1", "n2", "n3")
# mle2 call
mle.fit <- mle2(NLL.func, data = list(y = yt),
start = list(p1 = p1t, p2 = p2t, mu1 = mu1t, mu2 = mu2t),
control = list(maxit = 5000, factr = 1e-10, lmm = 17),
method = "L-BFGS-B", skip.hessian = TRUE,
lower = list(p1 = 0, p2 = 0, mu1 = 0, mu2 = 0))
###########################################################################
I interpret the error as having to do with the finite difference
approximation failing. If so, perhaps a gradient function would help?
If you agree, I've described my unsuccessful attempt at writing a
gradient function below. If a gradient function is unnecessary, ignore
the remainder of this message.
My gradient function: I derived the gradient function by taking the
derivative of my NLL equation with respect to each parameter. My NLL
equation is the probability mass function of the trinomial distribution.
Thus the gradient equation for, say, parameter p1 would be:
gr.p1 <- deriv(log(P1^n1), p1) + deriv(log(P2^n2), p1) +
deriv(log(P3^n3), p1)
This produces a very large equation, which I won't reproduce here.
Let's say that the four gradient equations for the four parameters are
defined as gr.p1, gr.p2, gr.mu1, gr.mu2, and all are derived as
described above for gr.p1. These gradient equations are functions of
p1, p2, mu1, mu2, t, n1, n2, and n3. My current gradient function is:
grr <- function(p1, p2, mu1, mu2, y){
t <- y[,1]
n1 <- y[,2]
n2 <- y[,3]
n3 <- y[,4]
gr.p1 <- .......
gr.p2 <- .......
gr.mu1 <- .......
gr.mu2 <- .......
c(gr.p1, gr.p2, gr.mu1, gr.mu2)
}
The problem is that I need to supply values for t, n1, n2, and n3 to the
gradient function, which are from the dataset yt, above. When I supply
the dataset yt, the function produces a vector of length 4*nrow(yt) =
80. When I include it in my mle2() function, I get an error that mle2
(optim) requires a vector of length 4. How do I write my gradient
function to work in mle2()?
Any help would be much appreciated.
Adam Zeilinger
Adam Zeilinger
Post Doctoral Scholar
Department of Entomology
University of California Riverside
www.linkedin.com/in/adamzeilinger
Presumably you checked out the CRAN Optimization task view to see if
you could find any joy there, right? (I doubt there is, but ...)
-- Bert
On Thu, Oct 4, 2012 at 10:12 PM, Adam Zeilinger <zeil0006 at umn.edu> wrote:
Hello R Help,
I am trying solve an MLE convergence problem: I would like to estimate four
parameters, p1, p2, mu1, mu2, which relate to the probabilities, P1, P2, P3,
of a multinomial (trinomial) distribution. I am using the mle2() function
and feeding it a time series dataset composed of four columns: time point,
number of successes in category 1, number of successes in category 2, and
number of success in category 3. The column headers are: t, n1, n2, and n3.
The mle2() function converges occasionally, and I need to improve the rate
of convergence when used in a stochastic simulation, with multiple
stochastically generated datasets. When mle2() does not converge, it
returns an error: "Error in optim(par = c(2, 2, 0.001, 0.001), fn = function
(p) : L-BFGS-B needs finite values of 'fn'." I am using the L-BFGS-B
optimization method with a lower box constraint of zero for all four
parameters. While I do not know any theoretical upper limit(s) to the
parameter values, I have not seen any parameter estimates above 2 when using
empirical data. It seems that when I start with certain 'true' parameter
values, the rate of convergence is quite high, whereas other "true"
parameter values are very difficult to estimate. For example, the true
parameter values p1 = 2, p2 = 2, mu1 = 0.001, mu2 = 0.001 causes convergence
problems, but the parameter values p1 = 0.3, p2 = 0.3, mu1 = 0.08, mu2 =
0.08 lead to high convergence rate. I've chosen these two sets of values
because they represent the upper and lower estimates of parameter values
derived from graphical methods.
First, do you have any suggestions on how to improve the rate of convergence
and avoid the "finite values of 'fn'" error? Perhaps it has to do with the
true parameter values being so close to the boundary? If so, any
suggestions on how to estimate parameter values that are near zero?
Here is reproducible and relevant code from my stochastic simulation:
########################################################################
library(bbmle)
library(combinat)
# define multinomial distribution
dmnom2 <- function(x,prob,log=FALSE) {
r <- lgamma(sum(x) + 1) + sum(x * log(prob) - lgamma(x + 1))
if (log) r else exp(r)
}
# vector of time points
tv <- 1:20
# Negative log likelihood function
NLL.func <- function(p1, p2, mu1, mu2, y){
t <- y$tv
n1 <- y$n1
n2 <- y$n2
n3 <- y$n3
P1 <- (p1*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*((-mu2)*(mu2 - p1 + p2) +
mu1*(mu2 + 2*p2)) - mu2*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu2*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu2*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))))
P2 <- (p2*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*(-mu1^2 + 2*mu2*p1 +
mu1*(mu2 - p1 + p2)) - mu1*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu1*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu1*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))))
P3 <- 1 - P1 - P2
p.all <- c(P1, P2, P3)
-sum(dmnom2(c(n1, n2, n3), prob = p.all, log = TRUE))
}
## Generate simulated data
# Model equations as expressions,
P1 <- expression((p1*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*((-mu2)*(mu2 - p1 + p2) +
mu1*(mu2 + 2*p2)) - mu2*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu2*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu2*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))
P2 <- expression((p2*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*(-mu1^2 + 2*mu2*p1 +
mu1*(mu2 - p1 + p2)) - mu1*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu1*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu1*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))
# True parameter values
p1t = 2; p2t = 2; mu1t = 0.001; mu2t = 0.001
# Function to calculate probabilities from 'true' parameter values
psim <- function(x){
params <- list(p1 = p1t, p2 = p2t, mu1 = mu1t, mu2 = mu2t, t = x)
eval.P1 <- eval(P1, params)
eval.P2 <- eval(P2, params)
P3 <- 1 - eval.P1 - eval.P2
c(x, matrix(c(eval.P1, eval.P2, P3), ncol = 3))
}
pdat <- sapply(tv, psim, simplify = TRUE)
Pdat <- as.data.frame(t(pdat))
names(Pdat) <- c("time", "P1", "P2", "P3")
# Generate simulated data set from probabilities
n = rep(20, length(tv))
p = as.matrix(Pdat[,2:4])
y <- as.data.frame(rmultinomial(n,p))
yt <- cbind(tv, y)
names(yt) <- c("tv", "n1", "n2", "n3")
# mle2 call
mle.fit <- mle2(NLL.func, data = list(y = yt),
start = list(p1 = p1t, p2 = p2t, mu1 = mu1t, mu2 = mu2t),
control = list(maxit = 5000, factr = 1e-10, lmm = 17),
method = "L-BFGS-B", skip.hessian = TRUE,
lower = list(p1 = 0, p2 = 0, mu1 = 0, mu2 = 0))
###########################################################################
I interpret the error as having to do with the finite difference
approximation failing. If so, perhaps a gradient function would help? If
you agree, I've described my unsuccessful attempt at writing a gradient
function below. If a gradient function is unnecessary, ignore the remainder
of this message.
My gradient function: I derived the gradient function by taking the
derivative of my NLL equation with respect to each parameter. My NLL
equation is the probability mass function of the trinomial distribution.
Thus the gradient equation for, say, parameter p1 would be:
gr.p1 <- deriv(log(P1^n1), p1) + deriv(log(P2^n2), p1) + deriv(log(P3^n3),
p1)
This produces a very large equation, which I won't reproduce here. Let's say
that the four gradient equations for the four parameters are defined as
gr.p1, gr.p2, gr.mu1, gr.mu2, and all are derived as described above for
gr.p1. These gradient equations are functions of p1, p2, mu1, mu2, t, n1,
n2, and n3. My current gradient function is:
grr <- function(p1, p2, mu1, mu2, y){
t <- y[,1]
n1 <- y[,2]
n2 <- y[,3]
n3 <- y[,4]
gr.p1 <- .......
gr.p2 <- .......
gr.mu1 <- .......
gr.mu2 <- .......
c(gr.p1, gr.p2, gr.mu1, gr.mu2)
}
The problem is that I need to supply values for t, n1, n2, and n3 to the
gradient function, which are from the dataset yt, above. When I supply the
dataset yt, the function produces a vector of length 4*nrow(yt) = 80. When
I include it in my mle2() function, I get an error that mle2 (optim)
requires a vector of length 4. How do I write my gradient function to work
in mle2()?
Any help would be much appreciated.
Adam Zeilinger
--
Adam Zeilinger
Post Doctoral Scholar
Department of Entomology
University of California Riverside
www.linkedin.com/in/adamzeilinger
Hello R Help,
I am trying solve an MLE convergence problem: I would like to estimate four parameters, p1, p2, mu1, mu2, which relate to the probabilities, P1, P2, P3, of a multinomial (trinomial) distribution. I am using the mle2() function and feeding it a time series dataset composed of four columns: time point, number of successes in category 1, number of successes in category 2, and number of success in category 3. The column headers are: t, n1, n2, and n3.
The mle2() function converges occasionally, and I need to improve the rate of convergence when used in a stochastic simulation, with multiple stochastically generated datasets. When mle2() does not converge, it returns an error: "Error in optim(par = c(2, 2, 0.001, 0.001), fn = function (p) : L-BFGS-B needs finite values of 'fn'." I am using the L-BFGS-B optimization method with a lower box constraint of zero for all four parameters. While I do not know any theoretical upper limit(s) to the parameter values, I have not seen any parameter estimates above 2 when using empirical data. It seems that when I start with certain 'true' parameter values, the rate of convergence is quite high, whereas other "true" parameter values are very difficult to estimate. For example, the true parameter values p1 = 2, p2 = 2, mu1 = 0.001, mu2 = 0.001 causes convergence problems, but the parameter values p1 = 0.3, p2 = 0.3, mu1 = 0.08, mu2 = 0.08 lead to high convergence rate. I've chosen these two sets of values because they represent the upper and lower estimates of parameter values derived from graphical methods.
First, do you have any suggestions on how to improve the rate of convergence and avoid the "finite values of 'fn'" error? Perhaps it has to do with the true parameter values being so close to the boundary? If so, any suggestions on how to estimate parameter values that are near zero?
Here is reproducible and relevant code from my stochastic simulation:
########################################################################
library(bbmle)
library(combinat)
# define multinomial distribution
dmnom2 <- function(x,prob,log=FALSE) {
r <- lgamma(sum(x) + 1) + sum(x * log(prob) - lgamma(x + 1))
if (log) r else exp(r)
}
# vector of time points
tv <- 1:20
# Negative log likelihood function
NLL.func <- function(p1, p2, mu1, mu2, y){
t <- y$tv
n1 <- y$n1
n2 <- y$n2
n3 <- y$n3
P1 <- (p1*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*((-mu2)*(mu2 - p1 + p2) +
mu1*(mu2 + 2*p2)) - mu2*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu2*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu2*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))))
P2 <- (p2*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*(-mu1^2 + 2*mu2*p1 +
mu1*(mu2 - p1 + p2)) - mu1*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu1*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu1*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))))
P3 <- 1 - P1 - P2
p.all <- c(P1, P2, P3)
-sum(dmnom2(c(n1, n2, n3), prob = p.all, log = TRUE))
}
## Generate simulated data
# Model equations as expressions,
P1 <- expression((p1*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*((-mu2)*(mu2 - p1 + p2) +
mu1*(mu2 + 2*p2)) - mu2*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu2*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu2*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))
P2 <- expression((p2*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*(-mu1^2 + 2*mu2*p1 +
mu1*(mu2 - p1 + p2)) - mu1*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu1*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu1*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))
# True parameter values
p1t = 2; p2t = 2; mu1t = 0.001; mu2t = 0.001
# Function to calculate probabilities from 'true' parameter values
psim <- function(x){
params <- list(p1 = p1t, p2 = p2t, mu1 = mu1t, mu2 = mu2t, t = x)
eval.P1 <- eval(P1, params)
eval.P2 <- eval(P2, params)
P3 <- 1 - eval.P1 - eval.P2
c(x, matrix(c(eval.P1, eval.P2, P3), ncol = 3))
}
pdat <- sapply(tv, psim, simplify = TRUE)
Pdat <- as.data.frame(t(pdat))
names(Pdat) <- c("time", "P1", "P2", "P3")
# Generate simulated data set from probabilities
n = rep(20, length(tv))
p = as.matrix(Pdat[,2:4])
y <- as.data.frame(rmultinomial(n,p))
yt <- cbind(tv, y)
names(yt) <- c("tv", "n1", "n2", "n3")
# mle2 call
mle.fit <- mle2(NLL.func, data = list(y = yt),
start = list(p1 = p1t, p2 = p2t, mu1 = mu1t, mu2 = mu2t),
control = list(maxit = 5000, factr = 1e-10, lmm = 17),
method = "L-BFGS-B", skip.hessian = TRUE,
lower = list(p1 = 0, p2 = 0, mu1 = 0, mu2 = 0))
###########################################################################
I interpret the error as having to do with the finite difference approximation failing. If so, perhaps a gradient function would help? If you agree, I've described my unsuccessful attempt at writing a gradient function below. If a gradient function is unnecessary, ignore the remainder of this message.
After playing with your function, I can't agree with your interpretation of what could be wrong.
During optim iterations your function is dmnom2 is getting negative values for prob and that leads to the error messages.
I checked this by inserting the following lines in NLL.func after the assignment to p.all:
cat("NLL.func p.all {P1,P2,P3}\n")
print(matrix(p.all, ncol=3))
At some stage entries for P1, P2, P3 become negative (which ones and how many depends on the random number generator).
Try set.seed(1), set.seed(11) and set.seed(413) to see what happens.
The expressions are too complicated for further analysis.
Assuming your expressions are correct, you will need to restrict P1,P2,P3 to take on valid values.
Berend
Dear R Help,
Thank you to those who responded to my questions about mle2/optim
convergence. A few responders pointed out that the optim error seems to
arise when either one of the probabilities P1, P2, or P3 become negative
or infinite. One suggested examining the exponential terms within the
P1 and P2 equations.
I may have made some progress along these lines. The exponential terms
in the equations for P1 and P2 go to infinity at certain (large) values
of t. The exponential terms can be found in lines 1, 5, and 7 in the P1
and P2 expressions below. Here is some example code:
###########################################################################
# P1 and P2 equations
P1 <- expression((p1*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*((-mu2)*(mu2 - p1 + p2) +
mu1*(mu2 + 2*p2)) - mu2*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu2*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu2*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))
P2 <- expression((p2*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*(-mu1^2 + 2*mu2*p1 +
mu1*(mu2 - p1 + p2)) - mu1*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu1*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu1*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))
# Vector of t values
tv <- c(1:200)
# 'true' parameter values
p1t = 2; p2t = 2; mu1t = 0.001; mu2t = 0.001
# Function to calculate probabilities from 'true' parameter values
psim <- function(x){
params <- list(p1 = p1t, p2 = p2t, mu1 = mu1t, mu2 = mu2t, t = x)
eval.P1 <- eval(P1, params)
eval.P2 <- eval(P2, params)
P3 <- 1 - eval.P1 - eval.P2
c(x, matrix(c(eval.P1, eval.P2, P3), ncol = 3))
}
pdat <- sapply(tv, psim, simplify = TRUE)
Pdat <- as.data.frame(t(pdat))
names(Pdat) <- c("time", "P1", "P2", "P3")
matplot(Pdat[,-1], type = "l", xlab = "time", ylab = "Probability",
col = c("black", "brown", "blue"),
lty = c(1:3), lwd = 2, ylim = c(0,1))
legend("topright", c("P1", "P2", "P3"),
col = c("black", "brown", "blue"),
lty = c(1:3), lwd = 2)
Pdat[160:180,] # psim function begins to return "NaN" at t = 178
# exponential terms in P1 and P2 expressions are problematic
params <- list(p1 = p1t, p2 = p2t, mu1 = mu1t, mu2 = mu2t, t = 178)
exp1 <- expression(exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))
eval(exp1, params) # returns Inf at t = 178
exp2 <- expression(exp((1/2)*(mu1 + mu2 + p1 + p2 +
sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t))
eval(exp2, params) # also returns Inf at t = 178
##########################################################################
The time step at which the exponential terms go to infinity depends on
the values of the parameters p1, p2, mu1, mu2. It seems that the
convergence problems may be due to these exponential terms going to
infinity. Thus my convergence problem appears to be an overflow problem?
Unfortunately, I'm not sure where to go from here. Due to the
complexity of the P1 and P2 equations, there's no clear way to rearrange
the equations to eliminate t from the exponential terms.
Does anyone have any suggestions on how to address this problem?
Perhaps there is a way to bound p1, p2, mu1, and mu2 to avoid the
exponential terms going to infinity? Or bound P1 and P2?
Any suggestions would be greatly appreciated.
Adam Zeilinger
On 10/5/2012 3:06 AM, Berend Hasselman wrote:
On 05-10-2012, at 07:12, Adam Zeilinger wrote:
Hello R Help,
I am trying solve an MLE convergence problem: I would like to estimate four parameters, p1, p2, mu1, mu2, which relate to the probabilities, P1, P2, P3, of a multinomial (trinomial) distribution. I am using the mle2() function and feeding it a time series dataset composed of four columns: time point, number of successes in category 1, number of successes in category 2, and number of success in category 3. The column headers are: t, n1, n2, and n3.
The mle2() function converges occasionally, and I need to improve the rate of convergence when used in a stochastic simulation, with multiple stochastically generated datasets. When mle2() does not converge, it returns an error: "Error in optim(par = c(2, 2, 0.001, 0.001), fn = function (p) : L-BFGS-B needs finite values of 'fn'." I am using the L-BFGS-B optimization method with a lower box constraint of zero for all four parameters. While I do not know any theoretical upper limit(s) to the parameter values, I have not seen any parameter estimates above 2 when using empirical data. It seems that when I start with certain 'true' parameter values, the rate of convergence is quite high, whereas other "true" parameter values are very difficult to estimate. For example, the true parameter values p1 = 2, p2 = 2, mu1 = 0.001, mu2 = 0.001 causes convergence problems, but the parameter values p1 = 0.3, p2 = 0.3, mu1 = 0.08, mu2 = 0.08 lead to high convergence rate. I've chose!
! n these two sets of values because they represent the upper and lower estimates of parameter values derived from graphical methods.
After playing with your function, I can't agree with your interpretation of what could be wrong.
During optim iterations your function is dmnom2 is getting negative values for prob and that leads to the error messages.
I checked this by inserting the following lines in NLL.func after the assignment to p.all:
cat("NLL.func p.all {P1,P2,P3}\n")
print(matrix(p.all, ncol=3))
At some stage entries for P1, P2, P3 become negative (which ones and how many depends on the random number generator).
Try set.seed(1), set.seed(11) and set.seed(413) to see what happens.
The expressions are too complicated for further analysis.
Assuming your expressions are correct, you will need to restrict P1,P2,P3 to take on valid values.
Berend
Dear R Help,
Thank you to those who responded to my questions about mle2/optim convergence. A few responders pointed out that the optim error seems to arise when either one of the probabilities P1, P2, or P3 become negative or infinite. One suggested examining the exponential terms within the P1 and P2 equations.
I may have made some progress along these lines. The exponential terms in the equations for P1 and P2 go to infinity at certain (large) values of t. The exponential terms can be found in lines 1, 5, and 7 in the P1 and P2 expressions below. Here is some example code:
###########################################################################
# P1 and P2 equations
P1 <- expression((p1*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*((-mu2)*(mu2 - p1 + p2) +
mu1*(mu2 + 2*p2)) - mu2*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu2*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu2*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))
P2 <- expression((p2*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*(-mu1^2 + 2*mu2*p1 +
mu1*(mu2 - p1 + p2)) - mu1*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu1*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu1*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))
# Vector of t values
tv <- c(1:200)
# 'true' parameter values
p1t = 2; p2t = 2; mu1t = 0.001; mu2t = 0.001
# Function to calculate probabilities from 'true' parameter values
psim <- function(x){
params <- list(p1 = p1t, p2 = p2t, mu1 = mu1t, mu2 = mu2t, t = x)
eval.P1 <- eval(P1, params)
eval.P2 <- eval(P2, params)
P3 <- 1 - eval.P1 - eval.P2
c(x, matrix(c(eval.P1, eval.P2, P3), ncol = 3))
}
pdat <- sapply(tv, psim, simplify = TRUE)
Pdat <- as.data.frame(t(pdat))
names(Pdat) <- c("time", "P1", "P2", "P3")
matplot(Pdat[,-1], type = "l", xlab = "time", ylab = "Probability",
col = c("black", "brown", "blue"),
lty = c(1:3), lwd = 2, ylim = c(0,1))
legend("topright", c("P1", "P2", "P3"),
col = c("black", "brown", "blue"),
lty = c(1:3), lwd = 2)
Pdat[160:180,] # psim function begins to return "NaN" at t = 178
# exponential terms in P1 and P2 expressions are problematic
params <- list(p1 = p1t, p2 = p2t, mu1 = mu1t, mu2 = mu2t, t = 178)
exp1 <- expression(exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))
eval(exp1, params) # returns Inf at t = 178
exp2 <- expression(exp((1/2)*(mu1 + mu2 + p1 + p2 +
sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t))
eval(exp2, params) # also returns Inf at t = 178
##########################################################################
The time step at which the exponential terms go to infinity depends on the values of the parameters p1, p2, mu1, mu2. It seems that the convergence problems may be due to these exponential terms going to infinity. Thus my convergence problem appears to be an overflow problem?
It is quite possible that letting t becoming larger and larger is going to get you into trouble here,
But the maximum value of t in your initial post was 20.
So it's unlikely that the value of t was causing the problems with the calculation of the gradient (error message "non-finite finite-difference value [3]").
If you change the lower bounds for mu1 and mu2 to something slightly larger than 0 mle2 converges. Because now optim has enough leeway to calculate a finite difference. At least I think that is the cause of your initial problem.
Unfortunately, I'm not sure where to go from here. Due to the complexity of the P1 and P2 equations, there's no clear way to rearrange the equations to eliminate t from the exponential terms.
Does anyone have any suggestions on how to address this problem? Perhaps there is a way to bound p1, p2, mu1, and mu2 to avoid the exponential terms going to infinity? Or bound P1 and P2?
If you were to transform P1, P2 and P3 to something between 0 and 1 you would be changing the model.
Berend
Dear R help,
Thanks again for the responses. I increased the lower constraint to:
lower = list(p1 = 0.0001, p2 = 0.0001, mu1 = 0.0001, mu2 = 0.0001).
I also included an upper box constraint of:
upper = list(p1 = Inf, p2 = Inf, mu1 = p1t, mu2 = p2t).
Making these changes improved the rate of convergence among stochastic
simulation runs, but I still had convergence problems.
I found success in switching from mle2/optim to spg (BB package). So
far, spg has produced similarly precise estimates as L-BFGS-B and
consistently provides parameter estimates.
If anyone is interested, here is the new objective function and spg
call, instead of my previous objective function and mle2 call. All
other parts of my reproducible code are the same as I've previously
supplied:
######################################################################
library(BB)
# Objective function for spg()
NLL2 <- function(par, y){
p1 <- par[1]
p2 <- par[2]
mu1 <- par[3]
mu2 <- par[4]
t <- y$tv
n1 <- y$n1
n2 <- y$n2
n3 <- y$n3
P1 <- (p1*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*((-mu2)*(mu2 - p1 + p2) +
mu1*(mu2 + 2*p2)) - mu2*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu2*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu2*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))))
P2 <- (p2*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*(-mu1^2 + 2*mu2*p1 +
mu1*(mu2 - p1 + p2)) - mu1*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu1*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu1*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))))
P3 <- 1 - P1 - P2
p.all <- c(P1, P2, P3)
#cat("NLL.free p.all {P1,P2,P3}\n")
#print(matrix(p.all, ncol=3))
-sum(dmnom2(c(n1, n2, n3), prob = p.all, log = TRUE))
}
par <- c(p1t, p2t, mu1t, mu2t)
spg.fit <- spg(par = par, fn = NLL2, y = yt,
lower = c(0.001, 0.001, 0.001, 0.001),
control = list(maxit = 5000))
########################################################################
My next problem is that spg takes about twice as long as L-BFGS-B to
converge. The spg help file strongly suggests the use of an exact
gradient function to improve speed. But I am having trouble writing a
gradient function. Here is what I have so far:
I derived the gradient function by taking the derivative of my NLL
equation with respect to each parameter. My NLL equation is the
probability mass function of the trinomial distribution. Here is some
reproducible code:
#########################################################################
library(Ryacas)
p1 <- Sym("p1"); p2 <- Sym("p2"); mu1 <- Sym("mu1"); mu2 <- Sym("mu2")
t <- Sym("t"); n1 <- Sym("n1"); n2 <- Sym("n2"); n3 <- Sym("n3")
P1.symb <- ((p1*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*((-mu2)*(mu2 - p1 + p2) +
mu1*(mu2 + 2*p2)) - mu2*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu2*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu2*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))
P2.symb <- ((p2*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*(-mu1^2 + 2*mu2*p1 +
mu1*(mu2 - p1 + p2)) - mu1*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu1*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu1*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))
P3.symb <- 1 - P1.symb - P2.symb
# gradient equation with respect to parameter p1 for probability mass
# function of the trinomial distribution with probabilities P1, P2, P3
gr.p1 <- deriv(log(P1.symb^n1), p1) + deriv(log(P2.symb^n2), p1) +
deriv(log(P3.symb^n3), p1)
######################################################################
gr.p1 is very large equation, which I won't reproduce here. Let's say
that the four gradient equations for the four parameters are defined as
gr.p1, gr.p2, gr.mu1, gr.mu2, and all are derived as described above for
gr.p1. These gradient equations are functions of p1, p2, mu1, mu2, t,
n1, n2, and n3. My current gradient function is:
grr <- function(par, y){
p1 <- par[1]
p2 <- par[2]
mu1 <- par[3]
mu2 <- par[4]
t <- y[,1]
n1 <- y[,2]
n2 <- y[,3]
n3 <- y[,4]
gr.p1 <- ....
gr.p2 <- ....
gr.mu1 <- ....
gr.mu2 <- ....
c(gr.p1, gr.p2, gr.mu1, gr.mu2)
}
The problem is that I need to supply values for t, n1, n2, and n3 to the
gradient function, which are from the dataset yt, above. When I supply
the dataset yt, the grr function produces a vector of length 4*nrow(yt)
= 80. However, spg requires a gradient function that returns a vector
of length(par) = 4. When I include this gradient function in my spg
function, I get an error that the gradient function is incorrect.
Does anyone have any suggestions on how to write my gradient function?
Am I calculating the gradient equation, gr.p1 incorrectly? As always,
any help would be much appreciated.
Adam Zeilinger
On 10/8/2012 3:44 AM, Berend Hasselman wrote:
See below.
On 08-10-2012, at 07:39, Adam Zeilinger wrote:
Dear R Help,
Thank you to those who responded to my questions about mle2/optim convergence. A few responders pointed out that the optim error seems to arise when either one of the probabilities P1, P2, or P3 become negative or infinite. One suggested examining the exponential terms within the P1 and P2 equations.
I may have made some progress along these lines. The exponential terms in the equations for P1 and P2 go to infinity at certain (large) values of t. The exponential terms can be found in lines 1, 5, and 7 in the P1 and P2 expressions below. Here is some example code:
###########################################################################
# P1 and P2 equations
P1 <- expression((p1*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*((-mu2)*(mu2 - p1 + p2) +
mu1*(mu2 + 2*p2)) - mu2*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu2*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu2*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))
P2 <- expression((p2*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*(-mu1^2 + 2*mu2*p1 +
mu1*(mu2 - p1 + p2)) - mu1*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu1*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu1*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))
# Vector of t values
tv <- c(1:200)
# 'true' parameter values
p1t = 2; p2t = 2; mu1t = 0.001; mu2t = 0.001
# Function to calculate probabilities from 'true' parameter values
psim <- function(x){
params <- list(p1 = p1t, p2 = p2t, mu1 = mu1t, mu2 = mu2t, t = x)
eval.P1 <- eval(P1, params)
eval.P2 <- eval(P2, params)
P3 <- 1 - eval.P1 - eval.P2
c(x, matrix(c(eval.P1, eval.P2, P3), ncol = 3))
}
pdat <- sapply(tv, psim, simplify = TRUE)
Pdat <- as.data.frame(t(pdat))
names(Pdat) <- c("time", "P1", "P2", "P3")
matplot(Pdat[,-1], type = "l", xlab = "time", ylab = "Probability",
col = c("black", "brown", "blue"),
lty = c(1:3), lwd = 2, ylim = c(0,1))
legend("topright", c("P1", "P2", "P3"),
col = c("black", "brown", "blue"),
lty = c(1:3), lwd = 2)
Pdat[160:180,] # psim function begins to return "NaN" at t = 178
# exponential terms in P1 and P2 expressions are problematic
params <- list(p1 = p1t, p2 = p2t, mu1 = mu1t, mu2 = mu2t, t = 178)
exp1 <- expression(exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))
eval(exp1, params) # returns Inf at t = 178
exp2 <- expression(exp((1/2)*(mu1 + mu2 + p1 + p2 +
sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t))
eval(exp2, params) # also returns Inf at t = 178
##########################################################################
The time step at which the exponential terms go to infinity depends on the values of the parameters p1, p2, mu1, mu2. It seems that the convergence problems may be due to these exponential terms going to infinity. Thus my convergence problem appears to be an overflow problem?
It is quite possible that letting t becoming larger and larger is going to get you into trouble here,
But the maximum value of t in your initial post was 20.
So it's unlikely that the value of t was causing the problems with the calculation of the gradient (error message "non-finite finite-difference value [3]").
If you change the lower bounds for mu1 and mu2 to something slightly larger than 0 mle2 converges. Because now optim has enough leeway to calculate a finite difference. At least I think that is the cause of your initial problem.
Unfortunately, I'm not sure where to go from here. Due to the complexity of the P1 and P2 equations, there's no clear way to rearrange the equations to eliminate t from the exponential terms.
Does anyone have any suggestions on how to address this problem? Perhaps there is a way to bound p1, p2, mu1, and mu2 to avoid the exponential terms going to infinity? Or bound P1 and P2?
If you were to transform P1, P2 and P3 to something between 0 and 1 you would be changing the model.
Berend
Dear R help,
Thanks again for the responses. I increased the lower constraint to:
lower = list(p1 = 0.0001, p2 = 0.0001, mu1 = 0.0001, mu2 = 0.0001).
I also included an upper box constraint of:
upper = list(p1 = Inf, p2 = Inf, mu1 = p1t, mu2 = p2t).
Making these changes improved the rate of convergence among stochastic simulation runs, but I still had convergence problems.
I found success in switching from mle2/optim to spg (BB package). So far, spg has produced similarly precise estimates as L-BFGS-B and consistently provides parameter estimates.
If anyone is interested, here is the new objective function and spg call, instead of my previous objective function and mle2 call. All other parts of my reproducible code are the same as I've previously supplied:
######################################################################
library(BB)
# Objective function for spg()
NLL2 <- function(par, y){
p1 <- par[1]
p2 <- par[2]
mu1 <- par[3]
mu2 <- par[4]
t <- y$tv
n1 <- y$n1
n2 <- y$n2
n3 <- y$n3
P1 <- (p1*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*((-mu2)*(mu2 - p1 + p2) +
mu1*(mu2 + 2*p2)) - mu2*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu2*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu2*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))))
P2 <- (p2*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*(-mu1^2 + 2*mu2*p1 +
mu1*(mu2 - p1 + p2)) - mu1*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu1*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu1*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))))
P3 <- 1 - P1 - P2
p.all <- c(P1, P2, P3)
#cat("NLL.free p.all {P1,P2,P3}\n")
#print(matrix(p.all, ncol=3))
-sum(dmnom2(c(n1, n2, n3), prob = p.all, log = TRUE))
}
par <- c(p1t, p2t, mu1t, mu2t)
spg.fit <- spg(par = par, fn = NLL2, y = yt,
lower = c(0.001, 0.001, 0.001, 0.001),
control = list(maxit = 5000))
########################################################################
My next problem is that spg takes about twice as long as L-BFGS-B to converge. The spg help file strongly suggests the use of an exact gradient function to improve speed. But I am having trouble writing a gradient function. Here is what I have so far:
I derived the gradient function by taking the derivative of my NLL equation with respect to each parameter. My NLL equation is the probability mass function of the trinomial distribution. Here is some reproducible code:
#########################################################################
library(Ryacas)
p1 <- Sym("p1"); p2 <- Sym("p2"); mu1 <- Sym("mu1"); mu2 <- Sym("mu2")
t <- Sym("t"); n1 <- Sym("n1"); n2 <- Sym("n2"); n3 <- Sym("n3")
P1.symb <- ((p1*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*((-mu2)*(mu2 - p1 + p2) +
mu1*(mu2 + 2*p2)) - mu2*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu2*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu2*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))
P2.symb <- ((p2*((-1 + exp(sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2)))*t))*(-mu1^2 + 2*mu2*p1 +
mu1*(mu2 - p1 + p2)) - mu1*sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))) -
exp(sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))*t)*
mu1*sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2))) +
2*exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)*mu1*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))/
exp((1/2)*(mu1 + mu2 + p1 + p2 + sqrt((mu1 + mu2 + p1 + p2)^2 -
4*(mu2*p1 + mu1*(mu2 + p2))))*t)/(2*(mu2*p1 + mu1*(mu2 + p2))*
sqrt((mu1 + mu2 + p1 + p2)^2 - 4*(mu2*p1 + mu1*(mu2 + p2)))))
P3.symb <- 1 - P1.symb - P2.symb
# gradient equation with respect to parameter p1 for probability mass
# function of the trinomial distribution with probabilities P1, P2, P3
gr.p1 <- deriv(log(P1.symb^n1), p1) + deriv(log(P2.symb^n2), p1) + deriv(log(P3.symb^n3), p1)
######################################################################
gr.p1 is very large equation, which I won't reproduce here. Let's say that the four gradient equations for the four parameters are defined as gr.p1, gr.p2, gr.mu1, gr.mu2, and all are derived as described above for gr.p1. These gradient equations are functions of p1, p2, mu1, mu2, t, n1, n2, and n3. My current gradient function is:
grr <- function(par, y){
p1 <- par[1]
p2 <- par[2]
mu1 <- par[3]
mu2 <- par[4]
t <- y[,1]
n1 <- y[,2]
n2 <- y[,3]
n3 <- y[,4]
gr.p1 <- ....
gr.p2 <- ....
gr.mu1 <- ....
gr.mu2 <- ....
c(gr.p1, gr.p2, gr.mu1, gr.mu2)
}
The problem is that I need to supply values for t, n1, n2, and n3 to the gradient function, which are from the dataset yt, above. When I supply the dataset yt, the grr function produces a vector of length 4*nrow(yt) = 80. However, spg requires a gradient function that returns a vector of length(par) = 4. When I include this gradient function in my spg function, I get an error that the gradient function is incorrect.
Does anyone have any suggestions on how to write my gradient function?
Your grr function is not returning the gradient of NLL2 w.r.t. the parameters p1,p2,mu1,mu2.
You have 20 time periods and 4 parameters. Your grr is returning a single vector containing the gradients stacked (length=4*20).
You need to return the gradient of -sum(dmnom2(c(n1, n2, n3), prob = p.all, log = TRUE)) with respect to those parameters.
That would be a vector of length 4.
Am I calculating the gradient equation, gr.p1 incorrectly?