We will model the time evolution of an algal sample taken in the Adriatic Sea (Cavallini, 1993). Time (\(x\)) is expressed in days and biomass (\(y\)), which is a measure of growth, is measured in mm2 (what is actually measured is the surface covered by biomass in a microscopic sample). The data seem to follow a logistic curve: \[y_i = \frac{\theta_0}{1 + \exp (-\theta_1(x_i - \theta_2))}\]
algal <- data.frame(x = c(11, 15, 18, 23, 26, 31, 39, 44, 54, 64, 74),
y = c(.0048, .0105, .0207, .0619, .3370,
.7400, 1.7, 2.45, 3.5, 4.5, 5.09))
kable(algal, row.names = T)
x | y | |
---|---|---|
1 | 11 | 0.0048 |
2 | 15 | 0.0105 |
3 | 18 | 0.0207 |
4 | 23 | 0.0619 |
5 | 26 | 0.3370 |
6 | 31 | 0.7400 |
7 | 39 | 1.7000 |
8 | 44 | 2.4500 |
9 | 54 | 3.5000 |
10 | 64 | 4.5000 |
11 | 74 | 5.0900 |
require(ggplot2)
p <- ggplot(data = algal, aes(x, y)) + geom_point()
p
In R, we can perform the grid search and find the starting value manually.
# setup grid
grid <- expand.grid(theta0 = seq(0, 10, 2),
theta1 = seq(0, .5, .1),
theta2 = seq(40, 60, 5))
kable(head(grid))
theta0 | theta1 | theta2 |
---|---|---|
0 | 0 | 40 |
2 | 0 | 40 |
4 | 0 | 40 |
6 | 0 | 40 |
8 | 0 | 40 |
10 | 0 | 40 |
We specify the nonlinear model and compute the sum of squares of error manually.
nlmodel <- function(x, theta) {
theta[1] / (1 + exp(-theta[2] * (x - theta[3])))
}
sse <- apply(grid, 1, function(theta) {
sum((algal$y - nlmodel(algal$x, theta))^2)
})
kable(head(data.frame(grid, sse))) # only the head of the table is shown
theta0 | theta1 | theta2 | sse |
---|---|---|---|
0 | 0 | 40 | 67.96616 |
2 | 0 | 40 | 42.13636 |
4 | 0 | 40 | 38.30656 |
6 | 0 | 40 | 56.47676 |
8 | 0 | 40 | 96.64696 |
10 | 0 | 40 | 158.81716 |
Final starting values:
grid[which.min(sse), ]
## theta0 theta1 theta2
## 82 6 0.1 50
nls_fit <- nls(y ~ nlmodel(x, theta), data = algal, trace = T,
start = list(theta = c(6, .1, 50)))
## 0.6350226 : 6.0 0.1 50.0
## 0.2857354 : 5.0141669 0.1157241 45.8625826
## 0.2384547 : 5.0887283 0.1221786 45.7619359
## 0.2381781 : 5.0956237 0.1211468 45.7748836
## 0.2381669 : 5.0947305 0.1213394 45.7743463
## 0.2381665 : 5.0949589 0.1213007 45.7748138
## 0.2381664 : 5.0949235 0.1213077 45.7747829
## 0.2381664 : 5.0949315 0.1213063 45.7747980
## 0.2381664 : 5.0949302 0.1213066 45.7747966
nls_fit
## Nonlinear regression model
## model: y ~ nlmodel(x, theta)
## data: algal
## theta1 theta2 theta3
## 5.0949 0.1213 45.7748
## residual sum-of-squares: 0.2382
##
## Number of iterations to convergence: 8
## Achieved convergence tolerance: 1.925e-06
sm <- summary(nls_fit, correlation = T)
sm$coefficients
## Estimate Std. Error t value Pr(>|t|)
## theta1 5.0949302 0.19766239 25.77592 5.505407e-09
## theta2 0.1213066 0.01159284 10.46393 6.044273e-06
## theta3 45.7747966 1.17097569 39.09116 2.015699e-10
sm$correlation
## theta1 theta2 theta3
## theta1 1.0000000 -0.6865027 0.8166978
## theta2 -0.6865027 1.0000000 -0.6528791
## theta3 0.8166978 -0.6528791 1.0000000
# the output shows theta 1-3, it's actually theta 0-2
mapply(function(est, se) est + qt(c(.025, .975), df.residual(nls_fit)) * se,
sm$coefficients[, 1], sm$coefficients[, 2], SIMPLIFY = F)
## $theta1
## [1] 4.63912 5.55074
##
## $theta2
## [1] 0.09457346 0.14803972
##
## $theta3
## [1] 43.07452 48.47507
y0 <- fitted(nls_fit)
cf <- cf <- nls_fit$m$gradient()
se <- apply(t(cf), 2, function(f0) {
sm$sigma * {t(f0) %*% sm$cov.unscaled %*% f0}^(.5)
})
ll <- fitted(nls_fit) + qt(.025, df.residual(nls_fit)) * se
ul <- fitted(nls_fit) + qt(.975, df.residual(nls_fit)) * se
ci <- data.frame(algal, ll, ul)
# prediction intervals are similar
se <- apply(t(cf), 2, function(f0) {
sm$sigma * {1 + t(f0) %*% sm$cov.unscaled %*% f0}^(.5)
})
ll <- fitted(nls_fit) + qt(.025, df.residual(nls_fit)) * se
ul <- fitted(nls_fit) + qt(.975, df.residual(nls_fit)) * se
pi <- data.frame(algal, ll, ul)
p + stat_function(fun = nlmodel, args = list(coef(nls_fit))) +
geom_smooth(aes(ymin = ll, ymax = ul), data = ci, stat="identity") +
geom_smooth(aes(ymin = ll, ymax = ul), data = pi, stat="identity")