Statistics for Laboratory Scientists ( 140.615 )

Regression and Correlation - Advanced

Loading the SPH.140.615 package.

library(SPH.140.615)

Fathers’ and daughters’ heights

The Pearson & Lee (1906) data.

# Run the examples attached to the pear help topic; the echoed output below
# shows the structure (str) and a numeric summary of the data frame.
example(pear)
## 
## pear> str(pear)
## 'data.frame':    1376 obs. of  2 variables:
##  $ father  : num  63.6 64 65.5 58.8 59.4 62.5 62.9 65.7 67.3 58.7 ...
##  $ daughter: num  52.6 53.9 55.8 56.2 56.1 56.1 56.9 56.3 56.7 57.2 ...
## 
## pear> summary(pear)
##      father         daughter    
##  Min.   :58.40   Min.   :52.60  
##  1st Qu.:65.80   1st Qu.:62.10  
##  Median :67.80   Median :63.80  
##  Mean   :67.68   Mean   :63.84  
##  3rd Qu.:69.60   3rd Qu.:65.60  
##  Max.   :76.00   Max.   :72.60

Calculate the regression of daughter’s height on father’s height (i.e., for predicting daughter from father).

# Regression A: least-squares fit with daughter's height as the response
# and father's height as the predictor.
lm.outA <- lm(formula = daughter ~ father, data = pear)
summary(lm.outA)
## 
## Call:
## lm(formula = daughter ~ father, data = pear)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.2148 -1.4814  0.0221  1.4914  8.3047 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 30.26347    1.49937   20.18   <2e-16 ***
## father       0.49609    0.02214   22.41   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.274 on 1374 degrees of freedom
## Multiple R-squared:  0.2677, Adjusted R-squared:  0.2672 
## F-statistic: 502.3 on 1 and 1374 DF,  p-value: < 2.2e-16

Calculate the regression of father’s height on daughter’s height (i.e., for predicting father from daughter).

# Regression B: the roles are swapped, father's height is the response
# and daughter's height is the predictor.
lm.outB <- lm(formula = father ~ daughter, data = pear)
summary(lm.outB)
## 
## Call:
## lm(formula = father ~ daughter, data = pear)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.8487 -1.5024  0.0714  1.6102  7.5563 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 33.23102    1.53840   21.60   <2e-16 ***
## daughter     0.53961    0.02408   22.41   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.371 on 1374 degrees of freedom
## Multiple R-squared:  0.2677, Adjusted R-squared:  0.2672 
## F-statistic: 502.3 on 1 and 1374 DF,  p-value: < 2.2e-16

The intercept and slope for regression A.

# Intercept and slope of regression A as a named numeric vector.
# Use the coef() extractor rather than `$coef`, which only works through
# partial matching of the `coefficients` element.
coA <- coef(lm.outA)
coA
## (Intercept)      father 
##  30.2634711   0.4960904

The intercept and slope for regression B.

# Intercept and slope of regression B as a named numeric vector.
# Use the coef() extractor rather than `$coef`, which only works through
# partial matching of the `coefficients` element.
coB <- coef(lm.outB)
coB
## (Intercept)    daughter 
##   33.231025    0.539609

Transform regression B coefficients: \(y = mx + b \Rightarrow x = y/m - b/m\), i.e., the inverted line has intercept \(-b/m\) and slope \(1/m\).

# Invert regression B (father = b + m * daughter) into the form
# daughter = -b/m + father/m so it can be drawn on the same axes as regression A.
# The intercept has to be updated first: it needs the original slope coB[2],
# which the second assignment overwrites.
# Note: the element names are unchanged, so the slope is still labelled "daughter".
coB[1] <- -coB[1]/coB[2]
coB[2] <- 1/coB[2]
coB
## (Intercept)    daughter 
##  -61.583525    1.853194

Plot the data with the two regression lines.

# Scatterplot of the data overlaid with both lines:
# regression A in green, the transformed regression B in orange.
plot(pear)
abline(coef = coA, col = "green", lwd = 2)
abline(coef = coB, col = "orange", lwd = 2)

Span and height example

The data.

# Scatterplot of the span data with the least-squares line of stature on span
# drawn as a dashed red line.
plot(span, xlab = "span [ inches ]", ylab = "height [ inches ]")
abline(lsfit(x = span$span, y = span$stature), lwd = 2, lty = 2, col = "red")

Predicting height from span.

# Least-squares fit with stature as the response and span as the predictor.
lm.fit <- lm(formula = stature ~ span, data = span)
summary(lm.fit)
## 
## Call:
## lm(formula = stature ~ span, data = span)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3080 -1.1447 -0.0186  1.1502  7.5810 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 21.70025    1.13670   19.09   <2e-16 ***
## span         0.66939    0.01653   40.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.707 on 1048 degrees of freedom
## Multiple R-squared:  0.6101, Adjusted R-squared:  0.6097 
## F-statistic:  1640 on 1 and 1048 DF,  p-value: < 2.2e-16
# Point estimates only: select the "Estimate" column of the coefficient table
# by name, and spell out `coefficients` instead of relying on `$` partial matching.
summary(lm.fit)$coefficients[, "Estimate"]
## (Intercept)        span 
##  21.7002483   0.6693937

Residual standard deviation.

# Residual standard error as stored in the model summary.
summary(lm.fit)[["sigma"]]
## [1] 1.707371

The correlation.

# Correlation between the span and stature columns, kept in r for later use.
r <- cor(span[["span"]], span[["stature"]])
print(r)
## [1] 0.7810676
# Correlation matrix for every pair of columns in the span data frame.
cor(span)
##              span   stature
## span    1.0000000 0.7810676
## stature 0.7810676 1.0000000

Slope of the regression line.

# Slope rebuilt from the correlation and the two standard deviations.
r * sd(span[["stature"]]) / sd(span[["span"]])
## [1] 0.6693937

Height standard deviation.

# Standard deviation of the stature column.
sd(span[["stature"]])
## [1] 2.732911

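Typical prediction error using span to predict height. This is slightly smaller than the residual standard deviation reported above (1.707371) because sd() divides by n − 1, whereas the residual standard error divides by n − 2.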

# Standard deviation of stature scaled by sqrt(1 - r^2).
sd(span[["stature"]]) * sqrt(1 - r^2)
## [1] 1.706557