class: center, middle, inverse, title-slide # POL90: Applied Quantitative Analysis ## Chapter 7: Simple Linear Regression ### Prof Wasow, Assistant Professor, Politics ### 2022-03-07 --- <style type="text/css"> .regression10 table { font-size: 10px; } .regression12 table { font-size: 12px; } .regression14 table { font-size: 14px; } </style> # Announcements .large[ * Assignments + Report 1 ] -- .large[ * Statistical Sleuth + Read Chapter 7 + Supplement - http://appliedstats.org/chapter7.html ] --- # Schedule <table> <thead> <tr> <th style="text-align:right;"> Week </th> <th style="text-align:left;"> Date </th> <th style="text-align:left;"> Day </th> <th style="text-align:left;"> Title </th> <th style="text-align:right;"> Chapter </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 6 </td> <td style="text-align:left;"> Feb 21 </td> <td style="text-align:left;"> Mon </td> <td style="text-align:left;"> Alternatives to the t-Tools </td> <td style="text-align:right;"> 4 </td> </tr> <tr> <td style="text-align:right;"> 6 </td> <td style="text-align:left;"> Feb 23 </td> <td style="text-align:left;"> Wed </td> <td style="text-align:left;"> Comparison Among Several Samples </td> <td style="text-align:right;"> 5 </td> </tr> <tr> <td style="text-align:right;"> 7 </td> <td style="text-align:left;"> Feb 28 </td> <td style="text-align:left;"> Mon </td> <td style="text-align:left;"> Comparison Among Several Samples </td> <td style="text-align:right;"> 5 </td> </tr> <tr> <td style="text-align:right;"> 7 </td> <td style="text-align:left;"> Mar 2 </td> <td style="text-align:left;"> Wed </td> <td style="text-align:left;"> Simple Linear Regression </td> <td style="text-align:right;"> 7 </td> </tr> <tr> <td style="text-align:right;color: black !important;background-color: yellow !important;"> 8 </td> <td style="text-align:left;color: black !important;background-color: yellow !important;"> Mar 7 </td> <td style="text-align:left;color: black !important;background-color: yellow 
!important;"> Mon </td> <td style="text-align:left;color: black !important;background-color: yellow !important;"> Simple Linear Regression </td> <td style="text-align:right;color: black !important;background-color: yellow !important;"> 7 </td> </tr> <tr> <td style="text-align:right;"> 8 </td> <td style="text-align:left;"> Mar 9 </td> <td style="text-align:left;"> Wed </td> <td style="text-align:left;"> Regression by Calculation </td> <td style="text-align:right;"> 7 </td> </tr> <tr> <td style="text-align:right;"> 9 </td> <td style="text-align:left;"> Mar 14 </td> <td style="text-align:left;"> Mon </td> <td style="text-align:left;"> Spring Recess </td> <td style="text-align:right;"> - </td> </tr> <tr> <td style="text-align:right;"> 9 </td> <td style="text-align:left;"> Mar 16 </td> <td style="text-align:left;"> Wed </td> <td style="text-align:left;"> Spring Recess </td> <td style="text-align:right;"> - </td> </tr> <tr> <td style="text-align:right;"> 10 </td> <td style="text-align:left;"> Mar 21 </td> <td style="text-align:left;"> Mon </td> <td style="text-align:left;"> Null hypothesis, R-squared </td> <td style="text-align:right;"> 8 </td> </tr> <tr> <td style="text-align:right;"> 10 </td> <td style="text-align:left;"> Mar 23 </td> <td style="text-align:left;"> Wed </td> <td style="text-align:left;"> Multiple regression </td> <td style="text-align:right;"> 8 </td> </tr> </tbody> </table> --- ## Assignment schedule <table> <thead> <tr> <th style="text-align:right;"> Week </th> <th style="text-align:left;"> Date </th> <th style="text-align:left;"> Day </th> <th style="text-align:left;"> Assignment </th> <th style="text-align:right;"> Percent </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 7 </td> <td style="text-align:left;"> Mar 4 </td> <td style="text-align:left;"> Fri </td> <td style="text-align:left;"> Report1 </td> <td style="text-align:right;"> 6 </td> </tr> <tr> <td style="text-align:right;color: black !important;background-color: yellow 
!important;"> 8 </td> <td style="text-align:left;color: black !important;background-color: yellow !important;"> Mar 11 </td> <td style="text-align:left;color: black !important;background-color: yellow !important;"> Fri </td> <td style="text-align:left;color: black !important;background-color: yellow !important;"> PS06 </td> <td style="text-align:right;color: black !important;background-color: yellow !important;"> 3 </td> </tr> <tr> <td style="text-align:right;"> 9 </td> <td style="text-align:left;"> Mar 18 </td> <td style="text-align:left;"> Fri </td> <td style="text-align:left;"> Spring break </td> <td style="text-align:right;"> NA </td> </tr> <tr> <td style="text-align:right;"> 10 </td> <td style="text-align:left;"> Mar 25 </td> <td style="text-align:left;"> Fri </td> <td style="text-align:left;"> PS07 </td> <td style="text-align:right;"> 3 </td> </tr> <tr> <td style="text-align:right;"> 11 </td> <td style="text-align:left;"> Apr 1 </td> <td style="text-align:left;"> Fri </td> <td style="text-align:left;"> PS08 </td> <td style="text-align:right;"> 3 </td> </tr> <tr> <td style="text-align:right;"> 12 </td> <td style="text-align:left;"> Apr 8 </td> <td style="text-align:left;"> Fri </td> <td style="text-align:left;"> Report2 </td> <td style="text-align:right;"> 8 </td> </tr> <tr> <td style="text-align:right;"> 13 </td> <td style="text-align:left;"> Apr 15 </td> <td style="text-align:left;"> Fri </td> <td style="text-align:left;"> PS09 </td> <td style="text-align:right;"> 3 </td> </tr> <tr> <td style="text-align:right;"> 14 </td> <td style="text-align:left;"> Apr 22 </td> <td style="text-align:left;"> Fri </td> <td style="text-align:left;"> PS10 </td> <td style="text-align:right;"> 3 </td> </tr> <tr> <td style="text-align:right;"> 15 </td> <td style="text-align:left;"> Apr 29 </td> <td style="text-align:left;"> Fri </td> <td style="text-align:left;"> Report3 </td> <td style="text-align:right;"> 10 </td> </tr> <tr> <td style="text-align:right;"> 16 </td> <td 
style="text-align:left;"> May 6 </td> <td style="text-align:left;"> Fri </td> <td style="text-align:left;"> NA </td> <td style="text-align:right;"> NA </td> </tr> </tbody> </table> --- class: middle, center # Simple Linear Regression with # Sunspot Activity & Skin Cancer --- ## Sunspot Activity & Skin Cancer .large[ - Data - `cancer_rate` indicates yearly skin cancer rates (per 100,000 people) in Connecticut from 1938 to 1972 - `sunspot_activity` indicates those years that came two years after higher than average sunspot activity and those years that came two years after lower than average sunspot activity. - `year` - Source: *Statistical Sleuth*, Chapter 3, Problem 23. Data from D. F. Andrews and A. M. Herzberg, Data, New York: Springer-Verlag, 1985. ] --- ## Load data ```r solar <- Sleuth3::ex0323 %>% clean_names() dim(solar) ``` ``` [1] 35 3 ``` ```r head(solar, 3) ``` ``` year cancer_rate sunspot_activity 1 1938 0.8 Low 2 1939 1.3 High 3 1940 1.4 High ``` ```r tail(solar, 3) ``` ``` year cancer_rate sunspot_activity 33 1970 4.8 High 34 1971 4.8 High 35 1972 4.8 High ``` --- ## Plot data ```r solar %>% ggplot() + aes(y = cancer_rate, x = sunspot_activity) + geom_boxplot() ``` <img src="week08_01_files/figure-html/unnamed-chunk-5-1.png" width="432" style="display: block; margin: auto;" /> --- ## Test for Difference in Means ```r t.test(cancer_rate ~ sunspot_activity, data = solar, var.equal = TRUE) ``` ``` Two Sample t-test data: cancer_rate by sunspot_activity *t = 1.1, df = 33, p-value = 0.3 alternative hypothesis: true difference in means between group High and group Low is not equal to 0 95 percent confidence interval: -0.3724 1.2591 sample estimates: mean in group High mean in group Low 3.173 2.730 ``` -- ```r mean_low <- mean(solar$cancer_rate[solar$sunspot_activity == "Low"]) mean_high <- mean(solar$cancer_rate[solar$sunspot_activity == "High"]) mean_high - mean_low ``` ``` [1] 0.4433 ``` --- ## Visualizing Difference in Means <img 
src="week08_01_files/figure-html/unnamed-chunk-8-1.png" width="80%" style="display: block; margin: auto;" /> --- class: middle, center # Simple Regression: # Dummy Variables --- ## Single Mean Model, No Slope ```r # single mean model (one intercept) lm_mean <- lm(cancer_rate ~ 1, data = solar) summary(lm_mean) ``` ``` Call: lm(formula = cancer_rate ~ 1, data = solar) Residuals: Min 1Q Median 3Q Max -2.12 -1.02 -0.02 0.93 1.88 Coefficients: Estimate Std. Error t value Pr(>|t|) *(Intercept) 2.920 0.199 14.7 2.9e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 1.18 on 34 degrees of freedom ``` --- ## Single Mean Model, No Slope <br><br><br> `\begin{eqnarray*} \mu\{Y|X\} & = & \beta_0 + \beta_1(X) \\ \mu\{\operatorname{cancer\_rate}\} & = & \beta_{0} \\ \mu\{\operatorname{cancer\_rate}\} & = & 2.92 \end{eqnarray*}` --- ## Single Mean Model, Intercept, No Slope <img src="week08_01_files/figure-html/unnamed-chunk-11-1.png" width="90%" style="display: block; margin: auto;" /> --- ## One Intercept, One Dummy, No Slope ```r # two mean model (two intercepts) lm_sunspot <- lm(cancer_rate ~ sunspot_activity, data = solar) summary(lm_sunspot) ``` ``` Call: lm(formula = cancer_rate ~ sunspot_activity, data = solar) Residuals: Min 1Q Median 3Q Max -1.973 -0.830 -0.130 0.998 1.970 Coefficients: Estimate Std. Error t value Pr(>|t|) *(Intercept) 3.173 0.303 10.47 5.1e-12 *** *sunspot_activityLow -0.443 0.401 -1.11 0.28 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1 Residual standard error: 1.17 on 33 degrees of freedom Multiple R-squared: 0.0357, Adjusted R-squared: 0.0065 F-statistic: 1.22 on 1 and 33 DF, p-value: 0.277 ``` --- ## Equation Intercept, Dummy, No Slope <br> `\begin{eqnarray*} \mu\{Y|X\} & = & \beta_0 + \beta_1(X) \\ \mu\{\operatorname{cancer\_rate} | \operatorname{sunspot\_activity}\} & = & \beta_{0} + \beta_{1}(\operatorname{sunspot\_activity}_{\operatorname{Low}}) \\ \mu\{\operatorname{cancer\_rate} | \operatorname{sunspot\_activity}\} & = & 3.17 - 0.44(\operatorname{sunspot\_activity}_{\operatorname{Low}}) \end{eqnarray*}` -- - .large[Example: Sunspot Activity High] `\begin{eqnarray*} \mu\{\operatorname{cancer\_rate} | \operatorname{sunspot\_activity}\} & = & 3.17 - 0.44(0) \\ \mu\{\operatorname{cancer\_rate} | \operatorname{sunspot\_activity}\} & = & 3.17 \end{eqnarray*}` -- - .large[Example: Sunspot Activity Low] `\begin{eqnarray*} \mu\{\operatorname{cancer\_rate} | \operatorname{sunspot\_activity}\} & = & 3.17 - 0.44(1) \\ \mu\{\operatorname{cancer\_rate} | \operatorname{sunspot\_activity}\} & = & 2.73 \end{eqnarray*}` --- ## Plot of Intercept, Dummy, No Slope <img src="week08_01_files/figure-html/unnamed-chunk-14-1.png" width="90%" style="display: block; margin: auto;" /> --- class: middle, center # Simple Regression: # Conditional Mean --- ## Conditional Mean, Intercept and Slope ```r # single conditional model (one slope, one intercept) lm_year <- lm(cancer_rate ~ year, data = solar) summary(lm_year) ``` ``` Call: lm(formula = cancer_rate ~ year, data = solar) Residuals: Min 1Q Median 3Q Max -0.61 -0.23 -0.02 0.22 0.73 Coefficients: Estimate Std. Error t value Pr(>|t|) *(Intercept) -2.12e+02 1.13e+01 -18.8 <2e-16 *** *year 1.10e-01 5.78e-03 19.0 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1 Residual standard error: 0.345 on 33 degrees of freedom Multiple R-squared: 0.917, Adjusted R-squared: 0.914 F-statistic: 362 on 1 and 33 DF, p-value: <2e-16 ``` --- ## Conditional Mean Model, Intercept + Slope <br> `\begin{eqnarray*} \mu\{Y|X\} & = & \beta_0 + \beta_1(X) \\ \mu\{\operatorname{cancer\_rate} | \operatorname{year}\} & = & \beta_{0} + \beta_{1}(\operatorname{year}) \\ \mu\{\operatorname{cancer\_rate} | \operatorname{year}\} & = & -212.18 + 0.11(\operatorname{year}) \end{eqnarray*}` -- - .large[Example: 1960] `\begin{eqnarray*} \mu\{\operatorname{cancer\_rate} | \operatorname{year}\} & = & -212.18 + 0.11(1960) \\ & = & -212.18 + 215.6 \\ & = & 3.42 \\ \end{eqnarray*}` -- - .large[Example: 1961] `\begin{eqnarray*} \mu\{\operatorname{cancer\_rate} | \operatorname{year}\} & = & -212.18 + 0.11(1961) \\ & = & 3.53 \\ \end{eqnarray*}` --- ## Conditional Mean, Intercept and Slope <img src="week08_01_files/figure-html/unnamed-chunk-16-1.png" width="90%" style="display: block; margin: auto;" /> --- class: middle, center # Multiple Regression --- ## Multiple Regression: Intercept, Dummy and Slope ```r # single conditional mean + mean shift (one slope, two intercepts) lm_sunspot_and_year <- lm(cancer_rate ~ sunspot_activity + year, data = solar) summary(lm_sunspot_and_year) ``` ``` Call: lm(formula = cancer_rate ~ sunspot_activity + year, data = solar) Residuals: Min 1Q Median 3Q Max -0.4329 -0.2365 -0.0203 0.1648 0.5396 Coefficients: Estimate Std. Error t value Pr(>|t|) *(Intercept) -2.12e+02 8.68e+00 -24.41 < 2e-16 *** *sunspot_activityLow -4.43e-01 9.06e-02 -4.89 2.7e-05 *** *year 1.10e-01 4.44e-03 24.78 < 2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1 Residual standard error: 0.265 on 32 degrees of freedom Multiple R-squared: 0.952, Adjusted R-squared: 0.949 F-statistic: 319 on 2 and 32 DF, p-value: <2e-16 ``` --- ## Conditional Mean, Intercept, Dummy and Slope <br> `\begin{eqnarray*} \mu\{Y|X\} & = & \beta_0 + \beta_1(X_1) + \beta_2(X_2) \\ \mu\{\operatorname{cancer\_rate} | \operatorname{sunspot}_{\operatorname{Low}}, \operatorname{year}\} & = & \beta_{0} + \beta_{1}(\operatorname{sunspot}_{\operatorname{Low}}) + \beta_{2}(\operatorname{year}) \\ \mu\{\operatorname{cancer\_rate} | \operatorname{sunspot}_{\operatorname{Low}}, \operatorname{year}\} & = & -211.93 -0.44(\operatorname{sunspot}_{\operatorname{Low}}) \\ & & + 0.11(\operatorname{year}) \end{eqnarray*}` -- - .large[Example: year = 1957, sunspot = Low] `\begin{eqnarray*} \mu\{\operatorname{cancer\_rate} | \operatorname{sunspot}_{\operatorname{Low}}, \operatorname{year}\} & = & -211.93 -0.44(1) + 0.11(1957) \\ & = & 2.95 \end{eqnarray*}` -- - .large[Example: year = 1960, sunspot = High] `\begin{eqnarray*} \mu\{\operatorname{cancer\_rate} | \operatorname{sunspot}_{\operatorname{Low}}, \operatorname{year}\} & = & -211.93 -0.44(0) + 0.11(1960) \\ & = & 3.73 \end{eqnarray*}` --- ## Conditional Mean, Intercept, Dummy and Slope <img src="week08_01_files/figure-html/unnamed-chunk-18-1.png" width="90%" style="display: block; margin: auto;" /> --- class: regression14 ## Look at table of results ```r stargazer(lm_mean, lm_sunspot, lm_year, lm_sunspot_and_year, type = "html", header = FALSE, digits = 2, omit.stat = c("f", "ser", "rsq", "adj.rsq"), single.row = TRUE, intercept.bottom = FALSE ) ``` <table style="text-align:center"><tr><td colspan="5" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"></td><td colspan="4"><em>Dependent variable:</em></td></tr> <tr><td></td><td colspan="4" style="border-bottom: 1px solid black"></td></tr> <tr><td style="text-align:left"></td><td colspan="4">cancer_rate</td></tr> <tr><td 
style="text-align:left"></td><td>(1)</td><td>(2)</td><td>(3)</td><td>(4)</td></tr> <tr><td colspan="5" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Constant</td><td>2.92<sup>***</sup> (0.20)</td><td>3.17<sup>***</sup> (0.30)</td><td>-212.20<sup>***</sup> (11.30)</td><td>-211.90<sup>***</sup> (8.68)</td></tr> <tr><td style="text-align:left">sunspot_activityLow</td><td></td><td>-0.44 (0.40)</td><td></td><td>-0.44<sup>***</sup> (0.09)</td></tr> <tr><td style="text-align:left">year</td><td></td><td></td><td>0.11<sup>***</sup> (0.01)</td><td>0.11<sup>***</sup> (0.004)</td></tr> <tr><td colspan="5" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Observations</td><td>35</td><td>35</td><td>35</td><td>35</td></tr> <tr><td colspan="5" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"><em>Note:</em></td><td colspan="4" style="text-align:right"><sup>*</sup>p<0.1; <sup>**</sup>p<0.05; <sup>***</sup>p<0.01</td></tr> </table> --- ## Flocabulary example: Linear equations .center[ <img src="images/flocabulary_screenshot.jpg" width="85%" style="display: block; margin: auto;" /> ] .footnote[https://www.flocabulary.com/unit/linear-equations/] --- class: middle, center # Chapter 7: Meat Processing and PH Level --- ## Meat Processing and PH Level - Data: - 10 steer carcasses were assigned to be measured for pH at one of five times after slaughter. - Research Questions: - A certain kind of meat processing may begin once the pH in postmortem muscle of a steer carcass decreases to 6.0 from a pH at time of slaughter around 7.0 to 7.2. An estimate is needed of the time after slaughter at which the pH reaches 6.0. - Summary of Statistical Findings: - It is estimated that the mean pH at 3.9 hours is 6. It is predicted that at least 95% of steer carcasses will reach a pH of 6.0 sometime between 2.94 and 5.10 hours after slaughter. 
--- ## Meat Processing and PH Level ```r meat <- Sleuth3::case0702 meat ``` ``` Time pH 1 1 7.02 2 1 6.93 3 2 6.42 4 2 6.51 5 4 6.07 6 4 5.99 7 6 5.59 8 6 5.80 9 8 5.51 10 8 5.36 ``` --- ```r meat %>% ggplot() + aes(x = Time, y = pH) + geom_point() + * geom_smooth(method = "loess", se = FALSE) ``` <img src="week08_01_files/figure-html/unnamed-chunk-23-1.png" width="792" style="display: block; margin: auto;" /> --- ```r meat <- meat %>% mutate(log_time = log(Time)) meat %>% ggplot() + aes(x = log_time, y = pH) + geom_point() + geom_smooth(method = "loess", se = FALSE) ``` <img src="week08_01_files/figure-html/unnamed-chunk-24-1.png" width="792" style="display: block; margin: auto;" /> --- ```r meat %>% ggplot() + aes(x = log_time, y = pH) + geom_point() + * geom_smooth(method = "lm", se = FALSE) ``` <img src="week08_01_files/figure-html/unnamed-chunk-25-1.png" width="792" style="display: block; margin: auto;" /> --- ## Meat Processing and PH Level .center[ <img src="images/ss_display_7_4.png" width="85%" style="display: block; margin: auto;" /> ] .footnote[Source: *Statistical Sleuth*, 3e, Display 7.4] --- ## What do we mean by Std. Error? - What do Estimate and Std. Error below mean? ```r # base R regression slope lm(formula = pH ~ log_time, data = meat) %>% summary() ``` ``` Call: lm(formula = pH ~ log_time, data = meat) Residuals: Min 1Q Median 3Q Max -0.1147 -0.0589 0.0209 0.0361 0.1166 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 6.9836 0.0485 143.9 6.1e-15 *** *log_time -0.7257 0.0344 -21.1 2.7e-08 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1 Residual standard error: 0.0823 on 8 degrees of freedom Multiple R-squared: 0.982, Adjusted R-squared: 0.98 F-statistic: 444 on 1 and 8 DF, p-value: 2.7e-08 ``` --- ## Sampling distribution for `\(\beta_1\)` in linear regression - Imagine we were able to draw new samples of 10 carcasses, repeatedly - Imagine we calculated a slope each time - Imagine we plotted the distribution of slopes - That sampling distribution has a center and spread <br><br> .center[ <img src="images/ss_display_7_7_modified.png" width="80%" style="display: block; margin: auto;" /> ] .footnote[Source: *Statistical Sleuth*, 3e, Display 7.7] --- class: middle, center # Bootstrapping with # Linear Models --- ## Why are we bootstrapping now? - Develop an intuition for standard error - What does Std. Error = 0.03443 below mean? ```r # base R regression slope lm(formula = pH ~ log_time, data = meat) %>% summary() ``` ``` Call: lm(formula = pH ~ log_time, data = meat) Residuals: Min 1Q Median 3Q Max -0.1147 -0.0589 0.0209 0.0361 0.1166 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 6.9836 0.0485 143.9 6.1e-15 *** *log_time -0.7257 0.0344 -21.1 2.7e-08 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1 Residual standard error: 0.0823 on 8 degrees of freedom Multiple R-squared: 0.982, Adjusted R-squared: 0.98 F-statistic: 444 on 1 and 8 DF, p-value: 2.7e-08 ``` --- ## Revisiting sampling and assignment <img src="images/randomization_selection_assignment.jpg" width="90%" style="display: block; margin: auto;" /> .pull-right[ .footnote[Source: *Statistical Sleuth*, Display 1.5] ] --- ## Revisiting sampling and assignment <img src="images/randomization_selection_assignment_markedup.jpg" width="90%" style="display: block; margin: auto;" /> .pull-right[ .footnote[Source: *Statistical Sleuth*, Display 1.5] ] --- ## Bootstrapping vs permutation/randomization test * Both use simulation or computational approximation * Bootstrapping + mimics random sampling + assumes sample represents population + draws new 'sample' from original sample + typically draws *with replacement* + *no* randomization of group assignment * Randomization test + mimics random assignment + assumes null hypothesis for effect of 'treatment' + randomizes group assignment + *no* replacement + *no* change in composition of sample --- class: center background-image: url("images/fish_pond.jpg") --- ## Intuition for sampling distribution with bootstrapping ```r meat %>% infer::specify(formula = pH ~ log_time) %>% * infer::generate(reps = 1, type = "bootstrap") ``` ``` Response: pH (numeric) Explanatory: log_time (numeric) # A tibble: 10 × 3 # Groups: replicate [1] replicate pH log_time <int> <dbl> <dbl> 1 1 6.42 0.693 2 1 5.36 2.08 3 1 5.8 1.79 4 1 5.36 2.08 5 1 5.8 1.79 6 1 6.93 0 7 1 5.99 1.39 8 1 5.99 1.39 9 1 5.59 1.79 10 1 5.36 2.08 ``` --- ## Now, let's generate 50 bootstrapped samples - `infer` calls each new bootstrapped sample a replicate ```r meat_bootstrap <- meat %>% infer::specify(formula = pH ~ log_time) %>% infer::generate(reps = 50, type = "bootstrap") dim(meat_bootstrap) ``` ``` [1] 500 3 ``` ```r meat_bootstrap %>% head(10) ``` ``` # A tibble: 10 × 3 # Groups: replicate [1] 
replicate pH log_time <int> <dbl> <dbl> 1 1 7.02 0 2 1 5.8 1.79 3 1 5.59 1.79 4 1 5.99 1.39 5 1 7.02 0 6 1 6.51 0.693 7 1 5.8 1.79 8 1 5.36 2.08 9 1 6.42 0.693 10 1 5.51 2.08 ``` --- ## Let's look at 50th replicate - `infer` calls each new bootstrapped sample a replicate ```r meat_bootstrap %>% tail(10) ``` ``` # A tibble: 10 × 3 # Groups: replicate [1] replicate pH log_time <int> <dbl> <dbl> 1 50 6.07 1.39 2 50 6.93 0 3 50 6.42 0.693 4 50 7.02 0 5 50 6.51 0.693 6 50 6.07 1.39 7 50 6.51 0.693 8 50 6.42 0.693 9 50 5.51 2.08 10 50 6.93 0 ``` --- ## Compare original data & one bootstrapped sample <table class="table" style="font-size: 14px; width: auto !important; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;"> log_time_orig </th> <th style="text-align:right;"> pH_orig </th> <th style="text-align:right;"> log_time_rep1 </th> <th style="text-align:right;"> pH_rep1 </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 0.000 </td> <td style="text-align:right;"> 6.93 </td> <td style="text-align:right;"> 0.000 </td> <td style="text-align:right;"> 7.02 </td> </tr> <tr> <td style="text-align:right;"> 0.000 </td> <td style="text-align:right;"> 7.02 </td> <td style="text-align:right;"> 0.000 </td> <td style="text-align:right;"> 7.02 </td> </tr> <tr> <td style="text-align:right;"> 0.693 </td> <td style="text-align:right;"> 6.42 </td> <td style="text-align:right;"> 0.693 </td> <td style="text-align:right;"> 6.42 </td> </tr> <tr> <td style="text-align:right;"> 0.693 </td> <td style="text-align:right;"> 6.51 </td> <td style="text-align:right;"> 0.693 </td> <td style="text-align:right;"> 6.51 </td> </tr> <tr> <td style="text-align:right;"> 1.386 </td> <td style="text-align:right;"> 5.99 </td> <td style="text-align:right;"> 1.386 </td> <td style="text-align:right;"> 5.99 </td> </tr> <tr> <td style="text-align:right;"> 1.386 </td> <td style="text-align:right;"> 6.07 </td> <td style="text-align:right;"> 1.792 </td> <td 
style="text-align:right;"> 5.59 </td> </tr> <tr> <td style="text-align:right;"> 1.792 </td> <td style="text-align:right;"> 5.59 </td> <td style="text-align:right;"> 1.792 </td> <td style="text-align:right;"> 5.80 </td> </tr> <tr> <td style="text-align:right;"> 1.792 </td> <td style="text-align:right;"> 5.80 </td> <td style="text-align:right;"> 1.792 </td> <td style="text-align:right;"> 5.80 </td> </tr> <tr> <td style="text-align:right;"> 2.079 </td> <td style="text-align:right;"> 5.36 </td> <td style="text-align:right;"> 2.079 </td> <td style="text-align:right;"> 5.36 </td> </tr> <tr> <td style="text-align:right;"> 2.079 </td> <td style="text-align:right;"> 5.51 </td> <td style="text-align:right;"> 2.079 </td> <td style="text-align:right;"> 5.51 </td> </tr> </tbody> </table> --- ## Compare original data & two bootstrapped samples <table class="table" style="font-size: 14px; width: auto !important; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;"> log_time_orig </th> <th style="text-align:right;"> pH_orig </th> <th style="text-align:right;"> log_time_rep2 </th> <th style="text-align:right;"> pH_rep2 </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 0.000 </td> <td style="text-align:right;"> 6.93 </td> <td style="text-align:right;"> 0.000 </td> <td style="text-align:right;"> 6.93 </td> </tr> <tr> <td style="text-align:right;"> 0.000 </td> <td style="text-align:right;"> 7.02 </td> <td style="text-align:right;"> 0.693 </td> <td style="text-align:right;"> 6.51 </td> </tr> <tr> <td style="text-align:right;"> 0.693 </td> <td style="text-align:right;"> 6.42 </td> <td style="text-align:right;"> 0.693 </td> <td style="text-align:right;"> 6.51 </td> </tr> <tr> <td style="text-align:right;"> 0.693 </td> <td style="text-align:right;"> 6.51 </td> <td style="text-align:right;"> 0.693 </td> <td style="text-align:right;"> 6.51 </td> </tr> <tr> <td style="text-align:right;"> 1.386 </td> <td style="text-align:right;"> 5.99 </td> <td 
style="text-align:right;"> 1.386 </td> <td style="text-align:right;"> 5.99 </td> </tr> <tr> <td style="text-align:right;"> 1.386 </td> <td style="text-align:right;"> 6.07 </td> <td style="text-align:right;"> 1.792 </td> <td style="text-align:right;"> 5.59 </td> </tr> <tr> <td style="text-align:right;"> 1.792 </td> <td style="text-align:right;"> 5.59 </td> <td style="text-align:right;"> 1.792 </td> <td style="text-align:right;"> 5.80 </td> </tr> <tr> <td style="text-align:right;"> 1.792 </td> <td style="text-align:right;"> 5.80 </td> <td style="text-align:right;"> 2.079 </td> <td style="text-align:right;"> 5.36 </td> </tr> <tr> <td style="text-align:right;"> 2.079 </td> <td style="text-align:right;"> 5.36 </td> <td style="text-align:right;"> 2.079 </td> <td style="text-align:right;"> 5.51 </td> </tr> <tr> <td style="text-align:right;"> 2.079 </td> <td style="text-align:right;"> 5.51 </td> <td style="text-align:right;"> 2.079 </td> <td style="text-align:right;"> 5.51 </td> </tr> </tbody> </table> --- ```r *meat_bootstrap %>% filter(replicate %in% 1:2) %>% ggplot() + aes(x = log_time, y = pH, color = factor(replicate)) + # geom_jitter "jitters" points to reduce overlap geom_jitter(width = .02, height = .02, alpha = .4) + geom_smooth(method = "lm", se = FALSE, alpha = .3) ``` <img src="week08_01_files/figure-html/unnamed-chunk-39-1.png" width="792" style="display: block; margin: auto;" /> --- ```r meat_bootstrap %>% filter(replicate %in% 1:5) %>% ggplot() + aes(x = log_time, y = pH, color = factor(replicate)) + geom_jitter(width = .02, height = .02, alpha = .4) + geom_smooth(method = "lm", se = FALSE, alpha = .3) ``` <img src="week08_01_files/figure-html/unnamed-chunk-40-1.png" width="792" style="display: block; margin: auto;" /> --- ```r meat_bootstrap %>% filter(replicate %in% 1:10) %>% ggplot() + aes(x = log_time, y = pH, color = factor(replicate)) + geom_jitter(width = .02, height = .02, alpha = .4) + geom_smooth(method = "lm", se = FALSE, alpha = .3) ``` <img 
src="week08_01_files/figure-html/unnamed-chunk-41-1.png" width="792" style="display: block; margin: auto;" /> --- ```r meat_bootstrap %>% filter(replicate %in% 1:20) %>% ggplot() + aes(x = log_time, y = pH, color = factor(replicate)) + geom_jitter(width = .02, height = .02, alpha = .4) + geom_smooth(method = "lm", se = FALSE, alpha = .3) ``` <img src="week08_01_files/figure-html/unnamed-chunk-42-1.png" width="792" style="display: block; margin: auto;" /> --- ```r meat_bootstrap %>% ggplot() + aes(x = log_time, y = pH, color = factor(replicate)) + geom_jitter(width = .02, height = .02, alpha = .4) + geom_smooth(method = "lm", se = FALSE, alpha = .3) ``` <img src="week08_01_files/figure-html/unnamed-chunk-43-1.png" width="792" style="display: block; margin: auto;" /> --- ## Now we calculate slope with base R & infer ```r # base R regression slope lm(formula = pH ~ log_time, data = meat) %>% broom::tidy() %>% kable(digits = 3) ``` <table> <thead> <tr> <th style="text-align:left;"> term </th> <th style="text-align:right;"> estimate </th> <th style="text-align:right;"> std.error </th> <th style="text-align:right;"> statistic </th> <th style="text-align:right;"> p.value </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> (Intercept) </td> <td style="text-align:right;"> 6.984 </td> <td style="text-align:right;"> 0.049 </td> <td style="text-align:right;"> 143.90 </td> <td style="text-align:right;"> 0 </td> </tr> <tr> <td style="text-align:left;"> log_time </td> <td style="text-align:right;"> -0.726 </td> <td style="text-align:right;"> 0.034 </td> <td style="text-align:right;"> -21.08 </td> <td style="text-align:right;"> 0 </td> </tr> </tbody> </table> ```r # infer regression slope meat %>% * specify(formula = pH ~ log_time) %>% * calculate(stat = "slope") ``` ``` Response: pH (numeric) Explanatory: log_time (numeric) # A tibble: 1 × 1 stat <dbl> 1 -0.726 ``` --- ## Now generate 1000 slopes ```r meat_slope_bootstrapped <- meat %>% specify(formula = pH ~ 
log_time) %>% generate(reps = 1000, type = "bootstrap") %>% calculate(stat = "slope") meat_slope_bootstrapped %>% filter(replicate %in% 1:10) ``` ``` Response: pH (numeric) Explanatory: log_time (numeric) # A tibble: 10 × 2 replicate stat <int> <dbl> 1 1 -0.738 2 2 -0.734 3 3 -0.676 4 4 -0.718 5 5 -0.698 6 6 -0.790 7 7 -0.640 8 8 -0.715 9 9 -0.614 10 10 -0.719 ``` --- ## `visualize` 100 slopes ```r meat_slope_bootstrapped %>% filter(replicate %in% 1:100) %>% infer::visualize() + ggtitle("Bootstrapped Sampling Distribution") ``` <img src="week08_01_files/figure-html/unnamed-chunk-46-1.png" width="50%" style="display: block; margin: auto;" /> --- ## `visualize` 1000 slopes ```r meat_slope_bootstrapped %>% infer::visualize() + ggtitle("Bootstrapped Sampling Distribution") ``` <img src="week08_01_files/figure-html/unnamed-chunk-47-1.png" width="50%" style="display: block; margin: auto;" /> --- ## `visualize` 1000 slopes with line at 0 ```r meat_slope_bootstrapped %>% infer::visualize() + scale_x_continuous(limits = c(-1,0)) + geom_vline(xintercept = 0, color = "red") + ggtitle("Bootstrapped Sampling Distribution") ``` <img src="week08_01_files/figure-html/unnamed-chunk-48-1.png" width="50%" style="display: block; margin: auto;" /> --- ## What is our bootstrapped `\(\beta_1\)` and Std. Error? 
```r # Again, base R regression lm(formula = pH ~ log_time, data = meat) %>% broom::tidy() %>% kable(digits = 3) ``` <table> <thead> <tr> <th style="text-align:left;"> term </th> <th style="text-align:right;"> estimate </th> <th style="text-align:right;"> std.error </th> <th style="text-align:right;"> statistic </th> <th style="text-align:right;"> p.value </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> (Intercept) </td> <td style="text-align:right;"> 6.984 </td> <td style="text-align:right;"> 0.049 </td> <td style="text-align:right;"> 143.90 </td> <td style="text-align:right;"> 0 </td> </tr> <tr> <td style="text-align:left;"> log_time </td> <td style="text-align:right;"> -0.726 </td> <td style="text-align:right;"> 0.034 </td> <td style="text-align:right;"> -21.08 </td> <td style="text-align:right;"> 0 </td> </tr> </tbody> </table> ```r # estimated beta_1 is mean of sampling distribution mean(meat_slope_bootstrapped$stat) ``` ``` [1] -0.7268 ``` ```r # estimated se is sd of sampling distribution sd(meat_slope_bootstrapped$stat) ``` ``` [1] 0.03494 ``` --- ## Sophie Hill: Eyeball Regression <img src="images/sophie_hill_eyeball_regression_tweet.png" width="75%" style="display: block; margin: auto;" /> --- ## Sophie Hill: Eyeball Regression <img src="images/sophie_hill_eyeball_regression.png" width="75%" style="display: block; margin: auto;" /> .footnote[https://sophieehill.shinyapps.io/eyeball-regression/] --- class: center, middle # Questions?