class: center, middle, inverse, title-slide # POL90: Applied Quantitative Analysis ## Chapter 7: Simple Linear Regression ### Prof Wasow, Assistant Professor, Politics ### 2022-03-02 --- <style type="text/css"> .regression10 table { font-size: 10px; } .regression12 table { font-size: 12px; } .regression14 table { font-size: 14px; } </style> # Announcements .large[ * Assignments + Report 1 ] -- .large[ * Statistical Sleuth + Read Chapter 7 + Supplement - http://appliedstats.org/chapter7.html ] --- # Schedule <table> <thead> <tr> <th style="text-align:right;"> Week </th> <th style="text-align:left;"> Date </th> <th style="text-align:left;"> Day </th> <th style="text-align:left;"> Title </th> <th style="text-align:right;"> Chapter </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 5 </td> <td style="text-align:left;"> Feb 16 </td> <td style="text-align:left;"> Wed </td> <td style="text-align:left;"> A Closer Look at Assumptions </td> <td style="text-align:right;"> 3 </td> </tr> <tr> <td style="text-align:right;"> 6 </td> <td style="text-align:left;"> Feb 21 </td> <td style="text-align:left;"> Mon </td> <td style="text-align:left;"> Alternatives to the t-Tools </td> <td style="text-align:right;"> 4 </td> </tr> <tr> <td style="text-align:right;"> 6 </td> <td style="text-align:left;"> Feb 23 </td> <td style="text-align:left;"> Wed </td> <td style="text-align:left;"> Comparison Among Several Samples </td> <td style="text-align:right;"> 5 </td> </tr> <tr> <td style="text-align:right;"> 7 </td> <td style="text-align:left;"> Feb 28 </td> <td style="text-align:left;"> Mon </td> <td style="text-align:left;"> Comparison Among Several Samples </td> <td style="text-align:right;"> 5 </td> </tr> <tr> <td style="text-align:right;color: black !important;background-color: yellow !important;"> 7 </td> <td style="text-align:left;color: black !important;background-color: yellow !important;"> Mar 2 </td> <td style="text-align:left;color: black !important;background-color: 
yellow !important;"> Wed </td> <td style="text-align:left;color: black !important;background-color: yellow !important;"> Simple Linear Regression </td> <td style="text-align:right;color: black !important;background-color: yellow !important;"> 7 </td> </tr> <tr> <td style="text-align:right;"> 8 </td> <td style="text-align:left;"> Mar 7 </td> <td style="text-align:left;"> Mon </td> <td style="text-align:left;"> Simple Linear Regression </td> <td style="text-align:right;"> 7 </td> </tr> <tr> <td style="text-align:right;"> 8 </td> <td style="text-align:left;"> Mar 9 </td> <td style="text-align:left;"> Wed </td> <td style="text-align:left;"> Regression by Calculation </td> <td style="text-align:right;"> 7 </td> </tr> <tr> <td style="text-align:right;"> 9 </td> <td style="text-align:left;"> Mar 14 </td> <td style="text-align:left;"> Mon </td> <td style="text-align:left;"> Spring Recess </td> <td style="text-align:right;"> - </td> </tr> <tr> <td style="text-align:right;"> 9 </td> <td style="text-align:left;"> Mar 16 </td> <td style="text-align:left;"> Wed </td> <td style="text-align:left;"> Spring Recess </td> <td style="text-align:right;"> - </td> </tr> <tr> <td style="text-align:right;"> 10 </td> <td style="text-align:left;"> Mar 21 </td> <td style="text-align:left;"> Mon </td> <td style="text-align:left;"> Null hypothesis, R-squared </td> <td style="text-align:right;"> 8 </td> </tr> </tbody> </table> --- ## Assignment schedule <table> <thead> <tr> <th style="text-align:right;"> Week </th> <th style="text-align:left;"> Date </th> <th style="text-align:left;"> Day </th> <th style="text-align:left;"> Assignment </th> <th style="text-align:right;"> Percent </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 6 </td> <td style="text-align:left;"> Feb 25 </td> <td style="text-align:left;"> Fri </td> <td style="text-align:left;"> PS05 </td> <td style="text-align:right;"> 3 </td> </tr> <tr> <td style="text-align:right;color: black !important;background-color: 
yellow !important;"> 7 </td> <td style="text-align:left;color: black !important;background-color: yellow !important;"> Mar 4 </td> <td style="text-align:left;color: black !important;background-color: yellow !important;"> Fri </td> <td style="text-align:left;color: black !important;background-color: yellow !important;"> Report1 </td> <td style="text-align:right;color: black !important;background-color: yellow !important;"> 6 </td> </tr> <tr> <td style="text-align:right;"> 8 </td> <td style="text-align:left;"> Mar 11 </td> <td style="text-align:left;"> Fri </td> <td style="text-align:left;"> PS06 </td> <td style="text-align:right;"> 3 </td> </tr> <tr> <td style="text-align:right;"> 9 </td> <td style="text-align:left;"> Mar 18 </td> <td style="text-align:left;"> Fri </td> <td style="text-align:left;"> Spring break </td> <td style="text-align:right;"> NA </td> </tr> <tr> <td style="text-align:right;"> 10 </td> <td style="text-align:left;"> Mar 25 </td> <td style="text-align:left;"> Fri </td> <td style="text-align:left;"> PS07 </td> <td style="text-align:right;"> 3 </td> </tr> <tr> <td style="text-align:right;"> 11 </td> <td style="text-align:left;"> Apr 1 </td> <td style="text-align:left;"> Fri </td> <td style="text-align:left;"> PS08 </td> <td style="text-align:right;"> 3 </td> </tr> <tr> <td style="text-align:right;"> 12 </td> <td style="text-align:left;"> Apr 8 </td> <td style="text-align:left;"> Fri </td> <td style="text-align:left;"> Report2 </td> <td style="text-align:right;"> 8 </td> </tr> <tr> <td style="text-align:right;"> 13 </td> <td style="text-align:left;"> Apr 15 </td> <td style="text-align:left;"> Fri </td> <td style="text-align:left;"> PS09 </td> <td style="text-align:right;"> 3 </td> </tr> <tr> <td style="text-align:right;"> 14 </td> <td style="text-align:left;"> Apr 22 </td> <td style="text-align:left;"> Fri </td> <td style="text-align:left;"> PS10 </td> <td style="text-align:right;"> 3 </td> </tr> <tr> <td style="text-align:right;"> 15 </td> <td 
style="text-align:left;"> Apr 29 </td> <td style="text-align:left;"> Fri </td> <td style="text-align:left;"> Report3 </td> <td style="text-align:right;"> 10 </td> </tr> </tbody> </table> --- class: center, middle # Wrapping up Chapter 5: # Multiple comparisons # with ANOVA: Spock Trial --- ## Three ratios, Single Mean vs Seven Mean <br><br><br> <img src="images/f_stat_equation_anova_table16.png" width="1284" style="display: block; margin: auto;" /> --- ## Calculating *p*-value We can use those statistics to calculate the probability of getting an *F*-statistic as extreme or more extreme on an F-distribution with numerator degrees of freedom of 6 and denominator degrees of freedom of 39: ```r # Setting lower.tail = FALSE gives us right tail pf(6.72, 6, 39, lower.tail = FALSE) ``` ``` [1] 0.00006082 ``` The *p*-value is extremely small; therefore, we can reject the null hypothesis that all the means are equal. --- ## Visualizing *F* (6, 39) & *F*-stat = 6.72 ```r visualize::visualize.f(stat = 6.72, df1 = 6, df2 = 39, section = "upper") ``` <img src="week07_02_files/figure-html/unnamed-chunk-8-1.png" width="576" style="display: block; margin: auto;" /> --- ## Check results with `aov` To check the robustness of our manual calculation, we can use the built-in `aov()` function: ```r aov(Percent ~ Judge, data = spock) %>% summary() ``` ``` Df Sum Sq Mean Sq F value Pr(>F) Judge 6 1927 321 6.72 0.000061 *** Residuals 39 1864 48 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ``` --- class: middle, center # Calculating the *F*-statistic # With ANOVA Tables --- ## ANOVA Table: Equal means vs two means * Start with RSS and df for reduced model .vertical-center[ | Source of Variation | Sum of Squares | d.f. 
| Mean Square | *F*-statistic | *p*-value | |------------------------------|----------------|------|-------------|-------------|---------| | `\(RSS_{reduced} - RSS_{full}\)` | ( ) | ( ) | ( ) | ( ) | ( ) | | (Two Means) | ( ) | ( ) | ( ) | | | | (Equal Means) | 3,791.53 | 45 | | | | ] --- ## ANOVA Table: Equal means vs two means * Incorporate RSS and df for full model .vertical-center[ | Source of Variation | Sum of Squares | d.f. | Mean Square | *F*-statistic | *p*-value | |------------------------------|----------------|------|-------------|-------------|---------| | `\(RSS_{reduced} - RSS_{full}\)` | ( ) | ( ) | ( ) | ( ) | ( ) | | (Two Means) | (2190.90) | (44) | () | | | | (Equal Means) | 3,791.53 | 45 | | | | ] --- ## ANOVA Table: Equal means vs two means * Divide `\(RSS_{full}\)` by `\(df_{full}\)` to calculate variance of `\(RSS_{full}\)` or `\(s_p^2\)` .vertical-center[ | Source of Variation | Sum of Squares | d.f. | Mean Square | *F*-statistic | *p*-value | |------------------------------|----------------|------|-------------|-------------|---------| | `\(RSS_{reduced} - RSS_{full}\)` | () | () | () | ( ) | ( ) | | (Two Means) | (2190.90) | (44) | (49.79) | | | | (Equal Means) | 3,791.53 | 45 | | | | ] --- ## ANOVA Table: Equal means vs two means * Calculate `\(RSS_{reduced} - RSS_{full}\)` and `\(df_{reduced}\)` - `\(df_{full}\)` .vertical-center[ | Source of Variation | Sum of Squares | d.f. | Mean Square | *F*-statistic | *p*-value | |------------------------------|----------------|------|-------------|-------------|---------| | `\(RSS_{reduced} - RSS_{full}\)` | (1600.63) | (1) | () | ( ) | ( ) | | (Two Means) | (2190.90) | (44) | (49.79) | | | | (Equal Means) | 3,791.53 | 45 | | | | ] --- ## ANOVA Table: Equal means vs two means * Calculate Mean Square by dividing ESS by ( `\(df_{reduced} - df_{full}\)` ) .vertical-center[ | Source of Variation | Sum of Squares | d.f. 
| Mean Square | *F*-statistic | *p*-value | |------------------------------|----------------|------|-------------|-------------|---------| | `\(RSS_{reduced} - RSS_{full}\)` | (1600.63) | (1) | (1600.63) | ( ) | ( ) | | (Two Means) | (2190.90) | (44) | (49.79) | | | | (Equal Means) | 3,791.53 | 45 | | | | ] --- ## ANOVA Table: Equal means vs two means * Calculate *F*-statistic by dividing Mean Square terms .vertical-center[ | Source of Variation | Sum of Squares | d.f. | Mean Square | *F*-statistic | *p*-value | |------------------------------|----------------|------|-------------|-------------|---------| | `\(RSS_{reduced} - RSS_{full}\)` | (1600.63) | (1) | (1600.63) | (32.15) | ( ) | | (Two Means) | (2190.90) | (44) | (49.79) | | | | (Equal Means) | 3,791.53 | 45 | | | | ] --- ## ANOVA Table: Equal means vs two means * Calculate `\(p\)`-value. In `R`: `pf`( *F*-statistic, `\(df_{reduced} - df_{full}\)` , `\(df_{full}\)` , `lower.tail = FALSE` ) .vertical-center[ | Source of Variation | Sum of Squares | d.f. | Mean Square | *F*-statistic | *p*-value | |------------------------------|----------------|------|-------------|-------------|------------| | `\(RSS_{reduced} - RSS_{full}\)` | (1600.63) | (1) | (1600.63) | (32.15) | (0.000001) | | (Two Means) | (2190.90) | (44) | (49.79) | | | | (Equal Means) | 3,791.53 | 45 | | | | ] --- ## Visualizing *F* (1, 44) & *F*-stat = 32.15 ```r visualize::visualize.f(stat = 32.15, df1 = 1, df2 = 44, section = "upper") ``` <img src="week07_02_files/figure-html/unnamed-chunk-12-1.png" width="576" style="display: block; margin: auto;" /> --- class: middle, center # Two means vs seven means --- ## ANOVA Table: Two means vs seven means * Start with RSS and df for reduced model .vertical-center[ | Source of Variation | Sum of Squares | d.f. 
| Mean Square | *F*-statistic | *p*-value | |------------------------------|----------------|------|-------------|-------------|---------| | `\(RSS_{reduced} - RSS_{full}\)` | ( ) | ( ) | ( ) | ( ) | ( ) | | (Seven Means) | () | () | () | | | | (Two Means) | 2190.90 | 44 | | | | ] --- ## ANOVA Table: Two means vs seven means * Incorporate RSS and df for full model .vertical-center[ | Source of Variation | Sum of Squares | d.f. | Mean Square | *F*-statistic | *p*-value | |------------------------------|----------------|------|-------------|-------------|---------| | `\(RSS_{reduced} - RSS_{full}\)` | ( ) | ( ) | ( ) | ( ) | ( ) | | (Seven Means) | (1864.45) | (39) | () | | | | (Two Means) | 2190.90 | 44 | | | | ] --- ## ANOVA Table: Two means vs seven means * Divide `\(RSS_{full}\)` by `\(df_{full}\)` to calculate variance of `\(RSS_{full}\)` or `\(s_p^2\)` .vertical-center[ | Source of Variation | Sum of Squares | d.f. | Mean Square | *F*-statistic | *p*-value | |------------------------------|----------------|------|-------------|-------------|---------| | `\(RSS_{reduced} - RSS_{full}\)` | ( ) | ( ) | ( ) | ( ) | ( ) | | (Seven Means) | (1864.45) | (39) | (47.81) | | | | (Two Means) | 2190.90 | 44 | | | | ] --- ## ANOVA Table: Two means vs seven means * Calculate `\(RSS_{reduced} - RSS_{full}\)` and `\(df_{reduced}\)` - `\(df_{full}\)` .vertical-center[ | Source of Variation | Sum of Squares | d.f. | Mean Square | *F*-statistic | *p*-value | |------------------------------|----------------|------|-------------|-------------|---------| | `\(RSS_{reduced} - RSS_{full}\)` | (326.45) | (5) | () | () | () | | (Seven Means) | (1864.45) | (39) | (47.81) | | | | (Two Means) | (2190.90) | (44) | | | | ] --- ## ANOVA Table: Two means vs seven means * Calculate Mean Square by dividing ESS by ( `\(df_{reduced} - df_{full}\)` ) .vertical-center[ | Source of Variation | Sum of Squares | d.f. 
| Mean Square | *F*-statistic | *p*-value | |------------------------------|----------------|------|-------------|-------------|---------| | `\(RSS_{reduced} - RSS_{full}\)` | (326.45) | (5) | (65.29) | () | () | | (Seven Means) | (1864.45) | (39) | (47.81) | | | | (Two Means) | (2190.90) | (44) | | | | ] --- ## ANOVA Table: Two means vs seven means * Calculate *F*-statistic by dividing Mean Square terms .vertical-center[ | Source of Variation | Sum of Squares | d.f. | Mean Square | *F*-statistic | *p*-value | |------------------------------|----------------|------|-------------|-------------|---------| | `\(RSS_{reduced} - RSS_{full}\)` | (326.45) | (5) | (65.29) | (1.37) | ( ) | | (Seven Means) | (1864.45) | (39) | (47.81) | | | | (Two Means) | (2190.90) | (44) | | | | ] --- ## ANOVA Table: Two means vs seven means * Calculate `\(p\)`-value. In `R`: `pf`( *F*-statistic, `\(df_{reduced} - df_{full}\)` , `\(df_{full}\)` , `lower.tail = FALSE` ) .vertical-center[ | Source of Variation | Sum of Squares | d.f. | Mean Square | *F*-statistic | *p*-value | |------------------------------|----------------|------|-------------|-------------|---------| | `\(RSS_{reduced} - RSS_{full}\)` | (326.45) | (5) | (65.29) | (1.37) | (0.26) | | (Seven Means) | (1864.45) | (39) | (47.81) | | | | (Two Means) | (2190.90) | (44) | | | | ] --- ## Visualizing *F*(5, 39) & *F*-stat = 1.37 ```r visualize::visualize.f(stat = 1.37, df1 = 5, df2 = 39, section = "upper") ``` <img src="week07_02_files/figure-html/unnamed-chunk-15-1.png" width="576" style="display: block; margin: auto;" /> --- class: middle, center # Question: Which model would you recommend? Why? --- ## Spock study findings .vertical-center[ .large[ - "The percentages of women on Spock's judge's venires (with an average of 15%) were substantially lower than those of the other judges (with an average of 30%)." 
- "The one-sided *p*-value from a two-sample *t*-test comparing the mean percentage of Spock's judge to the mean percentage of all others combined is less than 0.000001." ] ] --- ## Summarizing Chapter 5 <br><br> .large[ * Three big lessons of ANOVA - Parsimony vs explanatory power - Contest of theories - Science is about explaining unexplained variation * Does the evidence suggest that the venires of Spock's judge included fewer women than those of other judges? * For mice, does the evidence suggest caloric restriction increased lifespan? ] --- class: middle, center, inverse # Simple Linear Regression --- ## Regression Terminology .vertical-center[ .large[ - <span style="color:red">Regression Analysis</span> is used to describe the distribution of values of one variable, the response, as a function of other explanatory variables - <span style="color:red">Response Variable</span> `\((Y):\)` variable whose probability distribution is to be explained (also called dependent/endogenous variable) - <span style="color:red">Explanatory Variables</span> `\((X):\)` variables used to explain the distribution of `\(Y\)` (also called independent/exogenous variables) ] ] --- ## Regression Terminology .vertical-center[ .large[ - The simple linear regression model specifies that this relationship is a straight-line function of one explanatory variable `\begin{eqnarray*} \mu\{Y|X\} = \beta_0 + \beta_1 \times X \end{eqnarray*}` - where `\(\beta_0\)` is the intercept and `\(\beta_1\)` is the slope of the regression line. 
] ] --- ## Load Spock Data ```r spock <- Sleuth3::case0502 spock$Judge <- fct_relevel(spock$Judge, c("Spock's", "A", "B", "C", "D", "E", "F") ) head(spock, 12) ``` ``` Percent Judge 1 6.4 Spock's 2 8.7 Spock's 3 13.3 Spock's 4 13.6 Spock's 5 15.0 Spock's 6 15.2 Spock's 7 17.7 Spock's 8 18.6 Spock's 9 23.1 Spock's 10 16.8 A 11 30.8 A 12 33.6 A ``` ```r # calculate the mean mean(spock$Percent) ``` ``` [1] 26.58 ``` --- ## Visualize Single Intercept, No Slope <img src="week07_02_files/figure-html/unnamed-chunk-18-1.png" width="720" style="display: block; margin: auto;" /> --- ## Simplest Regression: Single Intercept <br><br><br><br> .large[ `\begin{eqnarray*} \mu\{Y|X\} = \beta_0 + \beta_1 \times X \end{eqnarray*}` ] --- ## Simplest Regression: Single Intercept <br><br><br><br> .large[ `\begin{eqnarray*} \mu\{Y|X\} = 26.58 + (0) \times X \end{eqnarray*}` ] --- ## Simplest Regression: Single Intercept ```r # use regression to calculate a single intercept (1) lm(Percent ~ 1, data = spock) %>% summary() ``` ``` Call: lm(formula = Percent ~ 1, data = spock) Residuals: Min 1Q Median 3Q Max -20.183 -6.633 0.917 5.792 22.317 Coefficients: Estimate Std. Error t value Pr(>|t|) *(Intercept) 26.58 1.35 19.6 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1 Residual standard error: 9.18 on 45 degrees of freedom ``` --- ## Regression: One Binary Explanatory Variable ```r spock <- spock %>% mutate(not_spock_bin = case_when( Judge != "Spock's" ~ 1, Judge == "Spock's" ~ 0)) spock %>% select(Judge, Percent, not_spock_bin) %>% head(2) ``` ``` Judge Percent not_spock_bin 1 Spock's 6.4 0 2 Spock's 8.7 0 ``` ```r spock %>% select(Judge, Percent, not_spock_bin) %>% sample_n(10) ``` ``` Judge Percent not_spock_bin 1 F 26.4 1 2 E 17.7 1 3 C 30.5 1 4 E 34.8 1 5 F 23.5 1 6 A 30.8 1 7 F 16.5 1 8 Spock's 23.1 0 9 Spock's 15.0 0 10 B 27.0 1 ``` --- ## Visualizing One Intercept, One Shift, No Slope <img src="week07_02_files/figure-html/unnamed-chunk-21-1.png" width="720" style="display: block; margin: auto;" /> --- ## Regression: One Binary Explanatory Variable ```r lm(Percent ~ not_spock_bin, data = spock) %>% summary() ``` ``` Call: lm(formula = Percent ~ not_spock_bin, data = spock) Residuals: Min 1Q Median 3Q Max -12.992 -4.667 0.258 3.785 19.408 Coefficients: Estimate Std. Error t value Pr(>|t|) *(Intercept) 14.62 2.35 6.22 0.00000016 *** *not_spock_bin 14.87 2.62 5.67 0.00000103 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1 Residual standard error: 7.06 on 44 degrees of freedom Multiple R-squared: 0.422, Adjusted R-squared: 0.409 F-statistic: 32.1 on 1 and 44 DF, p-value: 0.00000103 ``` --- ## Regression: Single Intercept, One Shift, No Slope <br><br><br><br> .large[ `\begin{eqnarray*} \mu\{Y|X\} = \beta_0 + \beta_1 \times \textrm{not_spock_bin} \end{eqnarray*}` ] --- ## Regression: Single Intercept, One Shift, No Slope <br><br><br><br> .large[ `\begin{eqnarray*} \mu\{Y|X\} = 14.62 + (+14.87) \times \textrm{not_spock_bin} \end{eqnarray*}` ] --- ## When `not_spock_bin` = 0 <br><br><br><br> .large[ `\begin{eqnarray*} \mu\{Y|X\} &=& 14.62 + (+14.87) \times (0) \end{eqnarray*}` ] --- ## When `not_spock_bin` = 0 <br><br><br><br> .large[ `\begin{eqnarray*} \mu\{Y|X\} &=& 14.62 + (+14.87) \times (0) \\ &=& 14.62 \end{eqnarray*}` ] --- ## When `not_spock_bin` = 1 <br><br><br><br> .large[ `\begin{eqnarray*} \mu\{Y|X\} &=& 14.62 + (+14.87) \times (1) \end{eqnarray*}` ] --- ## When `not_spock_bin` = 1 <br><br><br><br> .large[ `\begin{eqnarray*} \mu\{Y|X\} &=& 14.62 + (+14.87) \times (1) \\ &=& 14.62 +14.87 \\ &=& 29.49 \end{eqnarray*}` ] --- class: center, middle # Regression: One Categorical Variable --- ## Regression: Single Intercept, Multiple Shifts, No Slope <br><br><br><br> .large[ `\begin{eqnarray*} \mu\{Y|X\} & = & \beta_0 + \\ & & \beta_1 \times \textrm{JudgeA} + \\ & & \beta_2 \times \textrm{JudgeB} + \\ & & \beta_3 \times \textrm{JudgeC} + \\ & & \beta_4 \times \textrm{JudgeD} + \\ & & \beta_5 \times \textrm{JudgeE} + \\ & & \beta_6 \times \textrm{JudgeF} \end{eqnarray*}` ] --- ## Visualizing One Intercept, Multiple Shifts, No Slope <img src="week07_02_files/figure-html/unnamed-chunk-23-1.png" width="720" style="display: block; margin: auto;" /> --- ## Regression: One Categorical Variable ```r lm(Percent ~ Judge, data = spock) %>% summary() ``` ``` Call: lm(formula = Percent ~ Judge, data = spock) Residuals: Min 1Q Median 3Q Max -17.32 -4.37 -0.25 3.32 14.78 Coefficients: 
Estimate Std. Error t value Pr(>|t|) *(Intercept) 14.62 2.30 6.34 0.00000017 *** JudgeA 19.50 3.86 5.06 0.00001050 *** JudgeB 18.99 3.64 5.21 0.00000639 *** JudgeC 14.48 3.26 4.44 0.00007153 *** JudgeD 12.38 5.41 2.29 0.0275 * JudgeE 12.34 3.64 3.39 0.0016 ** JudgeF 12.18 3.26 3.74 0.0006 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 6.91 on 39 degrees of freedom Multiple R-squared: 0.508, Adjusted R-squared: 0.433 F-statistic: 6.72 on 6 and 39 DF, p-value: 0.000061 ``` --- ## Regression: Single Intercept, Multiple Shifts, No Slope <br><br><br><br> .large[ `\begin{eqnarray*} \mu\{Y|X\} & = & 14.62 + \\ & & (+19.50) \times \textrm{JudgeA} + \\ & & (+18.99) \times \textrm{JudgeB} + \\ & & (+14.48) \times \textrm{JudgeC} + \\ & & (+12.38) \times \textrm{JudgeD} + \\ & & (+12.34) \times \textrm{JudgeE} + \\ & & (+12.18) \times \textrm{JudgeF} \end{eqnarray*}` ] --- ## When `Judge` == "Spock's" <br><br><br><br> .large[ `\begin{eqnarray*} \mu\{Y|X\} & = & 14.62 + \\ & & (+19.50) \times (0) + \\ & & (+18.99) \times (0) + \\ & & (+14.48) \times (0) + \\ & & (+12.38) \times (0) + \\ & & (+12.34) \times (0) + \\ & & (+12.18) \times (0) \end{eqnarray*}` ] --- ## When `Judge` == "A" <br><br><br><br> .large[ `\begin{eqnarray*} \mu\{Y|X\} & = & 14.62 + \\ & & (+19.50) \times (1) + \\ & & (+18.99) \times (0) + \\ & & (+14.48) \times (0) + \\ & & (+12.38) \times (0) + \\ & & (+12.34) \times (0) + \\ & & (+12.18) \times (0) \end{eqnarray*}` ] --- ## When `Judge` == "C" <br><br><br><br> .large[ `\begin{eqnarray*} \mu\{Y|X\} & = & 14.62 + \\ & & (+19.50) \times (0) + \\ & & (+18.99) \times (0) + \\ & & (+14.48) \times (1) + \\ & & (+12.38) \times (0) + \\ & & (+12.34) \times (0) + \\ & & (+12.18) \times (0) \end{eqnarray*}` ] --- ## Categorical Estimates = Separate Means ```r #JudgeA 14.622 + 19.498 ``` ``` [1] 34.12 ``` ```r #JudgeC 14.622 + 14.478 ``` ``` [1] 29.1 ``` ```r spock %>% group_by(Judge) %>% summarize(mean_judge = 
mean(Percent)) %>% ungroup() %>% as.data.frame() ``` ``` Judge mean_judge 1 Spock's 14.62 2 A 34.12 3 B 33.62 4 C 29.10 5 D 27.00 6 E 26.97 7 F 26.80 ``` --- class: middle, center # Regression with Slopes: # Sunspot Activity & Skin Cancer --- ## Sunspot Activity & Skin Cancer .large[ - Data - `cancer_rate` indicates yearly skin cancer rates (per 100,000 people) in Connecticut from 1938 to 1972 - `sunspot_activity` indicates whether a year came two years after higher-than-average (`High`) or lower-than-average (`Low`) sunspot activity. - `year` - Source: *Statistical Sleuth*, Chapter 3, Problem 23. Data from D. F. Andrews and A. M. Herzberg, Data, New York: Springer-Verlag, 1985. ] --- ## Load data ```r solar <- Sleuth3::ex0323 %>% clean_names() dim(solar) ``` ``` [1] 35 3 ``` ```r head(solar, 3) ``` ``` year cancer_rate sunspot_activity 1 1938 0.8 Low 2 1939 1.3 High 3 1940 1.4 High ``` ```r tail(solar, 3) ``` ``` year cancer_rate sunspot_activity 33 1970 4.8 High 34 1971 4.8 High 35 1972 4.8 High ``` --- ## Plot data ```r solar %>% ggplot() + aes(y = cancer_rate, x = sunspot_activity) + geom_boxplot() ``` <img src="week07_02_files/figure-html/unnamed-chunk-27-1.png" width="45%" style="display: block; margin: auto;" /> --- ## Test for Difference in Means ```r t.test(cancer_rate ~ sunspot_activity, data = solar, var.equal = TRUE) ``` ``` Two Sample t-test data: cancer_rate by sunspot_activity *t = 1.1, df = 33, p-value = 0.3 alternative hypothesis: true difference in means between group High and group Low is not equal to 0 95 percent confidence interval: -0.3724 1.2591 sample estimates: mean in group High mean in group Low 3.173 2.730 ``` -- ```r mean_low <- mean(solar$cancer_rate[solar$sunspot_activity == "Low"]) mean_high <- mean(solar$cancer_rate[solar$sunspot_activity == "High"]) mean_high - mean_low ``` ``` [1] 0.4433 ``` --- ## Visualizing Difference in Means <img 
src="week07_02_files/figure-html/unnamed-chunk-30-1.png" width="80%" style="display: block; margin: auto;" /> --- class: middle, center # Simple Regression: # Dummy Variables --- ## Single Mean Model, No Slope ```r # single mean model (one intercept) lm_mean <- lm(cancer_rate ~ 1, data = solar) summary(lm_mean) ``` ``` Call: lm(formula = cancer_rate ~ 1, data = solar) Residuals: Min 1Q Median 3Q Max -2.12 -1.02 -0.02 0.93 1.88 Coefficients: Estimate Std. Error t value Pr(>|t|) *(Intercept) 2.920 0.199 14.7 2.9e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 1.18 on 34 degrees of freedom ``` --- ## Single Mean Model, No Slope <br><br><br> `\begin{eqnarray*} \mu\{Y|X\} & = & \beta_0 + \beta_1(X) \\ \mu\{\operatorname{cancer\_rate}\} & = & \beta_{0} \\ \mu\{\operatorname{cancer\_rate}\} & = & 2.92 \end{eqnarray*}` --- ## Single Mean Model, Intercept, No Slope <img src="week07_02_files/figure-html/unnamed-chunk-33-1.png" width="90%" style="display: block; margin: auto;" /> --- ## One Intercept, One Dummy, No Slope ```r # two mean model (two intercepts) lm_sunspot <- lm(cancer_rate ~ sunspot_activity, data = solar) summary(lm_sunspot) ``` ``` Call: lm(formula = cancer_rate ~ sunspot_activity, data = solar) Residuals: Min 1Q Median 3Q Max -1.973 -0.830 -0.130 0.998 1.970 Coefficients: Estimate Std. Error t value Pr(>|t|) *(Intercept) 3.173 0.303 10.47 0.0000000000051 *** *sunspot_activityLow -0.443 0.401 -1.11 0.28 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1 Residual standard error: 1.17 on 33 degrees of freedom Multiple R-squared: 0.0357, Adjusted R-squared: 0.0065 F-statistic: 1.22 on 1 and 33 DF, p-value: 0.277 ``` --- ## Equation Intercept, Dummy, No Slope <br> `\begin{eqnarray*} \mu\{Y|X\} & = & \beta_0 + \beta_1(X) \\ \mu\{\operatorname{cancer\_rate} | \operatorname{sunspot\_activity}\} & = & \beta_{0} + \beta_{1}(\operatorname{sunspot\_activity}_{\operatorname{Low}}) \\ \mu\{\operatorname{cancer\_rate} | \operatorname{sunspot\_activity}\} & = & 3.17 - 0.44(\operatorname{sunspot\_activity}_{\operatorname{Low}}) \end{eqnarray*}` -- - .large[Example: Sunspot Activity High] `\begin{eqnarray*} \mu\{\operatorname{cancer\_rate} | \operatorname{sunspot\_activity}\} & = & 3.17 - 0.44(0) \\ \mu\{\operatorname{cancer\_rate} | \operatorname{sunspot\_activity}\} & = & 3.17 \end{eqnarray*}` -- - .large[Example: Sunspot Activity Low] `\begin{eqnarray*} \mu\{\operatorname{cancer\_rate} | \operatorname{sunspot\_activity}\} & = & 3.17 - 0.44(1) \\ \mu\{\operatorname{cancer\_rate} | \operatorname{sunspot\_activity}\} & = & 2.73 \end{eqnarray*}` --- ## Plot of Intercept, Dummy, No Slope <img src="week07_02_files/figure-html/unnamed-chunk-36-1.png" width="90%" style="display: block; margin: auto;" /> --- class: middle, center # Simple Regression: # Conditional Mean --- ## Conditional Mean, Intercept and Slope ```r # single conditional model (one slope, one intercept) lm_year <- lm(cancer_rate ~ year, data = solar) summary(lm_year) ``` ``` Call: lm(formula = cancer_rate ~ year, data = solar) Residuals: Min 1Q Median 3Q Max -0.61 -0.23 -0.02 0.22 0.73 Coefficients: Estimate Std. Error t value Pr(>|t|) *(Intercept) -212.18476 11.30171 -18.8 <2e-16 *** *year 0.11003 0.00578 19.0 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1 Residual standard error: 0.345 on 33 degrees of freedom Multiple R-squared: 0.917, Adjusted R-squared: 0.914 F-statistic: 362 on 1 and 33 DF, p-value: <2e-16 ``` --- ## Conditional Mean Model, Intercept + Slope <br> `\begin{eqnarray*} \mu\{Y|X\} & = & \beta_0 + \beta_1(X) \\ \mu\{\operatorname{cancer\_rate} | \operatorname{year}\} & = & \beta_{0} + \beta_{1}(\operatorname{year}) \\ \mu\{\operatorname{cancer\_rate} | \operatorname{year}\} & = & -212.18 + 0.11(\operatorname{year}) \end{eqnarray*}` -- - .large[Example: 1960] `\begin{eqnarray*} \mu\{\operatorname{cancer\_rate} | \operatorname{year}\} & = & -212.18 + 0.11(1960) \\ & = & -212.18 + 215.6 \\ & = & 3.42 \\ \end{eqnarray*}` -- - .large[Example: 1961] `\begin{eqnarray*} \mu\{\operatorname{cancer\_rate} | \operatorname{year}\} & = & -212.18 + 0.11(1961) \\ & = & 3.53 \\ \end{eqnarray*}` --- ## Conditional Mean, Intercept and Slope <img src="week07_02_files/figure-html/unnamed-chunk-38-1.png" width="90%" style="display: block; margin: auto;" /> --- class: middle, center # Multiple Regression --- ## Multiple Regression: Intercept, Dummy and Slope ```r # single conditional mean + mean shift (one slope, two intercepts) lm_sunspot_and_year <- lm(cancer_rate ~ sunspot_activity + year, data = solar) summary(lm_sunspot_and_year) ``` ``` Call: lm(formula = cancer_rate ~ sunspot_activity + year, data = solar) Residuals: Min 1Q Median 3Q Max -0.4329 -0.2365 -0.0203 0.1648 0.5396 Coefficients: Estimate Std. Error t value Pr(>|t|) *(Intercept) -211.93143 8.68095 -24.41 < 2e-16 *** *sunspot_activityLow -0.44333 0.09062 -4.89 0.000027 *** *year 0.11003 0.00444 24.78 < 2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1 Residual standard error: 0.265 on 32 degrees of freedom Multiple R-squared: 0.952, Adjusted R-squared: 0.949 F-statistic: 319 on 2 and 32 DF, p-value: <2e-16 ``` --- ## Conditional Mean, Intercept, Dummy and Slope <br> `\begin{eqnarray*} \mu\{Y|X\} & = & \beta_0 + \beta_1(X_1) + \beta_2(X_2) \\ \mu\{\operatorname{cancer\_rate} | \operatorname{sunspot}_{\operatorname{Low}}, \operatorname{year}\} & = & \beta_{0} + \beta_{1}(\operatorname{sunspot}_{\operatorname{Low}}) + \beta_{2}(\operatorname{year}) \\ \mu\{\operatorname{cancer\_rate} | \operatorname{sunspot}_{\operatorname{Low}}, \operatorname{year}\} & = & -211.93 -0.44(\operatorname{sunspot}_{\operatorname{Low}}) \\ & & + 0.11(\operatorname{year}) \end{eqnarray*}` -- - .large[Example: year = 1957, sunspot = Low] `\begin{eqnarray*} \mu\{\operatorname{cancer\_rate} | \operatorname{sunspot}_{\operatorname{Low}}, \operatorname{year}\} & = & -211.93 -0.44(1) + 0.11(1957) \\ & \approx & 2.95 \quad \textrm{(using unrounded coefficients)} \end{eqnarray*}` -- - .large[Example: year = 1960, sunspot = High] `\begin{eqnarray*} \mu\{\operatorname{cancer\_rate} | \operatorname{sunspot}_{\operatorname{Low}}, \operatorname{year}\} & = & -211.93 -0.44(0) + 0.11(1960) \\ & \approx & 3.73 \quad \textrm{(using unrounded coefficients)} \end{eqnarray*}` --- ## Conditional Mean, Intercept, Dummy and Slope <img src="week07_02_files/figure-html/unnamed-chunk-40-1.png" width="90%" style="display: block; margin: auto;" /> --- class: regression14 ## Look at table of results ```r stargazer(lm_mean, lm_sunspot, lm_year, lm_sunspot_and_year, type = "html", header = FALSE, digits = 2, omit.stat = c("f", "ser", "rsq", "adj.rsq"), single.row = TRUE, intercept.bottom = FALSE ) ``` <table style="text-align:center"><tr><td colspan="5" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"></td><td colspan="4"><em>Dependent variable:</em></td></tr> <tr><td></td><td colspan="4" style="border-bottom: 1px solid black"></td></tr> <tr><td style="text-align:left"></td><td colspan="4">cancer_rate</td></tr> <tr><td 
style="text-align:left"></td><td>(1)</td><td>(2)</td><td>(3)</td><td>(4)</td></tr> <tr><td colspan="5" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Constant</td><td>2.92<sup>***</sup> (0.20)</td><td>3.17<sup>***</sup> (0.30)</td><td>-212.20<sup>***</sup> (11.30)</td><td>-211.90<sup>***</sup> (8.68)</td></tr> <tr><td style="text-align:left">sunspot_activityLow</td><td></td><td>-0.44 (0.40)</td><td></td><td>-0.44<sup>***</sup> (0.09)</td></tr> <tr><td style="text-align:left">year</td><td></td><td></td><td>0.11<sup>***</sup> (0.01)</td><td>0.11<sup>***</sup> (0.004)</td></tr> <tr><td colspan="5" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Observations</td><td>35</td><td>35</td><td>35</td><td>35</td></tr> <tr><td colspan="5" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"><em>Note:</em></td><td colspan="4" style="text-align:right"><sup>*</sup>p<0.1; <sup>**</sup>p<0.05; <sup>***</sup>p<0.01</td></tr> </table> --- ## Flocabulary example: Linear equations .center[ <img src="images/flocabulary_screenshot.jpg" width="85%" style="display: block; margin: auto;" /> ] .footnote[https://www.flocabulary.com/unit/linear-equations/] --- class: center, middle # Questions?