class: center, middle, inverse, title-slide # POL90: Statistics ## Missing Data ### Prof. Wasow, PoliticsPomona College ### 2022-04-11 --- <style type="text/css"> .regression10 table { font-size: 10px; } .regression12 table { font-size: 12px; } .regression14 table { font-size: 14px; } </style> # Announcements .large[ - Assignments - PS09 - Report 3 - Missing Data - Handout ] --- # Schedule <table> <thead> <tr> <th style="text-align:right;"> Week </th> <th style="text-align:left;"> Date </th> <th style="text-align:left;"> Day </th> <th style="text-align:left;"> Title </th> <th style="text-align:right;"> Chapter </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 11 </td> <td style="text-align:left;"> Mar 28 </td> <td style="text-align:left;"> Mon </td> <td style="text-align:left;"> Interaction terms </td> <td style="text-align:right;"> 9 </td> </tr> <tr> <td style="text-align:right;"> 11 </td> <td style="text-align:left;"> Mar 30 </td> <td style="text-align:left;"> Wed </td> <td style="text-align:left;"> Interaction terms </td> <td style="text-align:right;"> 9 </td> </tr> <tr> <td style="text-align:right;"> 12 </td> <td style="text-align:left;"> Apr 4 </td> <td style="text-align:left;"> Mon </td> <td style="text-align:left;"> Logistic regression </td> <td style="text-align:right;"> 20 </td> </tr> <tr> <td style="text-align:right;"> 12 </td> <td style="text-align:left;"> Apr 6 </td> <td style="text-align:left;"> Wed </td> <td style="text-align:left;"> Logistic regression </td> <td style="text-align:right;"> 20 </td> </tr> <tr> <td style="text-align:right;color: black !important;background-color: yellow !important;"> 13 </td> <td style="text-align:left;color: black !important;background-color: yellow !important;"> Apr 11 </td> <td style="text-align:left;color: black !important;background-color: yellow !important;"> Mon </td> <td style="text-align:left;color: black !important;background-color: yellow !important;"> Missing data </td> <td style="text-align:right;color: black !important;background-color: yellow !important;"> Handout </td> </tr> <tr> <td style="text-align:right;"> 13 </td> <td style="text-align:left;"> Apr 13 </td> <td style="text-align:left;"> Wed </td> <td style="text-align:left;"> Missing data </td> <td style="text-align:right;"> Handout </td> </tr> <tr> <td style="text-align:right;"> 14 </td> <td style="text-align:left;"> Apr 18 </td> <td style="text-align:left;"> Mon </td> <td style="text-align:left;"> Matching </td> <td style="text-align:right;"> Handout </td> </tr> <tr> <td style="text-align:right;"> 14 </td> <td style="text-align:left;"> Apr 20 </td> <td style="text-align:left;"> Wed </td> <td style="text-align:left;"> Matching </td> <td style="text-align:right;"> Handout </td> </tr> <tr> <td style="text-align:right;"> 15 </td> <td style="text-align:left;"> Apr 25 </td> <td style="text-align:left;"> Mon </td> <td style="text-align:left;"> Causal inference: Panel data </td> <td style="text-align:right;"> Handout </td> </tr> <tr> <td style="text-align:right;"> 15 </td> <td style="text-align:left;"> Apr 27 </td> <td style="text-align:left;"> Wed </td> <td style="text-align:left;"> Causal inference: Natural Experiments </td> <td style="text-align:right;"> Dunning </td> </tr> </tbody> </table> --- ## Assignment schedule <table> <thead> <tr> <th style="text-align:right;"> Week </th> <th style="text-align:left;"> Date </th> <th style="text-align:left;"> Day </th> <th style="text-align:left;"> Assignment </th> <th style="text-align:right;"> Percent </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 11 </td> <td style="text-align:left;"> Apr 1 </td> <td style="text-align:left;"> Fri </td> <td style="text-align:left;"> PS08 </td> <td style="text-align:right;"> 3 </td> </tr> <tr> <td style="text-align:right;"> 12 </td> <td style="text-align:left;"> Apr 11 </td> <td style="text-align:left;"> Mon </td> <td style="text-align:left;"> Report2 </td> <td style="text-align:right;"> 8 </td> </tr> <tr> <td style="text-align:right;color: black !important;background-color: yellow !important;"> 13 </td> <td style="text-align:left;color: black !important;background-color: yellow !important;"> Apr 18 </td> <td style="text-align:left;color: black !important;background-color: yellow !important;"> Mon </td> <td style="text-align:left;color: black !important;background-color: yellow !important;"> PS09 </td> <td style="text-align:right;color: black !important;background-color: yellow !important;"> 3 </td> </tr> <tr> <td style="text-align:right;"> 14 </td> <td style="text-align:left;"> Apr 25 </td> <td style="text-align:left;"> Mon </td> <td style="text-align:left;"> PS10 </td> <td style="text-align:right;"> 3 </td> </tr> <tr> <td style="text-align:right;"> 15 </td> <td style="text-align:left;"> May 2 </td> <td style="text-align:left;"> Mon </td> <td style="text-align:left;"> Report3 </td> <td style="text-align:right;"> 10 </td> </tr> </tbody> </table> --- class: center, middle # Missing Data: Introduction --- ## Why do we care about missing data? .vertical-center[ <img src="images/missingness_NA_drop.png" width="100%" style="display: block; margin: auto;" /> ] --- ## Why do we care about missing data? .vertical-center[ <img src="images/missingness_too_many_NAs.png" width="100%" style="display: block; margin: auto;" /> ] --- ## Why do we care about missing data? .vertical-center[ <img src="images/edstem_filtering_data.png" width="100%" style="display: block; margin: auto;" /> ] --- ## Why do we care about missing data? .vertical-center[ .large[ - Missingness is common to all real world data - What risks are there to "listwise deletion"? ] ] --- .vertical-center[ <img src="images/listwise_deletion_is_evil.png" width="80%" style="display: block; margin: auto;" /> ] .footnote["Listwise deletion is evil: what to do about missing data in political science," http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.199.5654&rep=rep1&type=pdf] --- ## Why do we care about missing data? .large[ - Different types of missingness - Missing at random might not bias results - Missingness that's systematic can lead to selection bias - Example: study of voting behavior biased by phone use - Loss of useful data - "Listwise deletion discards one-third of cases on average, which deletes both the few nonresponses and the many responses in those cases." - King, Honaker, Joseph, Scheve (2001) - Loss of *N*, loss of statistical power <!-- - Missing completely at random --> <!-- - Missingness that depends on unobserved predictors --> <!-- - Missingness that depends on the missing value itself --> <!-- - Hard question with lots of missingness --> <!-- - Good to check robustness pre- & post-imputation --> ] --- class: center, middle # Example: `airquality` --- ## Case `airquality` data - This data set has daily air quality measurements in New York from May to September 1973 over a period of 5 months. - Daily readings of the following air quality values for May 1, 1973 (a Tuesday) to September 30, 1973. - **Ozone**: Mean ozone in parts per billion from 1300 to 1500 hours at Roosevelt Island - **Solar.R**: Solar radiation in Langleys in the frequency band 4000–7700 Angstroms from 0800 to 1200 hours at Central Park - **Wind**: Average wind speed in miles per hour at 0700 and 1000 hours at LaGuardia Airport - **Temp**: Maximum daily temperature in degrees Fahrenheit at La Guardia Airport. .footnote[Source: The data were obtained from the New York State Department of Conservation (ozone data) and the National Weather Service (meteorological data). Chambers, J. M., Cleveland, W. S., Kleiner, B. and Tukey, P. A. (1983) *Graphical Methods for Data Analysis*. Belmont, CA: Wadsworth.] --- ## Load `airquality` data ```r # https://github.com/ropenscilabs/skimr # quick overview of data data(airquality) # load data airquality <- airquality %>% clean_names() dim(airquality) ``` ``` [1] 153 6 ``` ```r # look for missingness head(airquality, 10) ``` ``` ozone solar_r wind temp month day 1 41 190 7.4 67 5 1 2 36 118 8.0 72 5 2 3 12 149 12.6 74 5 3 4 18 313 11.5 62 5 4 *5 NA NA 14.3 56 5 5 *6 28 NA 14.9 66 5 6 7 23 299 8.6 65 5 7 8 19 99 13.8 59 5 8 9 8 19 20.1 61 5 9 *10 NA 194 8.6 69 5 10 ``` --- class: center, middle # Missing Data: # Exploratory Analysis --- ## Many packages now help with missingness ```r # viewing install.packages("skimr") install.packages("naniar") install.packages("dataMaid") # recoding install.packages("tidyr") install.packages("simputation") # advanced: multiple imputation / panel imputation install.packages("Amelia") install.packages("mi") install.packages("mice") install.packages("panelView") ``` --- ## Use `skim_without_charts` for Latex / pdfs .tiny[ ```r # another way to summarize data *skimr::skim_without_charts(airquality) ``` <table style='width: auto;' class='table table-condensed'> <caption>Data summary</caption> <thead> <tr> <th style="text-align:left;"> </th> <th style="text-align:left;"> </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> Name </td> <td style="text-align:left;"> airquality </td> </tr> <tr> <td style="text-align:left;"> Number of rows </td> <td style="text-align:left;"> 153 </td> </tr> <tr> <td style="text-align:left;"> Number of columns </td> <td style="text-align:left;"> 6 </td> </tr> <tr> <td style="text-align:left;"> _______________________ </td> <td style="text-align:left;"> </td> </tr> <tr> <td style="text-align:left;"> Column type frequency: </td> <td style="text-align:left;"> </td> </tr> <tr> <td style="text-align:left;"> numeric </td> <td style="text-align:left;"> 6 </td> </tr> <tr> <td style="text-align:left;"> ________________________ </td> <td style="text-align:left;"> </td> </tr> <tr> <td style="text-align:left;"> Group variables </td> <td style="text-align:left;"> None </td> </tr> </tbody> </table> **Variable type: numeric** <table> <thead> <tr> <th style="text-align:left;"> skim_variable </th> <th style="text-align:right;"> n_missing </th> <th style="text-align:right;"> complete_rate </th> <th style="text-align:right;"> mean </th> <th style="text-align:right;"> sd </th> <th style="text-align:right;"> p0 </th> <th style="text-align:right;"> p25 </th> <th style="text-align:right;"> p50 </th> <th style="text-align:right;"> p75 </th> <th style="text-align:right;"> p100 </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> ozone </td> <td style="text-align:right;"> 37 </td> <td style="text-align:right;"> 0.76 </td> <td style="text-align:right;"> 42.13 </td> <td style="text-align:right;"> 32.99 </td> <td style="text-align:right;"> 1.0 </td> <td style="text-align:right;"> 18.0 </td> <td style="text-align:right;"> 31.5 </td> <td style="text-align:right;"> 63.25 </td> <td style="text-align:right;"> 168.0 </td> </tr> <tr> <td style="text-align:left;"> solar_r </td> <td style="text-align:right;"> 7 </td> <td style="text-align:right;"> 0.95 </td> <td style="text-align:right;"> 185.93 </td> <td style="text-align:right;"> 90.06 </td> <td style="text-align:right;"> 7.0 </td> <td style="text-align:right;"> 115.8 </td> <td style="text-align:right;"> 205.0 </td> <td style="text-align:right;"> 258.75 </td> <td style="text-align:right;"> 334.0 </td> </tr> <tr> <td style="text-align:left;"> wind </td> <td style="text-align:right;"> 0 </td> <td style="text-align:right;"> 1.00 </td> <td style="text-align:right;"> 9.96 </td> <td style="text-align:right;"> 3.52 </td> <td style="text-align:right;"> 1.7 </td> <td style="text-align:right;"> 7.4 </td> <td style="text-align:right;"> 9.7 </td> <td style="text-align:right;"> 11.50 </td> <td style="text-align:right;"> 20.7 </td> </tr> <tr> <td style="text-align:left;"> temp </td> <td style="text-align:right;"> 0 </td> <td style="text-align:right;"> 1.00 </td> <td style="text-align:right;"> 77.88 </td> <td style="text-align:right;"> 9.47 </td> <td style="text-align:right;"> 56.0 </td> <td style="text-align:right;"> 72.0 </td> <td style="text-align:right;"> 79.0 </td> <td style="text-align:right;"> 85.00 </td> <td style="text-align:right;"> 97.0 </td> </tr> <tr> <td style="text-align:left;"> month </td> <td style="text-align:right;"> 0 </td> <td style="text-align:right;"> 1.00 </td> <td style="text-align:right;"> 6.99 </td> <td style="text-align:right;"> 1.42 </td> <td style="text-align:right;"> 5.0 </td> <td style="text-align:right;"> 6.0 </td> <td style="text-align:right;"> 7.0 </td> <td style="text-align:right;"> 8.00 </td> <td style="text-align:right;"> 9.0 </td> </tr> <tr> <td style="text-align:left;"> day </td> <td style="text-align:right;"> 0 </td> <td style="text-align:right;"> 1.00 </td> <td style="text-align:right;"> 15.80 </td> <td style="text-align:right;"> 8.86 </td> <td style="text-align:right;"> 1.0 </td> <td style="text-align:right;"> 8.0 </td> <td style="text-align:right;"> 16.0 </td> <td style="text-align:right;"> 23.00 </td> <td style="text-align:right;"> 31.0 </td> </tr> </tbody> </table> ] --- ## `naniar` also good for visualizing missingness ```r naniar::gg_miss_var(airquality) + theme_bw(base_size = 20) ``` <img src="week12_01_files/figure-html/unnamed-chunk-14-1.png" width="576" style="display: block; margin: auto;" /> --- ## Why `naniar`? Nick Tierney says “NA-in-R” <img src="images/njt-headshot-climb.png" width="90%" style="display: block; margin: auto;" /> .footnote[Photo credit: https://www.njtierney.com/about/] --- ## `naniar` offers multiple visulizations ```r naniar::vis_miss(airquality) ``` <img src="week12_01_files/figure-html/unnamed-chunk-16-1.png" width="504" style="display: block; margin: auto;" /> --- ## Can also easily facet (here by month) ```r # What does facetting by month suggest about missingness? naniar::gg_miss_var(airquality, facet = month) + theme_bw(base_size = 20) ``` <img src="week12_01_files/figure-html/unnamed-chunk-17-1.png" width="70%" style="display: block; margin: auto;" /> --- class: center, middle # Missing Data: Modeling # and Bivariate Relationships --- ## What happens with missingness in regression? .small[ ```r # what happens with missingness in regression? lm(ozone ~ solar_r, data = airquality) %>% summary() ``` ``` Call: lm(formula = ozone ~ solar_r, data = airquality) Residuals: Min 1Q Median 3Q Max -48.29 -21.36 -8.86 16.37 119.14 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 18.5987 6.7479 2.76 0.00686 ** solar_r 0.1272 0.0328 3.88 0.00018 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 31.3 on 109 degrees of freedom * (42 observations deleted due to missingness) Multiple R-squared: 0.121, Adjusted R-squared: 0.113 F-statistic: 15.1 on 1 and 109 DF, p-value: 0.000179 ``` ] --- ## What about scatter plots? (Note warning) ```r # https://github.com/njtierney/naniar airquality %>% ggplot() + aes(x = solar_r, y = ozone) + geom_point() ``` ``` Warning: Removed 42 rows containing missing values (geom_point). ``` <img src="week12_01_files/figure-html/visualizing_missingness-1.png" width="576" style="display: block; margin: auto;" /> --- ## Alternate approaches with `naniar` .left-code[ ```r # geom_miss_point shifts # missing values to # 10% below minimum airquality %>% ggplot() + aes(x = solar_r, y = ozone) + * geom_miss_point() ``` ] .right-plot[ <img src="week12_01_files/figure-html/visualizing_missingness2_plot-1.png" width="100%" style="display: block; margin: auto;" /> ] --- ## How can `naniar` plot missing values? .left-code[ ```r # geom_miss_point shifts # missing values to # 10% below minimum airquality %>% ggplot() + aes(x = solar_r, y = ozone) + * geom_miss_point() ``` ] .right-plot[ <img src="week12_01_files/figure-html/unnamed-chunk-20-1.png" width="100%" style="display: block; margin: auto;" /> ] --- ## How can `naniar` plot missing values? .left-code[ ```r # geom_miss_point shifts # missing values to # 10% below minimum airquality %>% ggplot() + aes(x = solar_r, y = ozone) + * geom_miss_point() ``` ] .right-plot[ <img src="week12_01_files/figure-html/unnamed-chunk-22-1.png" width="100%" style="display: block; margin: auto;" /> ] --- class: center, middle # Missing Data: Imputation --- ## What can we do about missingness? ```r # create new data frame air2 <- airquality %>% mutate( ozone_orig = ozone, * ozone_NA = ifelse(is.na(ozone), "NA", "!NA"), * ozone_impute = ifelse(is.na(ozone), mean(ozone, na.rm = TRUE), ozone) ) # view mean mean(air2$ozone, na.rm = TRUE) ``` ``` [1] 42.13 ``` ```r # view imputation air2 %>% select(ozone_orig, ozone_NA, ozone_impute) %>% head() ``` ``` ozone_orig ozone_NA ozone_impute 1 41 !NA 41.00 2 36 !NA 36.00 3 12 !NA 12.00 4 18 !NA 18.00 *5 NA NA 42.13 6 28 !NA 28.00 ``` --- ## Pros & cons of imputing the mean? .left-code[ ```r air2 %>% ggplot() + aes(x = solar_r, y = ozone_impute, color = ozone_NA) + geom_point() ``` ] .right-plot[ <img src="week12_01_files/figure-html/unnamed-chunk-24-1.png" width="100%" style="display: block; margin: auto;" /> ] --- ## How about a conditional mean? ```r # could we just use regression? lm1 <- lm(ozone ~ solar_r, data = air2) summary(lm1) ``` ``` Call: lm(formula = ozone ~ solar_r, data = air2) Residuals: Min 1Q Median 3Q Max -48.29 -21.36 -8.86 16.37 119.14 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 18.5987 6.7479 2.76 0.00686 ** *solar_r 0.1272 0.0328 3.88 0.00018 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 31.3 on 109 degrees of freedom (42 observations deleted due to missingness) Multiple R-squared: 0.121, Adjusted R-squared: 0.113 F-statistic: 15.1 on 1 and 109 DF, p-value: 0.000179 ``` --- ## Model allows us to impute some values for `ozone` .vertical-center[ ```r equatiomatic::extract_eq(lm1, use_coefs = FALSE) ``` $$ \operatorname{ozone} = \alpha + \beta_{1}(\operatorname{solar\_r}) + \epsilon $$ ```r equatiomatic::extract_eq(lm1, use_coefs = TRUE) ``` $$ \operatorname{\widehat{ozone}} = 18.6 + 0.13(\operatorname{solar\_r}) $$ ] --- ## We can use a model to predict values of `ozone` ```r air2 <- air2 %>% mutate( ozone_bivariate = predict(lm1, data.frame(solar_r = air2$solar_r)), ozone_impute = ifelse(is.na(ozone), ozone_bivariate, ozone) ) air2 %>% select(solar_r, ozone, ozone_bivariate, ozone_impute) %>% head(10) ``` ``` solar_r ozone ozone_bivariate ozone_impute 1 190 41 42.76 41.00 2 118 36 33.60 36.00 3 149 12 37.55 12.00 4 313 18 58.40 18.00 *5 NA NA NA NA *6 NA 28 NA 28.00 7 299 23 56.62 23.00 8 99 19 31.19 19.00 9 19 8 21.01 8.00 *10 194 NA 43.27 43.27 ``` --- ## Visualizing `ozone` ~ `solar_r` .left-code[ ```r # could we use regression? air2 %>% ggplot() + aes(x = solar_r, y = ozone_impute, color = ozone_NA) + geom_smooth(method = "lm", se = FALSE, col = "gray80") + geom_point() ``` ] .right-plot[ <img src="week12_01_files/figure-html/unnamed-chunk-29-1.png" width="100%" style="display: block; margin: auto;" /> ] --- ## Bivariate linear model not great, why? .left-code[ ```r # could we use regression? air2 %>% ggplot() + aes(x = solar_r, y = ozone_impute, color = ozone_NA) + geom_smooth(method = "lm", se = FALSE, col = "gray80") + geom_point() ``` ] .right-plot[ <img src="week12_01_files/figure-html/unnamed-chunk-31-1.png" width="100%" style="display: block; margin: auto;" /> ] --- ## Bivariate linear model not great, why? .left-code[ ```r # could we use regression? air2 %>% ggplot() + aes(x = solar_r, y = ozone_impute, color = ozone_NA) + geom_smooth(method = "lm", se = FALSE, col = "gray80") + geom_point() ``` ] .right-plot[ <img src="week12_01_files/figure-html/unnamed-chunk-32-1.png" width="100%" style="display: block; margin: auto;" /> ] --- class: center, middle # Missing Data: Imputation # with Multiple Regression --- ## How else can we model `ozone`? .left-table[ ```r lm2 <- lm(ozone ~ solar_r + temp, data = airquality) stargazer(lm2, header = FALSE, omit.stat = c("f", "ser", "adj.rsq"), single.row = TRUE, digits = 2, type = 'html') ``` ] .right-table[ <table style="text-align:center"><tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"></td><td><em>Dependent variable:</em></td></tr> <tr><td></td><td colspan="1" style="border-bottom: 1px solid black"></td></tr> <tr><td style="text-align:left"></td><td>ozone</td></tr> <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">solar_r</td><td>0.06<sup>**</sup> (0.03)</td></tr> <tr><td style="text-align:left">temp</td><td>2.28<sup>***</sup> (0.25)</td></tr> <tr><td style="text-align:left">Constant</td><td>-145.70<sup>***</sup> (18.45)</td></tr> <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Observations</td><td>111</td></tr> <tr><td style="text-align:left">R<sup>2</sup></td><td>0.51</td></tr> <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"><em>Note:</em></td><td style="text-align:right"><sup>*</sup>p<0.1; <sup>**</sup>p<0.05; <sup>***</sup>p<0.01</td></tr> </table> ] --- ## Multiple regression model for `ozone` <br><br><br> ```r equatiomatic::extract_eq(lm2, use_coefs = FALSE) ``` `$$\text{ozone} = \alpha + \beta_{1}(\text{solar_r}) + \beta_{2}(\text{temp}) + \epsilon$$` ```r equatiomatic::extract_eq(lm2, use_coefs = TRUE) ``` $$ \operatorname{\widehat{ozone}} = -145.7 + 0.06(\operatorname{solar\_r}) + 2.28(\operatorname{temp}) $$ --- ## Try `simputation` with multiple regression ```r # https://cran.r-project.org/web/packages/simputation/vignettes/intro.html # add temp to model air_imp <- air2 %>% * simputation::impute_lm(ozone ~ solar_r + temp) # output is a data frame dim(air_imp) ``` ``` [1] 153 13 ``` --- ## Now `ozone` ~ `solar_r` + `temp` ```r air_imp %>% * filter(ozone_NA == "NA") %>% select(solar_r, temp, ozone_orig, ozone) %>% head(15) ``` ``` solar_r temp ozone_orig ozone 1 NA 56 NA NA 2 194 69 NA 22.590 3 66 57 NA -12.061 4 266 58 NA 1.639 5 NA 57 NA NA 6 286 78 NA 48.351 7 287 74 NA 39.294 8 242 67 NA 20.775 9 186 84 NA 56.310 10 220 85 NA 60.531 11 264 79 NA 49.373 12 273 87 NA 68.114 13 259 93 NA 80.986 14 250 92 NA 78.193 15 332 80 NA 55.535 ``` --- ## Try `simputation` with multiple regression ```r air_imp %>% ggplot() + aes(x = solar_r, y = ozone, colour = ozone_NA) + geom_miss_point() ``` <img src="week12_01_files/figure-html/intro_simputation3-1.png" width="65%" style="display: block; margin: auto;" /> --- ## Could add additional terms like `wind` ```r # impute data air_imp <- air2 %>% * simputation::impute_lm(ozone ~ solar_r + temp + wind) dim(air_imp) ``` ``` [1] 153 13 ``` ```r air_imp %>% select(solar_r, temp, wind, ozone, ozone_bivariate) %>% head(10) ``` ``` solar_r temp wind ozone ozone_bivariate 1 190 67 7.4 41.00 42.76 2 118 72 8.0 36.00 33.60 3 149 74 12.6 12.00 37.55 4 313 62 11.5 18.00 58.40 5 NA 56 14.3 NA NA 6 NA 66 14.9 28.00 NA 7 299 65 8.6 23.00 56.62 8 99 59 13.8 19.00 31.19 9 19 61 20.1 8.00 21.01 10 194 69 8.6 32.59 43.27 ``` --- ## Scatterplot with imputed ozone ```r air_imp %>% ggplot() + aes(x = solar_r, y = ozone, colour = ozone_NA) + geom_miss_point() ``` <img src="week12_01_files/figure-html/intro_simputation5-1.png" width="65%" style="display: block; margin: auto;" /> --- ## Some issues with multiple regression ```r air_imp %>% ggplot() + aes(x = solar_r, y = ozone, colour = ozone_NA) + geom_miss_point() ``` <img src="week12_01_files/figure-html/multiple_imputation_problems1-1.png" width="65%" style="display: block; margin: auto;" /> --- ## Some issues with multiple regression ```r air_imp %>% ggplot() + aes(x = solar_r, y = ozone, colour = ozone_NA) + geom_miss_point() ``` <img src="week12_01_files/figure-html/multiple_imputation_problems2-1.png" width="65%" style="display: block; margin: auto;" /> --- ## Some issues with multiple regression ```r air_imp %>% ggplot() + aes(x = solar_r, y = ozone, colour = ozone_NA) + geom_miss_point() ``` <img src="week12_01_files/figure-html/multiple_imputation_problems3-1.png" width="65%" style="display: block; margin: auto;" /> --- ## Also, some missingness still unimputed ```r air_imp %>% ggplot() + aes(x = solar_r, y = ozone, colour = ozone_NA) + geom_miss_point() ``` <img src="week12_01_files/figure-html/unnamed-chunk-45-1.png" width="65%" style="display: block; margin: auto;" /> --- ## Many methods available with `simputation` - Model based (optionally add [non-]parametric random residual) - robust linear regression - ridge/elasticnet/lasso regression - CART models (decision trees) - Random forest - Multivariate imputation - Imputation based on the expectation-maximization algorithm - missForest (=iterative random forest imputation) - Donor imputation (including various donor pool specifications) - k-nearest neigbour (based on gower's distance) - sequential hotdeck (LOCF, NOCB) - random hotdeck - Predictive mean matching - Other - (groupwise) median imputation (optional random residual) - Proxy imputation: copy another variable or use a simple transformation to compute imputed values. .footnote[Source: https://cran.r-project.org/web/packages/simputation/vignettes/intro.html] --- class: center, middle # Missing Data: # Multiple Imputation --- ## Multiple imputation outperforms listwise deletion <img src="images/apsr_king_honaker_joseph_scheve.png" width="90%" style="display: block; margin: auto;" /> .footnote[King, Honaker, Joseph, Scheve (2001), "Analyzing Incomplete Political Science Data: An Alternative Algorithm for Multiple Imputation," *APSR*, https://doi.org/10.1017/S0003055401000235] --- ## Multiple Imputation via `amelia` <img src="images/amelia_multiple_imputation_diagram.png" width="75%" style="display: block; margin: auto;" /> .footnote[https://cran.r-project.org/web/packages/Amelia/vignettes/amelia.pdf] --- ## `amelia` accepts a matrix of bounds ```r names(airquality) ``` ``` [1] "ozone" "solar_r" "wind" "temp" "month" "day" ``` ```r # what are reasonable bounds for our imputations summary(airquality$ozone) ``` ``` Min. 1st Qu. Median Mean 3rd Qu. Max. NA's 1.0 18.0 31.5 42.1 63.2 168.0 37 ``` ```r summary(airquality$solar_r) ``` ``` Min. 1st Qu. Median Mean 3rd Qu. Max. NA's 7 116 205 186 259 334 7 ``` --- ## `amelia` accepts a matrix of bounds ```r bounds_matrix <- matrix( ncol = 3, byrow = TRUE, data = c(1, 1, 168, # bound col 1 (ozone), 1 to 168 2, 7, 334) # bound col 2 (solar_r), 7 to 334 ) bounds_matrix ``` ``` [,1] [,2] [,3] [1,] 1 1 168 [2,] 2 7 334 ``` --- ## `amelia` function takes a handful of arguments ```r ## see https://gking.harvard.edu/amelia for more air2 <- airquality %>% mutate(ozone_NA = ifelse(is.na(ozone), "NA", "!NA")) ignore_vars <- c("ozone_NA") air_amelia_out <- amelia( x = air2, # data set m = 5, # # of imputations, usually 5, in pol346 1 ok bounds = bounds_matrix, # tell amelia to bound some vars idvars = ignore_vars # vars to leave out of imputation ) ``` ``` -- Imputation 1 -- 1 2 3 4 5 6 7 8 -- Imputation 2 -- 1 2 3 4 5 -- Imputation 3 -- 1 2 3 4 5 6 7 -- Imputation 4 -- 1 2 3 4 5 6 7 -- Imputation 5 -- 1 2 3 4 5 6 ``` --- ## Selecting one of the multiply imputed data sets ```r ## amelia uses multiple imputation and creates 5 imputations ## code below access imputation 1 air_imp1 <- air_amelia_out$imputations$imp1 dim(air_imp1) ``` ``` [1] 153 7 ``` ```r head(air_imp1, 10) ``` ``` ozone solar_r wind temp month day ozone_NA 1 41.000 190.0 7.4 67 5 1 !NA 2 36.000 118.0 8.0 72 5 2 !NA 3 12.000 149.0 12.6 74 5 3 !NA 4 18.000 313.0 11.5 62 5 4 !NA 5 2.214 135.9 14.3 56 5 5 NA 6 28.000 226.2 14.9 66 5 6 !NA 7 23.000 299.0 8.6 65 5 7 !NA 8 19.000 99.0 13.8 59 5 8 !NA 9 8.000 19.0 20.1 61 5 9 !NA 10 67.530 194.0 8.6 69 5 10 NA ``` --- ## Look at imputed `airquality` missingness ```r ## view if any missingness remains naniar::gg_miss_var(air_imp1) ``` <img src="week12_01_files/figure-html/unnamed-chunk-48-1.png" width="432" style="display: block; margin: auto;" /> --- ## Look at imputed `airquality` missingness ```r naniar::vis_miss(air_imp1) ``` <img src="week12_01_files/figure-html/unnamed-chunk-49-1.png" width="432" style="display: block; margin: auto;" /> --- ## Scatter plot of one imputation from `amelia` ```r air_imp1 %>% ggplot() + aes(x = solar_r, y = ozone, color = ozone_NA) + geom_point() ``` <img src="week12_01_files/figure-html/amelia_imputation1-1.png" width="65%" style="display: block; margin: auto;" /> --- ## Can model with `lm` as usual ```r air_out <- lm(ozone ~ solar_r + temp + wind, data = air_imp1) summary(air_out) ``` ``` Call: lm(formula = ozone ~ solar_r + temp + wind, data = air_imp1) Residuals: Min 1Q Median 3Q Max -50.40 -13.68 -2.89 11.52 94.73 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -50.0950 18.7005 -2.68 0.0082 ** solar_r 0.0597 0.0198 3.01 0.0030 ** temp 1.4864 0.2080 7.15 3.7e-11 *** wind -3.3028 0.5373 -6.15 6.9e-09 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 *Residual standard error: 20.7 on 149 degrees of freedom Multiple R-squared: 0.572, Adjusted R-squared: 0.563 F-statistic: 66.3 on 3 and 149 DF, p-value: <2e-16 ``` --- ## Can `plot_model` as usual ```r plot_model(air_out, type = "pred", terms = "solar_r", show.data = TRUE) ``` <img src="week12_01_files/figure-html/amelia_imputation_plot_model-1.png" width="576" style="display: block; margin: auto;" /> --- class: center, middle # Questions? --- class: center, middle # Appendix --- ## Packages for missing data? - Motivating example: Airquality data - Packages: - `skimr` - "a frictionless approach to summary statistics" - `naniar` - "principled, tidy ways to summarise, visualise, and manipulate missing data with minimal deviations from the workflows in `ggplot2` and `tidy data" --- ## Packages for missing data? - `simputation` - offers a number of commonly used single imputation methods, each with a similar and hopefully simple interface" - `Amelia` - "multiply imputes' missing data in a single cross-section (such as a survey), from a time series (like variables collected for each year in a country), or from a time-series-cross-sectional data set (such as collected by years for each of several countries)" --- ## Packages for missing data? - `mi` - "provides functions for data manipulation, imputing missing values in an approximate Bayesian framework, diagnostics of the models used to generate the imputations, confidence-building mechanisms to validate some of the assumptions of the imputation algorithm, and functions to analyze multiply imputed data sets with the appropriate degree of sampling uncertainty." - `mice` - Multiple imputation using Fully Conditional Specification (FCS) implemented by the MICE algorithm as described in Van Buuren and Groothuis-Oudshoorn (2011). --- ## Packages for missing *panel* data? - `panelView` - "(1) it visualizes the treatment and missing-value statuses of each observation in a panel/time-series-cross-sectional (TSCS) dataset; and (2) it plots the outcome variable (either continuous or discrete) in a time-series fashion" - `Amelia` - "multiply imputes' missing data in a single cross-section (such as a survey), from a time series (like variables collected for each year in a country), or from a time-series-cross-sectional data set (such as collected by years for each of several countries)" --- ## Tutorials - Thomas Leeper: - "This tutorial covers techniques of multiple imputation" - https://thomasleeper.com/Rcourse/Tutorials/mi.html - Andrew Heiss: - "Meld regression output from multiple imputations with tidyverse" - https://www.andrewheiss.com/blog/2018/03/07/amelia-tidy-melding/ - UCLA: Multiple Imputation - https://stats.idre.ucla.edu/r/faq/how-do-i-perform-multiple-imputation-using-predictive-mean-matching-in-r/ --- ## `skim` (good interactively / html, not Latex / pdfs) .tiny[ ```r # another way to summarize data *skimr::skim(airquality) ``` <table style='width: auto;' class='table table-condensed'> <caption>Data summary</caption> <thead> <tr> <th style="text-align:left;"> </th> <th style="text-align:left;"> </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> Name </td> <td style="text-align:left;"> airquality </td> </tr> <tr> <td style="text-align:left;"> Number of rows </td> <td style="text-align:left;"> 153 </td> </tr> <tr> <td style="text-align:left;"> Number of columns </td> <td style="text-align:left;"> 6 </td> </tr> <tr> <td style="text-align:left;"> _______________________ </td> <td style="text-align:left;"> </td> </tr> <tr> <td style="text-align:left;"> Column type frequency: </td> <td style="text-align:left;"> </td> </tr> <tr> <td style="text-align:left;"> numeric </td> <td style="text-align:left;"> 6 </td> </tr> <tr> <td style="text-align:left;"> ________________________ </td> <td style="text-align:left;"> </td> </tr> <tr> <td style="text-align:left;"> Group variables </td> <td style="text-align:left;"> None </td> </tr> </tbody> </table> **Variable type: numeric** <table> <thead> <tr> <th style="text-align:left;"> skim_variable </th> <th style="text-align:right;"> n_missing </th> <th style="text-align:right;"> complete_rate </th> <th style="text-align:right;"> mean </th> <th style="text-align:right;"> sd </th> <th style="text-align:right;"> p0 </th> <th style="text-align:right;"> p25 </th> <th style="text-align:right;"> p50 </th> <th style="text-align:right;"> p75 </th> <th style="text-align:right;"> p100 </th> <th style="text-align:left;"> hist </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> ozone </td> <td style="text-align:right;"> 37 </td> <td style="text-align:right;"> 0.76 </td> <td style="text-align:right;"> 42.13 </td> <td style="text-align:right;"> 32.99 </td> <td style="text-align:right;"> 1.0 </td> <td style="text-align:right;"> 18.0 </td> <td style="text-align:right;"> 31.5 </td> <td style="text-align:right;"> 63.25 </td> <td style="text-align:right;"> 168.0 </td> <td style="text-align:left;"> ▇▃▂▁▁ </td> </tr> <tr> <td style="text-align:left;"> solar_r </td> <td style="text-align:right;"> 7 </td> <td style="text-align:right;"> 0.95 </td> <td style="text-align:right;"> 185.93 </td> <td style="text-align:right;"> 90.06 </td> <td style="text-align:right;"> 7.0 </td> <td style="text-align:right;"> 115.8 </td> <td style="text-align:right;"> 205.0 </td> <td style="text-align:right;"> 258.75 </td> <td style="text-align:right;"> 334.0 </td> <td style="text-align:left;"> ▅▃▅▇▅ </td> </tr> <tr> <td style="text-align:left;"> wind </td> <td style="text-align:right;"> 0 </td> <td style="text-align:right;"> 1.00 </td> <td style="text-align:right;"> 9.96 </td> <td style="text-align:right;"> 3.52 </td> <td style="text-align:right;"> 1.7 </td> <td style="text-align:right;"> 7.4 </td> <td style="text-align:right;"> 9.7 </td> <td style="text-align:right;"> 11.50 </td> <td style="text-align:right;"> 20.7 </td> <td style="text-align:left;"> ▂▇▇▃▁ </td> </tr> <tr> <td style="text-align:left;"> temp </td> <td style="text-align:right;"> 0 </td> <td style="text-align:right;"> 1.00 </td> <td style="text-align:right;"> 77.88 </td> <td style="text-align:right;"> 9.47 </td> <td style="text-align:right;"> 56.0 </td> <td style="text-align:right;"> 72.0 </td> <td style="text-align:right;"> 79.0 </td> <td style="text-align:right;"> 85.00 </td> <td style="text-align:right;"> 97.0 </td> <td style="text-align:left;"> ▂▃▇▇▃ </td> </tr> <tr> <td style="text-align:left;"> month </td> <td style="text-align:right;"> 0 </td> <td style="text-align:right;"> 1.00 </td> <td style="text-align:right;"> 6.99 </td> <td style="text-align:right;"> 1.42 </td> <td style="text-align:right;"> 5.0 </td> <td style="text-align:right;"> 6.0 </td> <td style="text-align:right;"> 7.0 </td> <td style="text-align:right;"> 8.00 </td> <td style="text-align:right;"> 9.0 </td> <td style="text-align:left;"> ▇▇▇▇▇ </td> </tr> <tr> <td style="text-align:left;"> day </td> <td style="text-align:right;"> 0 </td> <td style="text-align:right;"> 1.00 </td> <td style="text-align:right;"> 15.80 </td> <td style="text-align:right;"> 8.86 </td> <td style="text-align:right;"> 1.0 </td> <td style="text-align:right;"> 8.0 </td> <td style="text-align:right;"> 16.0 </td> <td style="text-align:right;"> 23.00 </td> <td style="text-align:right;"> 31.0 </td> <td style="text-align:left;"> ▇▇▇▇▆ </td> </tr> </tbody> </table> ] --- ## With Latex / pdfs use `skim_without_charts` .tiny[ ```r # another way to summarize data *skimr::skim_without_charts(airquality) ``` <table style='width: auto;' class='table table-condensed'> <caption>Data summary</caption> <thead> <tr> <th style="text-align:left;"> </th> <th style="text-align:left;"> </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> Name </td> <td style="text-align:left;"> airquality </td> </tr> <tr> <td style="text-align:left;"> Number of rows </td> <td style="text-align:left;"> 153 </td> </tr> <tr> <td style="text-align:left;"> Number of columns </td> <td style="text-align:left;"> 6 </td> </tr> <tr> <td style="text-align:left;"> _______________________ </td> <td style="text-align:left;"> </td> </tr> <tr> <td style="text-align:left;"> Column type frequency: </td> <td style="text-align:left;"> </td> </tr> <tr> <td style="text-align:left;"> numeric </td> <td style="text-align:left;"> 6 </td> </tr> <tr> <td style="text-align:left;"> ________________________ </td> <td style="text-align:left;"> </td> </tr> <tr> <td style="text-align:left;"> Group variables </td> <td style="text-align:left;"> None </td> </tr> </tbody> </table> **Variable type: numeric** <table> <thead> <tr> <th style="text-align:left;"> skim_variable </th> <th style="text-align:right;"> n_missing </th> <th style="text-align:right;"> complete_rate </th> <th style="text-align:right;"> mean </th> <th style="text-align:right;"> sd </th> <th style="text-align:right;"> p0 </th> <th style="text-align:right;"> p25 </th> <th style="text-align:right;"> p50 </th> <th style="text-align:right;"> p75 </th> <th style="text-align:right;"> p100 </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> ozone </td> <td style="text-align:right;"> 37 </td> <td style="text-align:right;"> 0.76 </td> <td style="text-align:right;"> 42.13 </td> <td style="text-align:right;"> 32.99 </td> <td style="text-align:right;"> 1.0 </td> <td style="text-align:right;"> 18.0 </td> <td style="text-align:right;"> 31.5 </td> <td style="text-align:right;"> 63.25 </td> <td style="text-align:right;"> 168.0 </td> </tr> <tr> <td style="text-align:left;"> solar_r </td> <td style="text-align:right;"> 7 </td> <td style="text-align:right;"> 0.95 </td> <td style="text-align:right;"> 185.93 </td> <td style="text-align:right;"> 90.06 </td> <td style="text-align:right;"> 7.0 </td> <td style="text-align:right;"> 115.8 </td> <td style="text-align:right;"> 205.0 </td> <td style="text-align:right;"> 258.75 </td> <td style="text-align:right;"> 334.0 </td> </tr> <tr> <td style="text-align:left;"> wind </td> <td style="text-align:right;"> 0 </td> <td style="text-align:right;"> 1.00 </td> <td style="text-align:right;"> 9.96 </td> <td style="text-align:right;"> 3.52 </td> <td style="text-align:right;"> 1.7 </td> <td style="text-align:right;"> 7.4 </td> <td style="text-align:right;"> 9.7 </td> <td style="text-align:right;"> 11.50 </td> <td style="text-align:right;"> 20.7 </td> </tr> <tr> <td style="text-align:left;"> temp </td> <td style="text-align:right;"> 0 </td> <td style="text-align:right;"> 1.00 </td> <td style="text-align:right;"> 77.88 </td> <td style="text-align:right;"> 9.47 </td> <td style="text-align:right;"> 56.0 </td> <td style="text-align:right;"> 72.0 </td> <td style="text-align:right;"> 79.0 </td> <td style="text-align:right;"> 85.00 </td> <td style="text-align:right;"> 97.0 </td> </tr> <tr> <td style="text-align:left;"> month </td> <td style="text-align:right;"> 0 </td> <td style="text-align:right;"> 1.00 </td> <td style="text-align:right;"> 6.99 </td> <td style="text-align:right;"> 1.42 </td> <td style="text-align:right;"> 5.0 </td> <td style="text-align:right;"> 6.0 </td> <td style="text-align:right;"> 7.0 </td> <td style="text-align:right;"> 8.00 </td> <td style="text-align:right;"> 9.0 </td> </tr> <tr> <td style="text-align:left;"> day </td> <td style="text-align:right;"> 0 </td> <td style="text-align:right;"> 1.00 </td> <td style="text-align:right;"> 15.80 </td> <td style="text-align:right;"> 8.86 </td> <td style="text-align:right;"> 1.0 </td> <td style="text-align:right;"> 8.0 </td> <td style="text-align:right;"> 16.0 </td> <td style="text-align:right;"> 23.00 </td> <td style="text-align:right;"> 31.0 </td> </tr> </tbody> </table> ]