Exploring numerical data

MP223 - Applied Econometrics Methods for the Social Sciences

Eduard Bukin

R setup

library(tidyverse)
library(alr4)
library(skimr)

# set default theme and larger font size for ggplot2
ggplot2::theme_set(ggplot2::theme_minimal(base_size = 16))

# set default figure parameters for knitr
knitr::opts_chunk$set(
  fig.width = 8,
  fig.asp = 0.618,
  fig.retina = 3,
  dpi = 300,
  out.width = "80%"
)

The data in use

UN11 data set on National statistics from the United Nations mostly from 2009-2011 from (Weisberg 2005).

library(alr4)
un_dta <- UN11 %>% as_tibble()
glimpse(un_dta)
Rows: 199
Columns: 6
$ region    <fct> Asia, Europe, Africa, Africa, Caribbean, Latin Amer, Asia, C…
$ group     <fct> other, other, africa, africa, other, other, other, other, oe…
$ fertility <dbl> 5.968, 1.525, 2.142, 5.135, 2.000, 2.172, 1.735, 1.671, 1.94…
$ ppgdp     <dbl> 499.0, 3677.2, 4473.0, 4321.9, 13750.1, 9162.1, 3030.7, 2285…
$ lifeExpF  <dbl> 49.49, 80.40, 75.00, 53.17, 81.10, 79.89, 77.33, 77.75, 84.2…
$ pctUrban  <dbl> 23, 53, 67, 59, 100, 93, 64, 47, 89, 68, 52, 84, 89, 29, 45,…

Summary statistics

Summary of a single variables

un_dta %>% pull(fertility) %>% mean()
[1] 2.761383
mean(un_dta$fertility)
[1] 2.761383
un_dta %>% pull(fertility) %>% sd()
[1] 1.339589
sd(un_dta$fertility)
[1] 1.339589

Using dplyr::summarise() (1/3)

Using dplyr::summarise() (2/3)

un_dta %>%
  summarise(
    mean_fert = mean(fertility)
  ) 
# A tibble: 1 × 1
  mean_fert
      <dbl>
1      2.76

Using dplyr::summarise() (2/3)

un_dta %>%
  summarise(
    mean_fert = mean(fertility),
    sd_ppgdp = sd(ppgdp)
  ) 
# A tibble: 1 × 2
  mean_fert sd_ppgdp
      <dbl>    <dbl>
1      2.76   18412.

Using dplyr::summarise() (2/3)

un_dta %>%
  summarise(
    mean_fert = mean(fertility),
    sd_ppgdp = sd(ppgdp),
    med_lifeExpF = median(lifeExpF)
  ) 
# A tibble: 1 × 3
  mean_fert sd_ppgdp med_lifeExpF
      <dbl>    <dbl>        <dbl>
1      2.76   18412.         75.9

Using dplyr::summarise() (3/3)

un_dta %>%
  summarise(across(
    c(fertility),
    list(
      means = mean
    )
  )) %>% 
  t()
                    [,1]
fertility_means 2.761383

Using dplyr::summarise() (3/3)

un_dta %>%
  summarise(across(
    c(fertility, ppgdp),
    list(
      means = mean
    )
  )) %>% 
  t()
                        [,1]
fertility_means     2.761383
ppgdp_means     13011.951759

Using dplyr::summarise() (3/3)

un_dta %>%
  summarise(across(
    c(fertility, ppgdp),
    list(
      means = mean,
      medians = ~ median(., na.rm = TRUE)
    )
  )) %>% 
  t()
                          [,1]
fertility_means       2.761383
fertility_medians     2.262000
ppgdp_means       13011.951759
ppgdp_medians      4684.500000

Using dplyr::summarise() (3/3)

un_dta %>%
  summarise(across(
    c(fertility, ppgdp),
    list(
      means = mean,
      medians = ~ median(., na.rm = TRUE),
      sd = ~ sd(., na.rm = TRUE),
      n_nonmis = ~ sum(!is.na(.))
    )
  )) %>% 
  t()
                           [,1]
fertility_means        2.761383
fertility_medians      2.262000
fertility_sd           1.339589
fertility_n_nonmis   199.000000
ppgdp_means        13011.951759
ppgdp_medians       4684.500000
ppgdp_sd           18412.443368
ppgdp_n_nonmis       199.000000

Using dplyr::group_by() + dplyr::summarise() (1/3)

Using dplyr::group_by() + dplyr::summarise() (2/3)

un_dta %>% 
  # group_by(region) %>% 
  summarise(mean_fert = mean(fertility),
            sd_ppgdp = sd(ppgdp),
            med_lifeExpF = median(lifeExpF))
# A tibble: 1 × 3
  mean_fert sd_ppgdp med_lifeExpF
      <dbl>    <dbl>        <dbl>
1      2.76   18412.         75.9

Using dplyr::group_by() + dplyr::summarise() (2/3)

un_dta %>% 
  group_by(region) %>% 
  summarise(mean_fert = mean(fertility),
            sd_ppgdp = sd(ppgdp),
            med_lifeExpF = median(lifeExpF))
# A tibble: 8 × 4
  region        mean_fert sd_ppgdp med_lifeExpF
  <fct>             <dbl>    <dbl>        <dbl>
1 Africa             4.24    3614.         58.6
2 Asia               2.43   16742.         75.2
3 Caribbean          2.01   23063.         78.2
4 Europe             1.59   23820.         81.4
5 Latin Amer         2.43    3775.         77.6
6 North America      1.88     131.         82.4
7 NorthAtlantic      2.22      NA          71.6
8 Oceania            3.10   15956.         72.3

Using dplyr::group_by() + dplyr::summarise() (3/3)

un_dta %>%
  group_by(group) %>%
  summarise(across(
    c(fertility, ppgdp),
    list(
      means = mean,
      medians = ~ median(., na.rm = TRUE),
      sd = ~ sd(., na.rm = TRUE),
      n_nonmis = ~ sum(!is.na(.))
    )
  ))
# A tibble: 3 × 9
  group  fertility_means fertility_medians fertility_sd fertility_n_nonmis
  <fct>            <dbl>             <dbl>        <dbl>              <int>
1 oecd              1.77              1.79        0.340                 31
2 other             2.35              2.17        0.927                115
3 africa            4.24              4.42        1.30                  53
# … with 4 more variables: ppgdp_means <dbl>, ppgdp_medians <dbl>,
#   ppgdp_sd <dbl>, ppgdp_n_nonmis <int>

skimr package

Package for Summary statistics

skim() of the UN data

library(skimr)
un_dta %>% skim()
── Data Summary ────────────────────────
                           Values    
Name                       Piped data
Number of rows             199       
Number of columns          6         
_______________________              
Column type frequency:               
  factor                   2         
  numeric                  4         
________________________             
Group variables            None      

── Variable type: factor ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
  skim_variable n_missing complete_rate ordered n_unique top_counts                        
1 region                0             1 FALSE          8 Afr: 53, Asi: 50, Eur: 39, Lat: 20
2 group                 0             1 FALSE          3 oth: 115, afr: 53, oec: 31        

── Variable type: numeric ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
  skim_variable n_missing complete_rate     mean       sd     p0     p25     p50      p75      p100 hist 
1 fertility             0             1     2.76     1.34   1.13    1.75    2.26     3.54      6.92 ▇▃▂▂▁
2 ppgdp                 0             1 13012.   18412.   115.   1283.   4684.   15520.   105095.   ▇▁▁▁▁
3 lifeExpF              0             1    72.3     10.1   48.1    65.7    75.9     79.6      87.1  ▂▂▂▇▅
4 pctUrban              0             1    57.9     23.4   11      39      59       75       100    ▅▆▇▇▆

skim() of the UN data by group

un_dta %>% group_by(group) %>% skim()
── Data Summary ────────────────────────
                           Values    
Name                       Piped data
Number of rows             199       
Number of columns          6         
_______________________              
Column type frequency:               
  factor                   1         
  numeric                  4         
________________________             
Group variables            group     

── Variable type: factor ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
  skim_variable group  n_missing complete_rate ordered n_unique top_counts                        
1 region        oecd           0             1 FALSE          5 Eur: 22, Asi: 3, Lat: 2, Nor: 2   
2 region        other          0             1 FALSE          6 Asi: 47, Lat: 18, Car: 17, Eur: 17
3 region        africa         0             1 FALSE          1 Afr: 53, Asi: 0, Car: 0, Eur: 0   

── Variable type: numeric ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
   skim_variable group  n_missing complete_rate     mean        sd      p0      p25      p50      p75      p100 hist 
 1 fertility     oecd           0             1     1.77     0.340    1.31     1.48     1.79     1.95      2.91 ▇▇▃▁▁
 2 fertility     other          0             1     2.35     0.927    1.13     1.65     2.17     2.61      5.97 ▇▆▂▁▁
 3 fertility     africa         0             1     4.24     1.30     1.59     3.17     4.42     5.08      6.92 ▅▃▇▅▂
 4 ppgdp         oecd           0             1 37761.   22092.    9101.   20138.   39546.   46453.   105095.   ▆▇▂▁▁
 5 ppgdp         other          0             1 11181.   15271.     499     2527.    5195.   12857.    92625.   ▇▁▁▁▁
 6 ppgdp         africa         0             1  2509.    3614.     115.     509      981.    2865     16852.   ▇▁▁▁▁
 7 lifeExpF      oecd           0             1    82.4      2.09    76.6     81.3     82.8     83.5      87.1  ▁▂▇▇▁
 8 lifeExpF      other          0             1    75.3      5.68    49.5     72.5     76.4     78.3      86.4  ▁▁▂▇▂
 9 lifeExpF      africa         0             1    59.8      8.69    48.1     53.1     58.6     63.8      78    ▇▆▅▁▃
10 pctUrban      oecd           0             1    75.8     11.7     49       68       78       85        97    ▁▅▃▇▂
11 pctUrban      other          0             1    60.2     24.0     13       44.5     60       76       100    ▅▃▇▇▆
12 pctUrban      africa         0             1    42.6     17.6     11       28       40       59        86    ▃▇▅▅▁

report package

Package for Summary statistics!

Important

This is an experimental package! It does not support some types of the variables and may break or produce an error.

report the UN data

# library(report)
# un_dta %>% report() %>% as_tibble()

report the UN data by group

# un_dta %>% group_by(group) %>% report() %>% as_tibble()

Univariate data: Boxplot

Boxplot: basics

Boxplot (1/4)

un_dta %>% 
  ggplot()

Boxplot (2/4)

un_dta %>% 
  ggplot() + 
  aes(y = fertility)

Boxplot (3/4)

un_dta %>% 
  ggplot() + 
  aes(y = fertility) + 
  geom_boxplot()

Boxplot (4/4)

un_dta %>% 
  ggplot() + 
  aes(y = fertility) + 
  geom_boxplot() + 
  labs(title = "Boxplot of fertility",
       subtitle = "Based on UN country level data for 2010-2015",
       y = "Fertility, children per woman",
       x = "")

Boxplot (4/4)

un_dta %>% 
  ggplot() + 
  aes(y = fertility) + 
  geom_boxplot()

Boxplot by groups (1/3)

un_dta %>% 
  ggplot() + 
  aes(y = fertility, x = region) + 
  geom_boxplot()

Boxplot by groups (2/3)

un_dta %>% 
  ggplot() + 
  aes(y = fertility, x = region, colour = region) + 
  geom_boxplot()

Boxplot by groups (3/3)

un_dta %>% 
  ggplot() + 
  aes(y = region, x = fertility, colour = region) + 
  geom_boxplot() + 
  theme(legend.position = "none")

Causal question on boxplot!

Does a country group has a causal effect on fertility?

Code
un_dta %>% 
  ggplot() + 
  aes(y = group, x = fertility, colour = group) + 
  geom_boxplot() + 
  theme(legend.position = "none")

Univariate data: Histogram

Histogram: basics

Important

Check the gallery https://r-graph-gallery.com/histogram.html;

Learn about histograms here: THE BOXPLOT AND ITS PITFALLS

Histogramus simplicius

un_dta 
# A tibble: 199 × 6
   region     group  fertility  ppgdp lifeExpF pctUrban
   <fct>      <fct>      <dbl>  <dbl>    <dbl>    <dbl>
 1 Asia       other       5.97   499      49.5       23
 2 Europe     other       1.52  3677.     80.4       53
 3 Africa     africa      2.14  4473      75         67
 4 Africa     africa      5.14  4322.     53.2       59
 5 Caribbean  other       2    13750.     81.1      100
 6 Latin Amer other       2.17  9162.     79.9       93
 7 Asia       other       1.74  3031.     77.3       64
 8 Caribbean  other       1.67 22852.     77.8       47
 9 Oceania    oecd        1.95 57119.     84.3       89
10 Europe     oecd        1.35 45159.     83.6       68
# … with 189 more rows

Histogramus simplicius

un_dta %>% 
  ggplot()

Histogramus simplicius

un_dta %>% 
  ggplot() + 
  aes(x = ppgdp)

Histogramus simplicius

un_dta %>% 
  ggplot() + 
  aes(x = ppgdp) + 
  geom_histogram() +
  labs(x = "GDP per capita, 2010 USD", y = "Frequency")

Histogram: bins

Histogram: bins

un_dta %>% 
  ggplot() + 
  aes(x = ppgdp) + 
  geom_histogram(bins = 50) +
  labs(x = "GDP per capita, 2010 USD", y = "Frequency")

Histogram: bins

un_dta %>% 
  ggplot() + 
  aes(x = ppgdp) + 
  geom_histogram(bins = 10) +
  labs(x = "GDP per capita, 2010 USD", y = "Frequency")

Histogram: bins

un_dta %>% 
  ggplot() + 
  aes(x = ppgdp) + 
  geom_histogram(bins = 3) +
  labs(x = "GDP per capita, 2010 USD", y = "Frequency")

Histogram by group

Histogram by group

un_dta %>% 
  ggplot() + 
  aes(x = ppgdp, fill = group) + 
  geom_histogram(bins = 10, colour = "black") +
  labs(x = "GDP per capita, 2010 USD", y = "Frequency")

Histogram by group

un_dta %>% 
  ggplot() + 
  aes(x = ppgdp, fill = group) + 
  geom_histogram(bins = 10, colour = "black", position = "dodge") +
  labs(x = "GDP per capita, 2010 USD", y = "Frequency")

Histogram by group

un_dta %>% 
  ggplot() + 
  aes(x = ppgdp, fill = group) + 
  geom_histogram(bins = 10, colour = "black") +
  facet_grid(group ~ ., scales = "free_y") +
  labs(x = "GDP per capita, 2010 USD", y = "Frequency")

Histogram: variables transformation

Histogram: x transformation (1)

un_dta %>% 
  ggplot() + 
  aes(x = ppgdp, fill = group) + 
  geom_histogram(bins = 5, colour = "black") +
  labs(x = "GDP per capita, 2010 USD", y = "Frequency")

Histogram: x transformation (1)

un_dta %>% 
  ggplot() + 
  aes(x = ppgdp, fill = group) + 
  geom_histogram(bins = 5, colour = "black") +
  scale_x_log10() +
  labs(x = "GDP per capita, 2010 USD", y = "Frequency")

Histogram: x transformation (1)

un_dta %>%
  mutate(log_ppgdp = log10(ppgdp)) %>% 
  ggplot() + 
  aes(x = log_ppgdp, fill = group) + 
  geom_histogram(bins = 5, colour = "black") +
  labs(x = "Log of GDP per capita (log base 10), 2010 USD", y = "Frequency")

Histogram: x transformation (1)

Warning

What is special about bins width, when we apply a transformation to x?

Important

Bins width start to vary, when we transform the data.

This is opposed to the fixed bins width when data is not transformed.

Histogram: x transformation (1)

Min, max and width of bins:

No transformation

       xmin      xmax     diff
1  -5832.26   5832.26 11664.51
2   5832.26  17496.77 11664.51
3  17496.77  29161.28 11664.51
4  29161.28  40825.79 11664.51
5  40825.79  52490.30 11664.51
6  52490.30  64154.81 11664.51
7  64154.81  75819.32 11664.51
8  75819.32  87483.83 11664.51
9  87483.83  99148.34 11664.51
10 99148.34 110812.86 11664.51

log10() transformation

       xmin      xmax     diff
1     64.55    137.71    73.16
2    137.71    293.79   156.08
3    293.79    626.77   332.98
4    626.77   1337.14   710.37
5   1337.14   2852.65  1515.51
6   2852.65   6085.83  3233.18
7   6085.83  12983.49  6897.66
8  12983.49  27698.91 14715.42
9  27698.91  59092.73 31393.82
10 59092.73 126068.14 66975.41

Density plot

Density plot: basics

Important

Density plot: example

un_dta %>% 
  ggplot() + 
  aes(x = ppgdp, fill = group) + 
  geom_density(alpha = 0.5) +
  scale_x_log10() +
  labs(x = "GDP per capita, 2010 USD", y = "Density")

Density plot: adjust

un_dta %>% 
  ggplot() + 
  aes(x = ppgdp, fill = group) + 
  geom_density(alpha = 0.5, adjust = 0.1) +
  scale_x_log10() +
  labs(x = "GDP per capita, 2010 USD", y = "Density")

Density plot: adjust

un_dta %>% 
  ggplot() + 
  aes(x = ppgdp, fill = group) + 
  geom_density(alpha = 0.5, adjust = 1) +
  scale_x_log10() +
  labs(x = "GDP per capita, 2010 USD", y = "Density")

Density plot: adjust

un_dta %>% 
  ggplot() + 
  aes(x = ppgdp, fill = group) + 
  geom_density(alpha = 0.5, adjust = 10) +
  scale_x_log10() +
  labs(x = "GDP per capita, 2010 USD", y = "Density")

Bivariate data: Scatter plot

Scatter plot: basics

Important

Simple scatter plot

un_dta
# A tibble: 199 × 6
   region     group  fertility  ppgdp lifeExpF pctUrban
   <fct>      <fct>      <dbl>  <dbl>    <dbl>    <dbl>
 1 Asia       other       5.97   499      49.5       23
 2 Europe     other       1.52  3677.     80.4       53
 3 Africa     africa      2.14  4473      75         67
 4 Africa     africa      5.14  4322.     53.2       59
 5 Caribbean  other       2    13750.     81.1      100
 6 Latin Amer other       2.17  9162.     79.9       93
 7 Asia       other       1.74  3031.     77.3       64
 8 Caribbean  other       1.67 22852.     77.8       47
 9 Oceania    oecd        1.95 57119.     84.3       89
10 Europe     oecd        1.35 45159.     83.6       68
# … with 189 more rows

Simple scatter plot

un_dta %>%
  ggplot()

Simple scatter plot

un_dta %>%
  ggplot() +
  aes(x = lifeExpF, y = fertility) 

Simple scatter plot

un_dta %>%
  ggplot() +
  aes(x = lifeExpF, y = fertility) +
  geom_point()

Simple scatter plot

un_dta %>%
  ggplot() +
  aes(x = lifeExpF, y = fertility) +
  geom_point() +
  labs(x = "Life expectancy of wemen at birth", y = "Fertility, children per woman")

Scatter plot: make it rich with colour

un_dta %>%
  ggplot() +
  aes(x = lifeExpF, y = fertility, color = group) +
  geom_point() +
  labs(x = "Life expectancy of wemen at birth", y = "Fertility, children per woman", color = "")

Scatter plot: make it rich with size

un_dta %>%
  ggplot() +
  aes(x = lifeExpF, y = fertility, color = group, size = ppgdp) +
  geom_point() +
  labs(x = "Life expectancy of wemen at birth", y = "Fertility, children per woman", color = "", size = "GDP/cap.")

Takeaway

Takeaway

Summary statistics:

  • Extracting one variable with pull() and $;
  • dplyr::summarise() + dplyr::group_by();
  • skimr + report;

Boxplot: geom_boxplot(), simple and by groups;

Histogram: geom_histogram(), bins, scale_x_log10()

  • bins is important as it changes a perspective;
  • data transformation changes bins width;

Scatter plot:

  • Create plots with rich visual details;
  • Be clear and explicit with labels;

References

References

Weisberg, Sanford. 2005. Applied Linear Regression. John Wiley & Sons, Inc. https://doi.org/10.1002/0471704091.