Take Home Exercise 1

This article demonstrates how to create a Pareto chart and an age-sex pyramid in R.

Published

Jan. 28, 2022

DOI

Task 1

A Pareto chart showing the distribution of returns by product sub-category.

Installing and loading the required packages

# Packages used by this article.
packages <- c('tidyverse', 'readxl', 'knitr')

# Install any package that is missing, then attach it.
for (p in packages) {
  # requireNamespace() only checks availability; it does not attach, so the
  # package is attached exactly once by library() below.
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  library(p, character.only = TRUE)
}

data import

# Read the "Orders" and "Returns" worksheets from the same workbook.
orders <- read_xls(
  path = "data/Superstore-2021.xls",
  sheet = "Orders"
)
returns <- read_xls(
  path = "data/Superstore-2021.xls",
  sheet = "Returns"
)

data wrangling

Joining the two data frames

# Keep every row of `returns` and attach the matching `orders` rows,
# matched on the shared "Order ID" column.
joined_tab <- returns %>%
  left_join(orders, by = "Order ID")

Compute the frequency count by Sub-category

The count method
# Count joined rows per sub-category; the count column is named "Returns".
freq_returned <- count(joined_tab, `Sub-Category`, name = "Returns")

Sorting data

# Largest return count first.
freq_sorted <- arrange(freq_returned, desc(Returns))

Computing the cumulative frequency

# Running total of Returns down the sorted table.
freq_cum <- mutate(freq_sorted, cumfreq = cumsum(Returns))

Graphing the pareto chart

# Sum of the Returns column. Both y axes are derived from it so that the
# 10 % steps stay aligned whatever the data contains (previously the primary
# axis step was hard-coded as 322.6).
total_returns <- sum(freq_cum$Returns)

# Pareto chart: bars show the count per sub-category in descending order,
# the points/line show the cumulative count, and the secondary axis expresses
# the cumulative count as a share of the total.
# Columns are referenced by name inside aes() rather than via `freq_cum$...`,
# so the mappings are evaluated against the layer data.
ggplot(freq_cum, aes(x = reorder(`Sub-Category`, -Returns))) +
  geom_col(aes(y = Returns), fill = 'light blue') +
  geom_point(aes(y = cumfreq)) +
  # group = 1 joins all categories with a single line on the discrete x axis.
  geom_line(aes(y = cumfreq, group = 1)) +
  scale_y_continuous(
    name = 'absolute frequency',
    breaks = total_returns * seq(0, 1, by = 0.1),
    sec.axis = sec_axis(
      ~ . / total_returns,
      breaks = seq(0, 1, by = 0.1),
      labels = scales::percent
    )
  ) +
  labs(title = 'Pareto Chart of return by sub-category', x = 'Sub-Category') +
  theme_bw(base_size = 7)

Task 2

An age-sex pyramid showing the demographic structure of Singapore by age cohort and gender.

data import

# Load the resident population table from CSV.
residents_stats <- read_csv(file = "data/respopagesextod2021.csv")

data wrangling

Subset the dataframe

# Keep only the columns used for the pyramid. Selecting by name instead of by
# position (3, 4, 6) keeps this correct if the column order of the CSV changes.
# NOTE(review): assumes positions 3, 4 and 6 are AG, Sex and Pop - confirm
# against the CSV header.
residents_data <- residents_stats %>%
  select(AG, Sex, Pop)

Arrange Age in ascending order

# Sorting AG as plain text is alphabetical, which is not ascending age
# (presumably labels look like "5_to_9" / "45_to_49" - confirm against data).
# Convert AG to a factor whose levels are ordered by the first number in each
# label, so that arrange(), group_by() and the plot axis all follow age order.
residents_data_ordered <- residents_stats %>%
  mutate(
    AG = factor(
      AG,
      levels = unique(AG[order(parse_number(as.character(AG)))])
    )
  ) %>%
  arrange(AG)

Group by age group and sex, and sum the population of each group

# Total population for each (age group, sex) pair.
# .groups = "drop" returns an ungrouped tibble; without it the result stays
# grouped by AG and summarise() prints a regrouping message.
residents_data_sorted <- residents_data_ordered %>%
  group_by(AG, Sex) %>%
  summarise(Pop = sum(Pop), .groups = "drop")

Graphing the pyramid

# Age-sex pyramid: rows where Sex is 'Males' are plotted as negative values so
# they extend to the opposite side of zero; axis labels show absolute values.
ggplot(
  residents_data_sorted,
  aes(
    x = AG,
    y = ifelse(Sex == 'Males', -Pop, Pop),
    fill = Sex
  )
) +
  geom_col() +
  # Flip so that age groups run along the vertical axis.
  coord_flip() +
  scale_y_continuous(
    name = 'Male versus Female Population',
    breaks = seq(from = -160000, to = 160000, by = 20000),
    labels = abs(seq(from = -160000, to = 160000, by = 20000))
  ) +
  labs(
    title = 'Age-Sex Pyramid of Singapore Population by Age Group and Sex',
    x = 'Age Group'
  ) +
  theme(axis.text.x = element_text(size = 5))

Comparison between programmatic visualization and GUI software (Tableau)

GUI software

Programmatic Visualization