1. Load Artist Datafile

path <- 'https://raw.githubusercontent.com/artofstat/ArtistDiversity/master/artistdata.csv'
artists <- read.csv(path)

2. Overall Statistics

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Overall number of artists across all museums:
artists %>%  summarize(size=n()) 
# Number of artists in each museum: 
artists %>%  group_by(museum) %>% summarize(size=n()) 
#Overall unique number of artists, after removing duplicates:
artists.unique <- artists %>% distinct(artist, .keep_all = TRUE)
artists.unique %>%  summarize(size=n()) 
### Overall statistics
# Gender Distribution:
table(artists.unique$gender, useNA="always")
## 
##   man woman  <NA> 
##  7086  1025  1077
round(prop.table(table(artists.unique$gender)),3)
## 
##   man woman 
## 0.874 0.126
#overall gender score confidence interval:
prop.test(1025, 1025+7086, correct=FALSE)
## 
##  1-sample proportions test without continuity correction
## 
## data:  1025 out of 1025 + 7086, null probability 0.5
## X-squared = 4529.1, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.1193170 0.1337799
## sample estimates:
##         p 
## 0.1263716
# Gender Distribution when only including artists born after 1945 and of North American origin:
gender.NorthAmerica.1945 <- artists.unique %>% filter(year>=1945, GEO3major=="North America") %>% select(gender)
table(gender.NorthAmerica.1945,useNA="always")
## gender.NorthAmerica.1945
##   man woman  <NA> 
##   583   248    76
round(prop.table(table(gender.NorthAmerica.1945)),3)
## gender.NorthAmerica.1945
##   man woman 
## 0.702 0.298
# Ethnicity Distribution:
table(artists.unique$ethnicity, useNA="always")
## 
##    asian    black hispanic    other    white     <NA> 
##      668       91      210      109     6315     1795
round(prop.table(table(artists.unique$ethnicity)),3)
## 
##    asian    black hispanic    other    white 
##    0.090    0.012    0.028    0.015    0.854
# Simultaneous Score Confidence Intervals:
nums <- unlist(table(artists.unique$ethnicity))
sapply(nums, function(x) prop.test(x, sum(nums), correct=FALSE, conf.level = 1-0.05/5)$conf.int)
##           asian       black   hispanic      other     white
## [1,] 0.08213049 0.009415734 0.02383561 0.01154363 0.8432962
## [2,] 0.09931561 0.016076726 0.03382059 0.01881394 0.8644414
# Ethnicity Distribution when only including artists born after 1945 and of North American origin:
ethnicity.NorthAmerica.1945 <- artists.unique %>% filter(year>=1945, GEO3major=="North America") %>% select(ethnicity)
table(ethnicity.NorthAmerica.1945,useNA="always")
## ethnicity.NorthAmerica.1945
##    asian    black hispanic    other    white     <NA> 
##        4       28        7       21      664      183
round(prop.table(table(ethnicity.NorthAmerica.1945)),3)
## ethnicity.NorthAmerica.1945
##    asian    black hispanic    other    white 
##    0.006    0.039    0.010    0.029    0.917
# Gender & Ethnicity Distribution:
table(artists.unique$gender, artists.unique$ethnicity, useNA="always")
##        
##         asian black hispanic other white <NA>
##   man     510    62      173    61  5121 1159
##   woman    35    26       20    29   732  183
##   <NA>    123     3       17    19   462  453
round(addmargins(prop.table(table(artists.unique$gender, artists.unique$ethnicity))),3)
##        
##         asian black hispanic other white   Sum
##   man   0.075 0.009    0.026 0.009 0.757 0.876
##   woman 0.005 0.004    0.003 0.004 0.108 0.124
##   Sum   0.081 0.013    0.029 0.013 0.865 1.000
# Geographical Region:
table(artists.unique$GEO3major, useNA="always")
## 
##                          Africa            Asia and the Pacific 
##                              29                             661 
##                          Europe Latin America and the Caribbean 
##                            3329                             162 
##                   North America                       West Asia 
##                            3376                               7 
##                            <NA> 
##                            1624
round(prop.table(table(artists.unique$GEO3major)),3)
## 
##                          Africa            Asia and the Pacific 
##                           0.004                           0.087 
##                          Europe Latin America and the Caribbean 
##                           0.440                           0.021 
##                   North America                       West Asia 
##                           0.446                           0.001
# Birth Decade
mean(artists.unique$year, na.rm=TRUE)
## [1] 1863.428

3. Museum Specific Analysis

3.1 Gender

genderdf <- artists %>% select(museum, gender) %>% group_by(museum) %>% 
summarize(men=sum(gender=="man", na.rm=TRUE), 
          women=sum(gender=="woman", na.rm=TRUE),
          total=men+women,
          prop.women=women/total, 
          LB=prop.test(women,total, correct=FALSE, conf.level = 1-0.05/18)$conf.int[1], 
          UB=prop.test(women,total, correct=FALSE, conf.level = 1-0.05/18)$conf.int[2]
          )
genderdf$padj <- NA
for (i in 1:18) {
  genderdf$padj[i] <- prop.test(c(genderdf$women[i], sum(genderdf$women[-i])), c(genderdf$total[i], sum(genderdf$total[-i])), correct=FALSE)$p.value * 18
}
genderdf

3.2 Ethnicity

ethndf <- artists %>% select(museum, ethnicity) %>% group_by(museum) %>% 
  summarize(asian=sum(ethnicity=="asian", na.rm=TRUE), 
            black=sum(ethnicity=="black", na.rm=TRUE), 
            hispanic=sum(ethnicity=="hispanic", na.rm=TRUE), 
            other=sum(ethnicity=="other", na.rm=TRUE), 
            white=sum(ethnicity=="white", na.rm=TRUE), 
            total=asian+black+hispanic+other+white
            )
ethndf

3.2.1 Asian

ethndf.asian <- ethndf %>% select(museum, asian, total) %>% group_by(museum) %>%
  mutate(prop=asian/total, 
         LB=prop.test(asian, total, correct=FALSE, conf.level = 1-0.05/(18*5))$conf.int[1], 
         UB=prop.test(asian, total, correct=FALSE, conf.level = 1-0.05/(18*5))$conf.int[2]
         )
ethndf.asian$padj <- NA
for (i in 1:18) {
  ethndf.asian$padj[i] <- prop.test(c(ethndf.asian$asian[i], sum(ethndf.asian$asian[-i])), c(ethndf.asian$total[i], sum(ethndf.asian$total[-i])), correct=FALSE)$p.value * (18*5)
}
ethndf.asian

3.2.2 Black

ethndf.black <- ethndf %>% select(museum, black, total) %>% group_by(museum) %>%
  mutate(prop=black/total, 
         LB=prop.test(black, total, correct=FALSE, conf.level = 1-0.05/(18*5))$conf.int[1], 
         UB=prop.test(black, total, correct=FALSE, conf.level = 1-0.05/(18*5))$conf.int[2]
         )
ethndf.black$padj <- NA
for (i in 1:18) {
  ethndf.black$padj[i] <- prop.test(c(ethndf.black$black[i], sum(ethndf.black$black[-i])), c(ethndf.black$total[i], sum(ethndf.black$total[-i])), correct=FALSE)$p.value * (18*5)
}
## Warning in prop.test(c(ethndf.black$black[i], sum(ethndf.black$black[-
## i])), : Chi-squared approximation may be incorrect

## Warning in prop.test(c(ethndf.black$black[i], sum(ethndf.black$black[-
## i])), : Chi-squared approximation may be incorrect
ethndf.black

3.2.3 Hispanic

ethndf.hispanic <- ethndf %>% select(museum, hispanic, total) %>% group_by(museum) %>%
  mutate(prop=hispanic/total, 
         LB=prop.test(hispanic, total, correct=FALSE, conf.level = 1-0.05/(18*5))$conf.int[1], 
         UB=prop.test(hispanic, total, correct=FALSE, conf.level = 1-0.05/(18*5))$conf.int[2]
         )
ethndf.hispanic$padj <- NA
for (i in 1:18) {
  ethndf.hispanic$padj[i] <- prop.test(c(ethndf.hispanic$hispanic[i], sum(ethndf.hispanic$hispanic[-i])), c(ethndf.hispanic$total[i], sum(ethndf.hispanic$total[-i])), correct=FALSE)$p.value * (18*5)
}
ethndf.hispanic

3.2.4 White

ethndf.white <- ethndf %>% select(museum, white, total) %>% group_by(museum) %>%
  mutate(prop=white/total, 
         LB=prop.test(white, total, correct=FALSE, conf.level = 1-0.05/(18*5))$conf.int[1], 
         UB=prop.test(white, total, correct=FALSE, conf.level = 1-0.05/(18*5))$conf.int[2]
         )
ethndf.white$padj <- NA
for (i in 1:18) {
  ethndf.white$padj[i] <- prop.test(c(ethndf.white$white[i], sum(ethndf.white$white[-i])), c(ethndf.white$total[i], sum(ethndf.white$total[-i])), correct=FALSE)$p.value * (18*5)
}
ethndf.white

3.2.5 Other

ethndf.other <- ethndf %>% select(museum, other, total) %>% group_by(museum) %>%
  mutate(prop=other/total, 
         LB=prop.test(other, total, correct=FALSE, conf.level = 1-0.05/(18*5))$conf.int[1], 
         UB=prop.test(other, total, correct=FALSE, conf.level = 1-0.05/(18*5))$conf.int[2]
         )
ethndf.other$padj <- NA
for (i in 1:18) {
  ethndf.other$padj[i] <- prop.test(c(ethndf.other$other[i], sum(ethndf.other$other[-i])), c(ethndf.other$total[i], sum(ethndf.other$total[-i])), correct=FALSE)$p.value * (18*5)
}
## Warning in prop.test(c(ethndf.other$other[i], sum(ethndf.other$other[-
## i])), : Chi-squared approximation may be incorrect

## Warning in prop.test(c(ethndf.other$other[i], sum(ethndf.other$other[-
## i])), : Chi-squared approximation may be incorrect
ethndf.other

3.3 Geographic Origin

geodf <- artists %>% select(museum, GEO3major) %>% group_by(museum) %>% 
  summarize(Africa=round(100*prop.table(table(GEO3major))[1],1), 
            Asia=round(100*prop.table(table(GEO3major))[2],1), 
            Europe=round(100*prop.table(table(GEO3major))[3],1), 
            LatAm=round(100*prop.table(table(GEO3major))[4],1), 
            NorthAm=round(100*prop.table(table(GEO3major))[5],1), 
            WAsia=round(100*prop.table(table(GEO3major))[6],1)
  )
geodf

3.4 Birth Year

yeardf <- artists %>% select(museum, year) %>% group_by(museum) %>% 
  summarize(Avg.Year=round(mean(year, na.rm=TRUE)))
yeardf