2. Overall Statistics
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Overall number of artists across all museums:
artists %>% summarize(size=n())
# Number of artists in each museum:
artists %>% group_by(museum) %>% summarize(size=n())
#Overall unique number of artists, after removing duplicates:
artists.unique <- artists %>% distinct(artist, .keep_all = TRUE)
artists.unique %>% summarize(size=n())
### Overall statistics
# Gender Distribution:
table(artists.unique$gender, useNA="always")
##
## man woman <NA>
## 7086 1025 1077
round(prop.table(table(artists.unique$gender)),3)
##
## man woman
## 0.874 0.126
#overall gender score confidence interval:
prop.test(1025, 1025+7086, correct=FALSE)
##
## 1-sample proportions test without continuity correction
##
## data: 1025 out of 1025 + 7086, null probability 0.5
## X-squared = 4529.1, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.1193170 0.1337799
## sample estimates:
## p
## 0.1263716
# Gender Distribution when only including artists born after 1945 and of North American origin:
gender.NorthAmerica.1945 <- artists.unique %>% filter(year>=1945, GEO3major=="North America") %>% select(gender)
table(gender.NorthAmerica.1945,useNA="always")
## gender.NorthAmerica.1945
## man woman <NA>
## 583 248 76
round(prop.table(table(gender.NorthAmerica.1945)),3)
## gender.NorthAmerica.1945
## man woman
## 0.702 0.298
# Ethnicity Distribution:
table(artists.unique$ethnicity, useNA="always")
##
## asian black hispanic other white <NA>
## 668 91 210 109 6315 1795
round(prop.table(table(artists.unique$ethnicity)),3)
##
## asian black hispanic other white
## 0.090 0.012 0.028 0.015 0.854
# Simultaneous Score Confidence Intervals:
nums <- unlist(table(artists.unique$ethnicity))
sapply(nums, function(x) prop.test(x, sum(nums), correct=FALSE, conf.level = 1-0.05/5)$conf.int)
## asian black hispanic other white
## [1,] 0.08213049 0.009415734 0.02383561 0.01154363 0.8432962
## [2,] 0.09931561 0.016076726 0.03382059 0.01881394 0.8644414
# Ethnicity Distribution when only including artists born after 1945 and of North American origin:
ethnicity.NorthAmerica.1945 <- artists.unique %>% filter(year>=1945, GEO3major=="North America") %>% select(ethnicity)
table(ethnicity.NorthAmerica.1945,useNA="always")
## ethnicity.NorthAmerica.1945
## asian black hispanic other white <NA>
## 4 28 7 21 664 183
round(prop.table(table(ethnicity.NorthAmerica.1945)),3)
## ethnicity.NorthAmerica.1945
## asian black hispanic other white
## 0.006 0.039 0.010 0.029 0.917
# Gender & Ethnicity Distribution:
table(artists.unique$gender, artists.unique$ethnicity, useNA="always")
##
## asian black hispanic other white <NA>
## man 510 62 173 61 5121 1159
## woman 35 26 20 29 732 183
## <NA> 123 3 17 19 462 453
round(addmargins(prop.table(table(artists.unique$gender, artists.unique$ethnicity))),3)
##
## asian black hispanic other white Sum
## man 0.075 0.009 0.026 0.009 0.757 0.876
## woman 0.005 0.004 0.003 0.004 0.108 0.124
## Sum 0.081 0.013 0.029 0.013 0.865 1.000
# Geographical Region:
table(artists.unique$GEO3major, useNA="always")
##
## Africa Asia and the Pacific
## 29 661
## Europe Latin America and the Caribbean
## 3329 162
## North America West Asia
## 3376 7
## <NA>
## 1624
round(prop.table(table(artists.unique$GEO3major)),3)
##
## Africa Asia and the Pacific
## 0.004 0.087
## Europe Latin America and the Caribbean
## 0.440 0.021
## North America West Asia
## 0.446 0.001
# Birth Decade
mean(artists.unique$year, na.rm=TRUE)
## [1] 1863.428
3.2 Ethnicity
ethndf <- artists %>% select(museum, ethnicity) %>% group_by(museum) %>%
summarize(asian=sum(ethnicity=="asian", na.rm=TRUE),
black=sum(ethnicity=="black", na.rm=TRUE),
hispanic=sum(ethnicity=="hispanic", na.rm=TRUE),
other=sum(ethnicity=="other", na.rm=TRUE),
white=sum(ethnicity=="white", na.rm=TRUE),
total=asian+black+hispanic+other+white
)
ethndf
3.2.1 Asian
ethndf.asian <- ethndf %>% select(museum, asian, total) %>% group_by(museum) %>%
mutate(prop=asian/total,
LB=prop.test(asian, total, correct=FALSE, conf.level = 1-0.05/(18*5))$conf.int[1],
UB=prop.test(asian, total, correct=FALSE, conf.level = 1-0.05/(18*5))$conf.int[2]
)
ethndf.asian$padj <- NA
for (i in 1:18) {
ethndf.asian$padj[i] <- prop.test(c(ethndf.asian$asian[i], sum(ethndf.asian$asian[-i])), c(ethndf.asian$total[i], sum(ethndf.asian$total[-i])), correct=FALSE)$p.value * (18*5)
}
ethndf.asian
3.2.2 Black
ethndf.black <- ethndf %>% select(museum, black, total) %>% group_by(museum) %>%
mutate(prop=black/total,
LB=prop.test(black, total, correct=FALSE, conf.level = 1-0.05/(18*5))$conf.int[1],
UB=prop.test(black, total, correct=FALSE, conf.level = 1-0.05/(18*5))$conf.int[2]
)
ethndf.black$padj <- NA
for (i in 1:18) {
ethndf.black$padj[i] <- prop.test(c(ethndf.black$black[i], sum(ethndf.black$black[-i])), c(ethndf.black$total[i], sum(ethndf.black$total[-i])), correct=FALSE)$p.value * (18*5)
}
## Warning in prop.test(c(ethndf.black$black[i], sum(ethndf.black$black[-
## i])), : Chi-squared approximation may be incorrect
## Warning in prop.test(c(ethndf.black$black[i], sum(ethndf.black$black[-
## i])), : Chi-squared approximation may be incorrect
ethndf.black
3.2.3 Hispanic
ethndf.hispanic <- ethndf %>% select(museum, hispanic, total) %>% group_by(museum) %>%
mutate(prop=hispanic/total,
LB=prop.test(hispanic, total, correct=FALSE, conf.level = 1-0.05/(18*5))$conf.int[1],
UB=prop.test(hispanic, total, correct=FALSE, conf.level = 1-0.05/(18*5))$conf.int[2]
)
ethndf.hispanic$padj <- NA
for (i in 1:18) {
ethndf.hispanic$padj[i] <- prop.test(c(ethndf.hispanic$hispanic[i], sum(ethndf.hispanic$hispanic[-i])), c(ethndf.hispanic$total[i], sum(ethndf.hispanic$total[-i])), correct=FALSE)$p.value * (18*5)
}
ethndf.hispanic
3.2.4 White
ethndf.white <- ethndf %>% select(museum, white, total) %>% group_by(museum) %>%
mutate(prop=white/total,
LB=prop.test(white, total, correct=FALSE, conf.level = 1-0.05/(18*5))$conf.int[1],
UB=prop.test(white, total, correct=FALSE, conf.level = 1-0.05/(18*5))$conf.int[2]
)
ethndf.white$padj <- NA
for (i in 1:18) {
ethndf.white$padj[i] <- prop.test(c(ethndf.white$white[i], sum(ethndf.white$white[-i])), c(ethndf.white$total[i], sum(ethndf.white$total[-i])), correct=FALSE)$p.value * (18*5)
}
ethndf.white
3.2.5 Other
ethndf.other <- ethndf %>% select(museum, other, total) %>% group_by(museum) %>%
mutate(prop=other/total,
LB=prop.test(other, total, correct=FALSE, conf.level = 1-0.05/(18*5))$conf.int[1],
UB=prop.test(other, total, correct=FALSE, conf.level = 1-0.05/(18*5))$conf.int[2]
)
ethndf.other$padj <- NA
for (i in 1:18) {
ethndf.other$padj[i] <- prop.test(c(ethndf.other$other[i], sum(ethndf.other$other[-i])), c(ethndf.other$total[i], sum(ethndf.other$total[-i])), correct=FALSE)$p.value * (18*5)
}
## Warning in prop.test(c(ethndf.other$other[i], sum(ethndf.other$other[-
## i])), : Chi-squared approximation may be incorrect
## Warning in prop.test(c(ethndf.other$other[i], sum(ethndf.other$other[-
## i])), : Chi-squared approximation may be incorrect
ethndf.other