forked from daniel-wells/colleges-comparison
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Oxford_Colleges.Rmd
242 lines (187 loc) · 11.7 KB
/
Oxford_Colleges.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
---
title: "Oxford College Comparison"
output: github_document
---
Oxford University is made up of 38 self governing colleges (and 6 Permanent Private Halls). A college is where students live, eat, and attend tutorials (small group teaching). Each college has its own character and it can be quite bewildering to get a feel for what each college is like in order to choose which one to apply to without actually studying there. This analysis uses basic clustering and dimensionality reduction techniques to group together similar colleges and provide a high level overview of their characteristics. Code for this analysis can be found in the [.Rmd file](Oxford_Colleges.Rmd).
```{r load_data, echo=FALSE, message=FALSE, warning=FALSE}
library(knitr)
opts_chunk$set(echo=FALSE,fig.width=10, fig.asp=1, fig.retina=2, message=FALSE, warning=FALSE)
library(data.table, quietly = TRUE) # tidying data
library(ggplot2) # plotting
library(ggrepel) # labels on points
library(NMF) # heatmaps
nmf.options(grid.patch=TRUE) # stop blank pages appearing
library(dendextend) # dendograms
library(RColorBrewer) # colour pallet for dendogram
library(reshape2) # see norrington
# Regex for precleaning:
#\[[0-9]+\]
#College: [0-9]+
# Data source:
# https://en.wikipedia.org/wiki/Colleges_of_the_University_of_Oxford
# cross checked with:
# https://public.tableau.com/views/UniversityofOxford-StudentStatistics/CollegeOneYearOverview
oxford_meta <- fread("data/ox_colleges.csv")
oxford_meta <- oxford_meta[Name != "Total"]
oxford_meta$Year_of_Foundation <- as.integer(oxford_meta$Foundation)
oxford_meta$Assets <- as.integer(oxford_meta$Assets)
oxford_meta$Endowment <- as.integer(oxford_meta$Endowment)
oxford_meta$Assets_per_student <- as.integer(oxford_meta$Assets_per_student)
# http://www.chooseoxfordcollege.co.uk/about/data-sources.php
# http://oxpoints.oucs.ox.ac.uk/type/College.kml
# http://www.convertcsv.com/kml-to-csv.htm
oxford_kml <- fread("data/ox_college_locations.csv")
# https://www.ox.ac.uk/media/global/wwwoxacuk/localsites/gazette/documents/statisticalinformation/admissionsstatistics/Admissions_Statistics_2013.pdf
oxford_state <- fread("data/ox_state_private.csv")
# https://www.ox.ac.uk/about/facts-and-figures/admissions-statistics/undergraduate/additional-info/college-success-rates
oxford_acceptance <- fread("data/college_acceptance_rates.csv")
# https://public.tableau.com/views/UniversityofOxford-StudentStatistics/CollegeOneYearOverview
nationalities <- fread("data/Nationality.csv")
# Data from https://www.ox.ac.uk/about/facts-and-figures/undergraduate-degree-classifications
norrington_mean <- fread("data/Norrington_mean.tsv")
setnames(norrington_mean, "College", "Name")
# https://en.wikipedia.org/wiki/Eights_Week
eights_headships <- fread("data/Eights_Headships.csv")
setkey(oxford_meta, Name)
setkey(oxford_kml, Name)
setkey(oxford_state, Name)
setkey(oxford_acceptance, Name)
setkey(nationalities, Name)
setkey(norrington_mean, Name)
setkey(eights_headships, Name)
oxford_meta <- oxford_kml[, .(Name, longitude, latitude)][oxford_meta]
oxford_meta <- oxford_state[oxford_meta]
oxford_meta <- oxford_acceptance[, .(Name, percent_accepted)][oxford_meta]
oxford_meta <- nationalities[, .(Name, UK_percent)][oxford_meta]
oxford_meta <- norrington_mean[oxford_meta]
oxford_meta <- eights_headships[oxford_meta]
setnames(oxford_meta,
c("Female","UK_percent","Norrington_Score","Eights_Headships","longitude","latitude","percent_accepted","State_acceptance_rate"),
c("Proportion_Female","Proportion_UK_Nationality","Academic_Performance","Rowing_Performance","Longitude","Latitude","Application_Sucess_Rate","Proportion_State_School"))
oxford_meta$Name <- gsub(" College", "", oxford_meta$Name)
```
```{r, include=FALSE}
# https://www.ox.ac.uk/admissions/undergraduate/colleges/college-rent-and-other-charges?wssl=1
# http://apply.ousu.org/colleges/compare/undergraduate-accommodation/
#costs <- fread("data/College_costs.csv")
#genders <- fread("data/Gender.csv")
#student_numbers <- fread("data/Student_Numbers.csv")
#setkey(student_numbers, Name)
#setkey(genders, Name)
#setkey(costs, Name)
#oxford_meta <- student_numbers[,.(Name, Total, Undergraduate_percent)][oxford_meta]
#oxford_meta <- genders[,.(Name, Female_percent)][oxford_meta]
#costs[,number_rooms := On_site_en_suite + Off_site_en_suite + Off_site_not_en + On_site_not_en]
#costs[,ensuite_percent := (On_site_en_suite+Off_site_en_suite) / number_rooms]
#oxford_meta <- costs[,.(Name, Accom_meal_cost, Accom_cost, number_rooms, ensuite_percent)][oxford_meta]
```
```{r plot_location, fig.height=9, fig.width=9, include=FALSE}
# check location variables
ggplot(oxford_meta, aes(Longitude, Latitude)) +
geom_point() +
geom_label_repel(aes(label = Name), size = 2)
```
First let's do a hierarchical clustering and display it as a heat map. You can see the features I selected as columns (academic performance is the mean norrington score, and rowing performance is the number of headships in summer eights regatta). There is quite a good clustering of similar colleges (postgraduate colleges are together, prestigious colleges are together etc.).
```{r hierarchical_clustering, fig.width=10, fig.asp=1}
matrix_ox <- as.matrix(oxford_meta[, .(Year_of_Foundation, Endowment, Total_students, Proportion_Postgrads = Postgraduates / Total_students, Proportion_Female, Longitude, Latitude, Application_Sucess_Rate, Proportion_State_School, Proportion_UK_Nationality, Academic_Performance, Rowing_Performance)])
scale_matrix_ox <- scale(matrix_ox, T, T)
rownames(scale_matrix_ox) <- oxford_meta$Name
aheatmap(scale_matrix_ox, hclustfun="ward.D2", na.color = "grey")
```
Let's take a closer look at the dendrogram on the left which provides a sort of 'family tree' of the colleges showing which colleges are closely 'related' i.e. have similar characteristics. The first split is between the graduate only colleges (green) and the rest. Then in orange and purple we have the group of colleges generally considered the most prestigious (old, large endowment, pretty architecture). All Souls is an outlier in that it has only 6 graduate humanities students in the whole college, but it is very prestigious and so clusters better with the mixed undergraduate-graduate colleges in orange rather than the other graduate only colleges. In blue and pink are the most relaxed colleges which are generally newer and located further from the center. In yellow are the rest of the colleges.
```{r dendrogram, fig.width=10, fig.asp=1}
clustering <- hclust(dist(scale_matrix_ox), "ward.D2")
# Create coloured dendrogram
college_dendrogram <- as.ggdend(color_branches(clustering, k = 6, col = brewer.pal(6, name = "Accent")))
# Plot dendogram
ggplot(college_dendrogram, horiz = TRUE, offset_labels = -0.2)
```
Many of the features used in the above analysis provide redundant information, e.g. the percentage of postgrads is highly negatively correlated with the percentage of students of UK nationality. Below is the correlation matrix for each feature.
```{r correlation_matrix, fig.width=10, fig.asp=1}
# impute missing values using the mean
scale_matrix_ox[is.na(scale_matrix_ox)] <- 0
# View correlation matrix
aheatmap(cor(scale_matrix_ox), breaks = seq(-1, 1, 0.02), cexCol = 1.0, cexRow = 0.25, treeheight = c(0, 30), legend=TRUE)
```
We can use a dimensionality reduction technique called principal components analysis to collapse this dataset into a smaller number of features which enables the plotting of each college on a two dimensional plot which captures most of the variation between the colleges. In the bottom right you can see a cluster of the graduate colleges. In the bottom left are the most prestigious colleges (Christ Church, St John's, Magdalen). In the main cluster the newer and more relaxed colleges are towards the upper right.
```{r PCA, fig.width=10, fig.asp=1}
# compute PCA
ox_pca <- prcomp(scale_matrix_ox, scale. = TRUE, center = TRUE)
percent_variance_explained <- signif(100 * (ox_pca$sdev)^2 / sum(ox_pca$sdev^2),3)
ox_pca_dt <- data.table(ox_pca$x[, 1:4], oxford_meta)
# plot PCA
ggplot(ox_pca_dt, aes(PC1, PC2, colour = Year_of_Foundation)) +
geom_point() +
theme(legend.position="bottom") +
geom_label_repel(aes(label = Name), size = 4, force = 4) +
xlab(paste0("1st Principal Component (", percent_variance_explained[1], "% variance explained)")) +
ylab(paste0("2nd Principal Component (", percent_variance_explained[2], "% variance explained)"))
```
We could also create a dendrogram ('family tree') of the colleges using the principal components but we get largely the same result. The main difference being All Souls is now grouped in with the other graduate colleges.
```{r compare_dendrograms, fig.width=10, fig.asp=1}
# recluster with pc's
clustering_pca <- hclust(dist(ox_pca$x[, 1:9]), "ward.D2")
# clustering is similar
tanglegram(as.dendrogram(clustering), as.dendrogram(clustering_pca), margin_inner = 8)
```
```{r, fig.height=9, fig.width=9, include=FALSE}
# tSNE is another dimensionality reduction technique, but *non* linear (PCA is linear)
# library(Rtsne)
# ox_tsne_raw <- Rtsne(scale_matrix_ox, perplexity = 5)$Y
# ox_tsne <- Rtsne(ox_pca$x[,1:9], perplexity = 5)$Y
# plot tSNE
# ggplot(ox_pca_dt, aes(V1, V2, colour=Foundation)) +
# geom_point() +
# geom_label_repel(aes(label=Name), size=2)
```
While I have the cleaned data I thought I could also improve the visualisation of the Norrington scores (a measure of academic performance), for which the current plot on wikipedia had so many overlapping lines it was hard to extract any information. I recreate it below:
```{r original_norrington, fig.width=10, fig.asp=1}
# Data from https://www.ox.ac.uk/about/facts-and-figures/undergraduate-degree-classifications
# Load data
norrington <- fread("data/Norrington/Cleaned-Table 1.csv")
# make each row a score for each college in a given year
norrington <- melt(norrington, "V1")
# rename columns
names(norrington) <- c("College", "Year", "Score")
# order College by mean score
norrington$College <- factor(norrington$College, levels = norrington[, mean(Score, na.rm = TRUE), by = College][order(-V1)]$College)
# save for PCA analysis
fwrite(norrington[,.(Norrington_Score=signif(mean(Score, na.rm = TRUE),4)), by=College][order(Norrington_Score)], "data/Norrington_mean.tsv")
# simplify years e.g. 2015/2016 -> 2016
norrington$Year <- as.integer(gsub("[0-9]+/", "", norrington$Year))
# remove PPH's due to low student numbers
pphs <- c("Blackfriars",
"Campion Hall",
"Greyfriars",
"Regent's Park",
"Ripon",
"St Benet's Hall",
"St Stephen's House",
"Wycliffe Hall")
norrington_2 <- norrington[!College %in% pphs]
# Calculate ranking
invisible(norrington_2[, Rank := as.integer(rank(-Score)), by = Year])
# plot all together
ggplot(norrington_2, aes(Year, Rank, group=College, colour=College)) +
geom_line(size=0.8) +
scale_y_reverse(breaks = unique(norrington_2$Rank), minor_breaks=NULL) +
scale_x_discrete(expand=c(0 , 2), position="top") +
theme_bw() +
theme(legend.position = "none") +
scale_color_manual(values = c(rep(brewer.pal(10, "Paired"), 3))) +
geom_text_repel(
data = subset(norrington_2, Year == "2016"),
aes(label = gsub(" College", "", College)),
size = 3,
nudge_x = 500
)
```
I think a clearer way to display this data is to break the rankings down by college. You can see most of the college ranking fluctuate a lot as their scores are very similar. It should also be noted that different subjects award different proportions of degree classes and each college has a different proportion of subjects which biases the rankings.
```{r new_norrington, fig.width=10, fig.asp=1}
# create plot
ggplot(norrington_2, aes(Year, Rank)) +
geom_line(size = 0.8) +
scale_y_reverse(breaks = c(1, 10, 20, 30), minor_breaks = 1:30) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
facet_wrap(~College)
```