generated from jtr13/EDAVtemplate
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path04-missing.Rmd
99 lines (82 loc) · 3.15 KB
/
04-missing.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# Missing values
We found no missing values in our raw data sources, in part because all of the data sets we use are climate records and most of them are provided by NASA and NOAA. The missing value plots for some of these data sets are shown below.
```{r}
library(tidyverse)
library(patchwork)
```
```{r}
# Draw a three-panel overview of the missing-value patterns in `b`.
#
# Panels (laid out with patchwork):
#   * top   - bar chart of the missing values per column, with the columns
#             ordered from most to fewest missing values;
#   * main  - tile map with one row per distinct missingness pattern
#             (purple = missing, grey = present); patterns without any
#             missing value are drawn more opaque and labelled
#             "complete cases";
#   * right - bar chart of how many rows follow each pattern.
#
# Arguments:
#   b        - data frame to inspect; it must have at least one row. The
#              names `count`, `y` and `z` are used for helper columns, so
#              columns of `b` carrying those names will collide with them.
#   percent  - FALSE (default) shows row counts on both bar charts, TRUE
#              shows percentages of `nrow(b)` on fixed 0-100 axes.
#   subtitle - subtitle printed under the "Missing value patterns" title.
#
# Prints the figure and returns the patchwork object invisibly.
plot_missing <- function(b, percent = FALSE, subtitle = "") {
  n_rows <- nrow(b)
  if (is.null(n_rows) || n_rows == 0) {
    stop("`b` must have at least one row.", call. = FALSE)
  }

  # One logical matrix drives every panel. Unlike apply(b, 2, is.na) it keeps
  # its two dimensions for single-row input.
  na_flags <- is.na(b)
  var_names <- colnames(na_flags)

  # Missing values per column, sorted so the worst column comes first. The
  # names live in a regular column rather than in row names, so the ordering
  # does not depend on row names surviving arrange().
  dt1 <- data.frame(
    x = var_names,
    num = unname(colSums(na_flags)),
    stringsAsFactors = FALSE
  )
  dt1 <- arrange(dt1, desc(num))
  dt1$x <- factor(dt1$x, levels = dt1$x)

  if (percent) {
    dt1$height <- dt1$num / n_rows * 100
    top_label <- "% rows\nmissing:"
    top_scale <- ylim(0, 100)
  } else {
    dt1$height <- dt1$num
    top_label <- "num rows\nmissing:"
    # Counts are whole numbers, so keep only integer breaks instead of a
    # fixed 0/10/20 grid that stops labelling the axis beyond 20.
    top_scale <- scale_y_continuous(
      breaks = function(lims) unique(floor(pretty(lims)))
    )
  }

  p1 <- ggplot(dt1, aes(x = x, y = height)) +
    geom_col(fill = "lightblue") +
    ylab(top_label) +
    xlab("") +
    ggtitle("Missing value patterns", subtitle = subtitle) +
    theme_bw() +
    theme(panel.grid.major.x = element_blank()) +
    top_scale

  # Distinct missingness patterns, most frequent first. as.data.frame() on
  # the matrix keeps the column names exactly as they are in `b`, so they
  # still match the levels of dt1$x below.
  dt2 <- as.data.frame(na_flags) %>%
    group_by_all() %>%
    count(name = "count", sort = TRUE) %>%
    ungroup()

  # A pattern is a complete case when none of its flags is TRUE.
  is_complete <- rowSums(dt2[var_names]) == 0
  dt2$z <- ifelse(is_complete, "yes", "no")
  # Reversed levels put the most frequent pattern at the top of the y axis.
  pattern_ids <- seq_len(nrow(dt2))
  dt2$y <- factor(pattern_ids, levels = rev(pattern_ids))

  dt3 <- pivot_longer(
    dt2,
    cols = -c(count, z, y),
    names_to = "key",
    values_to = "value"
  )
  dt3$key <- factor(dt3$key, levels = levels(dt1$x))

  # Named scale values pin each colour and opacity to its meaning. Unnamed
  # values are handed out in order of the levels present, which swaps them
  # when only one level occurs (e.g. data without any missing value).
  p2 <- ggplot(dt3, aes(key, y, fill = value)) +
    geom_tile(aes(alpha = z), colour = "white") +
    ylab("missing pattern") +
    xlab("variable") +
    scale_fill_manual(values = c("FALSE" = "grey", "TRUE" = "purple")) +
    scale_alpha_manual(values = c(no = 0.4, yes = 0.9)) +
    theme(legend.position = "none")

  # Label the complete-case rows only when there are any. Tiles sit at
  # x = 1..ncol(b), so (ncol(b) + 1) / 2 is the horizontal centre.
  complete_rows <- which(rev(is_complete))
  if (length(complete_rows) > 0) {
    p2 <- p2 + annotate(
      "text",
      x = (ncol(b) + 1) / 2,
      y = complete_rows,
      label = "complete cases"
    )
  }

  if (percent) {
    dt2$width <- dt2$count / n_rows * 100
    side_label <- "% rows"
    side_max <- 100
  } else {
    dt2$width <- dt2$count
    side_label <- "row count"
    side_max <- max(dt2$count)
  }

  p3 <- ggplot(dt2, aes(width, y)) +
    geom_col(fill = "lightblue") +
    ylab("") +
    xlab(side_label) +
    xlim(0, side_max) +
    theme_bw() +
    theme(panel.grid.major.y = element_blank())

  # 4 x 4 grid: column bars on top, tile map below them, pattern bars on the
  # right; the top-right cell stays empty.
  layout <- c(
    area(1, 1, 1, 3),
    area(1, 4, 1, 4),
    area(2, 1, 4, 3),
    area(2, 4, 4, 4)
  )
  figure <- p1 + plot_spacer() + p2 + p3 + plot_layout(design = layout)

  print(figure)
  invisible(figure)
}
```
```{r}
# Read the cleaned yearly temperature table (its first line holds the column
# names) and draw the missing-value overview for it.
yearly_temp <- read.table(
  file = "data/clean/graph.txt",
  header = TRUE
)
plot_missing(
  yearly_temp,
  subtitle = "Global Yearly Temperature"
)
```
```{r}
# Fetch the table straight from the NOAA GML server. Lines starting with "#"
# are skipped as comments and no header row is read, so the columns get
# read.table()'s default names.
CO2 <- read.table(
  file = "https://gml.noaa.gov/webdata/ccgg/trends/co2/co2_mm_gl.txt",
  header = FALSE,
  comment.char = "#"
)
plot_missing(
  CO2,
  subtitle = "Global Monthly CO2"
)
```
```{r}
# Read the Antarctica ice sheet mass table: the first 31 lines of the file are
# skipped and the three columns are named explicitly.
df <- read.table(
  "data/clean/ice_sheet_antarctica_mass_200204_202109.txt",
  skip = 31,
  col.names = c("TIME", "antarctica", "antarctica_sigma")
)
# Plot the table that was just read. The original call passed `CO2` here, so
# the chunk repeated the CO2 figure under the ice sheet subtitle.
plot_missing(df, subtitle = "Ice Sheet Mass in Antarctica")
```