Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add figure-loss-small-data animint as a test #123

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Binary file added tests/testthat/loss.small.rds
Binary file not shown.
Binary file added tests/testthat/nb.rds
Binary file not shown.
186 changes: 186 additions & 0 deletions tests/testthat/test-renderer3-figure-loss-small-data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
acontext("FigureLossSmall")
library(data.table)
library(animint2)
library(jointseg)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please use jointseg:: instead of library(jointseg)


loss.small <- readRDS("loss.small.rds")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

instead of adding another data set to animint2 (which is already almost too big for CRAN), would it be possible to compute these data? if not, a new data set is OK, but please save it as data/something.RData that we can access via data(something,package="animint2")

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is possible to compute these data instead of storing it. My first attempt at computing the data increases the runtime and memory of the test. Currently it takes 1.3 Gb and 600 seconds on my machine. There may be ways to reduce that by optimizing how loss.small and nb are computed. I will spend some time investigating it:
https://github.com/tdhock/changepoint-data-structure/

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would be great to use a simple/small version of the data (instead of the full/large data), as long as it captures the essence of the issue

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure how many clickSelects/showSelected values are there, but typically only 2 subsets are necessary for a test

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I rewrote the test to compute data instead of storing it. Using a small subset of data reduces the runtime. The sample function controls the size of the subset:
sample(1:.N, 5)


prop.dt <- loss.small[, {
n.selected <- sum(selected=="yes")
data.table(
n.selected,
max.loss=max(loss),
prop.selected=n.selected/.N,
models=.N)
}, by=list(profile.id, chromosome)]

set.seed(100)
(some.props <- prop.dt[
sample(1:.N, 5)][, .SD[1], by=list(models, prop.selected)])
ggplot()+
scale_x_log10(
"Number of data to segment",
breaks=c(range(some.props$models), 10, 100))+
geom_point(aes(
models, prop.selected),
shape=21,
data=some.props)

some.loss <- loss.small[some.props, on=list(profile.id, chromosome)]
some.nb <- readRDS("nb.rds")
some.segs <- some.nb[, {
fit <- jointseg::Fpsn(logratio, .N)
seg.list <- list()
for(model.i in 1:.N){
end.vec <- fit$t.est[model.i, 1:model.i]
start.vec <- c(1, end.vec[-model.i]+1)
for(seg.i in 1:model.i){
start <- start.vec[[seg.i]]
end <- end.vec[[seg.i]]
seg.list[[paste(model.i, seg.i)]] <- data.table(
start,
end,
changes=model.i-1,
segStart=position[start],
segEnd=position[end],
mean=mean(logratio[start:end]))
}
}
do.call(rbind, seg.list)
}, by=list(profile.id, chromosome)]
some.selection <- some.loss[selected=="yes", {
penaltyLearning::modelSelection(.SD, complexity="changes")
}, by=list(profile.id, chromosome)]
some.breaks <- some.segs[0 < changes, list(
megabases=(segStart[-1]+segEnd[-.N])/2e6
), by=list(profile.id, chromosome, changes)]
some.segs[, beforeStart := {
c(
segStart[1]-1,
(segStart[-1]+segEnd[-.N])/2)/1e6
}, by=list(profile.id, chromosome, changes)]
some.segs[, afterEnd := {
c(
(segStart[-1]+segEnd[-.N])/2,
segEnd[.N]+1)/1e6
}, by=list(profile.id, chromosome, changes)]

some.nb[, megabases := position/1e6]
some.nb[, pid.chr := paste0(profile.id, ".", chromosome)]
some.breaks[, pid.chr := paste0(profile.id, ".", chromosome)]
some.loss[, pid.chr := paste0(profile.id, ".", chromosome)]
some.loss[, penalty := 0]
some.loss[, min.changes := changes-0.5]
some.loss[, max.changes := changes+0.5]
some.props[, pid.chr := paste0(profile.id, ".", chromosome)]
some.selection[, pid.chr := paste0(profile.id, ".", chromosome)]
some.segs[, pid.chr := paste0(profile.id, ".", chromosome)]
some.selection[, cost.at.min.lambda := min.lambda*changes+loss]
viz <- animint(
title="Changepoint model selection",
selected=ggplot()+
theme_bw()+
ggtitle("Click to select a data set")+
scale_x_log10(
"Number of data to segment",
breaks=c(range(some.props$models), 10, 100))+
ylab("Proportion of models selected by linear penalty")+
geom_point(aes(
models, prop.selected),
shape=21,
size=4,
clickSelects="pid.chr",
alpha=0.7,
data=data.frame(some.props)),
data=ggplot()+
ggtitle("Selected data set and model")+
theme_bw()+
##theme_animint(update_axes=c("x", "y"))+
xlab("Position on chromosome (mega bases)")+
ylab("logratio (approximate DNA copy number")+
geom_segment(aes(
beforeStart, mean,
xend=afterEnd, yend=mean),
showSelected=c("pid.chr", "changes"),
color="green",
data=data.frame(some.segs))+
geom_vline(aes(
xintercept=megabases),
showSelected=c("pid.chr", "changes"),
color="green",
chunk_vars="pid.chr",
linetype="dashed",
data=data.frame(some.breaks))+
geom_point(aes(
megabases, logratio),
shape=21,
fill=NA,
showSelected="pid.chr",
data=data.frame(some.nb)),
loss=ggplot()+
ggtitle("Loss values for selected data set")+
ylab("loss")+
xlab("changes")+
theme_bw()+
theme_animint(update_axes=c("x", "y"))+
scale_size_manual(values=c(yes=3, no=4))+
geom_text(aes(
models, max.loss, label=paste0(
n.selected, "/", models,
" models selected by linear penalty")),
showSelected="pid.chr",
hjust=1,
data=data.frame(some.props))+
geom_point(aes(
changes, loss, color=selected, size=selected),
shape=21,
fill=NA,
showSelected="pid.chr",
data=data.frame(some.loss))+
geom_tallrect(aes(
xmin=min.changes, xmax=max.changes),
data=data.frame(some.loss),
alpha=0.5,
showSelected="pid.chr",
clickSelects="changes"),
lines=ggplot()+
ggtitle("Cost functions for selected data set")+
theme_bw()+
theme_animint(update_axes=c("x", "y"))+
geom_point(aes(
penalty, loss, color=selected),
fill=NA,
size=4,
showSelected="pid.chr",
data=data.frame(some.loss))+
geom_point(aes(
min.lambda, cost.at.min.lambda),
showSelected="pid.chr",
fill=NA,
data=data.frame(some.selection))+
ylab("cost(penalty) = loss + penalty*changes")+
geom_abline(aes(
slope=changes, intercept=loss, color=selected),
size=1,
showSelected="pid.chr",
chunk_vars="pid.chr",
data=data.frame(some.loss))+
geom_abline(aes(
slope=changes, intercept=loss, color=selected),
size=2,
showSelected=c("pid.chr", "changes"),
chunk_vars="pid.chr",
data=data.frame(some.loss))
)
info <- animint2HTML(viz)

test_that("5 <circle> rendered for all data sets", {
dataSets=getNodeSet(info$html, '//g[@class="geom1_point_selected"]//g//circle')
expect_equal(length(dataSets), 5)
siddhesh195 marked this conversation as resolved.
Show resolved Hide resolved
})

test_that("72 <rect> rendered for selected data set", {
nRect=getNodeSet(info$html, '//g[@class="geom7_tallrect_loss"]//g//rect')
expect_equal(length(nRect), 72)
})

Loading