library(stype)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(lenses)

Weighting a stype to update the data_summary

The weight function updates the data_summary of a stype vector with a weight.

a single stype vector

# A binary stype vector (3 TRUE, 1 FALSE) and a numeric weight vector
x <- v_binary(c(TRUE, TRUE, TRUE, FALSE))
w <- c(10, 10, 1, 1)
# The unweighted data_summary contains no weighted_* entries
get_data_summary(x)
#> An object of class  "data_summary" 
#> $n
#> [1] 4
#> 
#> $has_missing
#> [1] FALSE
#> 
#> $n_nonmissing
#> [1] 4
#> 
#> $n_missing
#> [1] 0
#> 
#> $proportion_missing
#> [1] 0
#> 
#> $is_constant
#> [1] FALSE
#> 
#> $num_0
#> [1] 1
#> 
#> $num_1
#> [1] 3
#> 
#> $proportion
#> [1] 0.75
#> 
#> $variance
#> [1] 0.25

# weight() returns a new vector; its data_summary gains weighted entries
z <- weight(x, w)
class(w)
#> [1] "numeric"
# The summary of z now ends with $weighted_proportion
get_data_summary(z)
#> An object of class  "data_summary" 
#> $n
#> [1] 4
#> 
#> $has_missing
#> [1] FALSE
#> 
#> $n_nonmissing
#> [1] 4
#> 
#> $n_missing
#> [1] 0
#> 
#> $proportion_missing
#> [1] 0
#> 
#> $is_constant
#> [1] FALSE
#> 
#> $num_0
#> [1] 1
#> 
#> $num_1
#> [1] 3
#> 
#> $proportion
#> [1] 0.75
#> 
#> $variance
#> [1] 0.25
#> 
#> $weighted_proportion
#> [1] 0.9545455

However, the weights are not part of the data, thus when subsetting the vector the data_summary no longer includes the weighted summaries:

# Weights are not stored in the data, so subsetting recomputes the summary
# without the weighted entries
get_data_summary(z[1:2])
#> An object of class  "data_summary" 
#> $n
#> [1] 2
#> 
#> $has_missing
#> [1] FALSE
#> 
#> $n_nonmissing
#> [1] 2
#> 
#> $n_missing
#> [1] 0
#> 
#> $proportion_missing
#> [1] 0
#> 
#> $is_constant
#> [1] TRUE
#> 
#> $num_0
#> [1] 0
#> 
#> $num_1
#> [1] 2
#> 
#> $proportion
#> [1] 1
#> 
#> $variance
#> [1] 0

a list of stype vectors

Here, we use lenses to weight only certain variables within a tibble.

# Compose lenses: weight_l focuses the weight-role columns, index_l(1) takes
# the first such column, and vec_data_l views its underlying data.
# NOTE(review): semantics of weight_l/vec_data_l inferred from names -- confirm.
weight_var_l <- weight_l %.% lenses::index_l(1) %.% vec_data_l

# A context marking variables as study outcomes
otcm <- context(purpose = purpose(study_role = "outcome"))
# note the different ways of setting the context
dt <- tibble::tibble(
  x1 = v_binary(c(TRUE, FALSE, TRUE), context  = otcm),
  x2 = v_binary(c(FALSE, TRUE, FALSE)) %>% set(context_l, otcm),
  x3 = v_continuous(c(1.5, 1.6, 0.1), context = otcm),
  x4 = v_continuous(c(1.5, 1.6, 0.1)) %>%
    set(context_l, context(purpose = purpose(study_role = "weight"))),
)

# Last two elements of x1's data summary, before any weighting
tail(view(dt[[1]], data_summary_l), 2)
#> $proportion
#> [1] 0.6666667
#> 
#> $variance
#> [1] 0.3333333

# Just weight binary variables
# outcome_l %.% binary_l focuses only the binary outcome columns; each is
# weighted by the weight-role data viewed through weight_var_l (x4's data)
dt1 <- over_map(dt, outcome_l %.% binary_l, function(x){
  weight(x, view(dt, weight_var_l))
})
dt1
#> # A tibble: 3 x 4
#>       x1     x2     x3     x4
#>   <bnry> <bnry> <cont> <cont>
#> 1      1      0    1.5    1.5
#> 2      0      1    1.6    1.6
#> 3      1      0    0.1    0.1
# x1 (binary) now carries a weighted_proportion entry
tail(view(dt1[[1]], data_summary_l), 2)
#> $variance
#> [1] 0.3333333
#> 
#> $weighted_proportion
#> [1] 0.5
# x3 (continuous) was not focused by binary_l, so its summary is unchanged
tail(view(dt1[[3]], data_summary_l), 2)
#> $max
#> [1] 1.6
#> 
#> $qtiles
#>    1%  2.5%    5%   10%   25%   50%   75%   90%   95% 97.5%   99% 
#> 0.128 0.170 0.240 0.380 0.800 1.500 1.550 1.580 1.590 1.595 1.598

# Weight all outcomes
# Without binary_l, every outcome-role column is weighted
dt2 <- over_map(dt, outcome_l, function(x){
  weight(x, view(dt, weight_var_l))
})

# x1's summary again ends with weighted_proportion
tail(view(dt2[[1]], data_summary_l), 2)
#> $variance
#> [1] 0.3333333
#> 
#> $weighted_proportion
#> [1] 0.5
# The continuous outcome x3 now gains a weighted_mean entry
tail(view(dt2[[3]], data_summary_l), 2)
#> $qtiles
#>    1%  2.5%    5%   10%   25%   50%   75%   90%   95% 97.5%   99% 
#> 0.128 0.170 0.240 0.380 0.800 1.500 1.550 1.580 1.590 1.595 1.598 
#> 
#> $weighted_mean
#> [1] 1.50625

Estimating cumulative risk

# Two censoring-time vectors; NA indicates that censoring event was not
# observed for that unit
ctimes <- list(
   v_event_time(c(5, 6, 10, NA_integer_, 1, NA_integer_, 19),
                internal_name = "cA"),
   v_event_time(c(4, 1, 15, NA_integer_, NA_integer_, NA_integer_, 21),
                internal_name = "cB")
)

# Two outcome-time vectors, analogous to the censoring times above
otimes <- list(
  v_event_time(c(2, 6, 11, 12, NA_integer_, NA_integer_, 25),
               internal_name = "oA"),
  v_event_time(c(1, NA_integer_, 10, NA_integer_, NA_integer_, NA_integer_, 23),
               internal_name = "oB")
)

# A right-censored vector whose data_summary is extended with a cumulative
# risk estimate via the extra_descriptors hook
x1 <- v_rcensored(
  outcomes = otimes, censors = ctimes, end_time = 15,
  extra_descriptors = list(
    crisk = function(x) {
      list( riskimator::cumrisk(x, w = riskimator::product_limit) )
    } ))


# The crisk descriptor is available by name from the data summary
get_data_summary(x1, "crisk")
#> $time
#> [1]  1 10 12
#> 
#> $estimate
#> [1] 0.2 0.4 0.6
# Subsetting recomputes the crisk descriptor on the subset
get_data_summary(x1[c(3,5,6)], "crisk")
#> $time
#> [1] 10
#> 
#> $estimate
#> [1] 0.5

Cumrisk in a data pipeline

library(riskimator)

# A tibble reusing otimes/ctimes from above, plus a binary covariate
dt <- tibble(
  x1 = v_rcensored(outcomes = otimes, censors = ctimes, end_time = 15),
  x2 = v_rcensored(outcomes = otimes[[1]], end_time = 15),
  x3 = v_binary(as.logical(rbinom(7, 1, prob = 0.5)),
                context  = context(purpose = purpose(study_role = "covariate"))),
)

# Estimate per-observation survival probabilities for use as censoring
# weights: fit a Cox proportional hazards model of the event times on the
# x3 covariate, then convert the expected cumulative hazard H into a
# survival probability via exp(-H).
# NOTE(review): assumes `df` contains an `x3` column -- confirm with caller.
est_cph <- function(x, df) {
  model_formula <- as_Surv(x, censor_as_event = TRUE) ~ as_canonical(x3)
  fit <- survival::coxph(formula = model_formula, data = df)
  expected_hazard <- predict(fit, type = "expected")
  exp(-expected_hazard)
}

# Append a cumulative risk estimate ("crisk") to the data_summary of every
# right-censored column in `dt`, using est_cph to supply the weights.
weight_rcensored_outcomes <- function(dt){
  # Helper: extend a data_summary list with the crisk estimate for vector v
  append_crisk <- function(summary, v) {
    c(summary, list(crisk = cumrisk(v, w = est_cph, df = dt)))
  }
  over_map(dt, rcensored_l, function(x) {
    over(x, data_summary_l, function(ds) append_crisk(ds, x))
  })
}

# x1's summary now ends with a nested $crisk list ($time and $estimate)
dt2 <- weight_rcensored_outcomes(dt)
view(dt2$x1, data_summary_l)
#> $n
#> [1] 7
#> 
#> $has_missing
#> [1] FALSE
#> 
#> $n_nonmissing
#> [1] 7
#> 
#> $n_missing
#> [1] 0
#> 
#> $proportion_missing
#> [1] 0
#> 
#> $is_constant
#> [1] FALSE
#> 
#> $person_time
#> [1] 55
#> 
#> $n_events
#> [1] 3
#> 
#> $outcome_reasons
#> x
#>   oA   oB <NA> 
#>    1    2    4 
#> 
#> $n_censored
#> [1] 2
#> 
#> $censor_reasons
#> x
#>   cA   cB <NA> 
#>    1    1    5 
#> 
#> $eair
#> [1] 0.05454545
#> 
#> $eair_variance
#> [1] 0.006923789
#> 
#> $crisk
#> $crisk$time
#> [1]  1 10 12
#> 
#> $crisk$estimate
#> [1] 0.1656730 0.3313459 0.4970189