### Author: Judith ter Schure
### E-mail: schure@cwi.nl
### Institute: CWI - Machine Learning group
###            CWI is the national research institute 
###            for mathematics and computer science in the Netherlands
###            Science Park 123, 1098 XG Amsterdam, NETHERLANDS
### Date: 16 November 2020
### Licence: CC0

### This code is part of the blog post:
### Ter Schure (2020) Accumulation Bias: How to handle it ALL-IN


library(ggplot2)

####
#### Figure 1 & 2
####

# numSim.study = number of simulated first studies
# you need 1/(0.025*0.025) = 1600 first studies for each series starting with two significant studies
# 40000 series, so 64 milion studies for smooth plot (takes ~2 minutes for simulation + plotting)
numSim.study <- 64000000

Z1 <- rnorm(numSim.study)
Z2 <- rnorm(numSim.study)
Z3 <- rnorm(numSim.study)

# selection based on Gold Rush accumulation bias A(3) = 1
A3 <- which((Z1 > 1.96) & (Z2 > 1.96))
numSim.3series <- length(A3)

calcZmeta <- function(Zs) {
  t <- length(Zs)
  1/sqrt(t)*sum(Zs)
}

# meta Zscores for a random sample of 3-study series 
Zmeta3 <- sapply(sample(1:numSim.study, size = numSim.3series), function(i) calcZmeta(c(Z1[i], Z2[i], Z3[i])))

# meta Zscores for a biased sample of 3-study series, biased by GoldRush A(3) = 1
Zmeta3.A3 <- sapply(A3, function(i) calcZmeta(c(Z1[i], Z2[i], Z3[i])))

dataZmeta.cond <- data.frame(Zscore = c(Zmeta3, Zmeta3.A3), 
                             GoldRush = c(rep("Zmeta3", times = numSim.3series),
                                          rep("Zmeta3.A3", times = numSim.3series)))
ggplot(dataZmeta.cond) +
  geom_histogram(aes(x = Zscore, y = ..density.., fill = GoldRush),
                 alpha = 0.2, bins = 120, position = "identity") +
  scale_fill_discrete(name = "Gold Rush A(t)",
                      labels = c(expression(z^(3)),
                                 expression(paste("A(3) = 1     ", z^(3)))))

####
#### Figure 3
####

typeIerror.pink <- mean(Zmeta3 > 1.96)
typeIerror.pink
typeIerror.blue <- mean(Zmeta3.A3 > 1.96)
typeIerror.blue


###
### Figure 4, 5
###

A1notA2 <- which(Z1 <= 1.96)
A2notA3 <- which((Z1 > 1.96) & (Z2 <= 1.96))

# meta Zscores for a biased sample  of 1-study series, biased by GoldRush A(1) = 1 and A(2) = 0
# meta Zscore of a single study is its study Zscore
Zmeta1.A1notA2 <- Z1[A1notA2]

# meta Zscores for a biased sample  of 2-study series, biased by GoldRush A(2) = 1 and A(3) = 0
Zmeta2.A2notA3 <- sapply(A2notA3, function(i) calcZmeta(c(Z1[i], Z2[i])))



dataZmeta.cond <- rbind(dataZmeta.cond,
                        data.frame(Zscore = c(sample(Zmeta1.A1notA2, size = numSim.3series),
                                              sample(Zmeta2.A2notA3, size = numSim.3series)),
                                   GoldRush = c(rep("Zmeta1.A1notA2", times = numSim.3series),
                                                rep("Zmeta2.A2notA3", times = numSim.3series))))
dataZmeta.cond$GoldRush <- factor(dataZmeta.cond$GoldRush,
                                  levels = c("Zmeta3", "Zmeta1.A1notA2", "Zmeta2.A2notA3", "Zmeta3.A3"))
ggplot(dataZmeta.cond) +
  geom_histogram(aes(x = Zscore, y = ..density.., fill = GoldRush),
                 alpha = 0.2, bins = 120, position = "identity") +
  scale_fill_discrete(name = "Gold Rush A(t)",
                      labels = c(expression(z^(3)),
                                 expression(paste("A(1) = 1, A(2) = 0     ", z^(1))),
                                 expression(paste("A(2) = 1, A(3) = 0     ", z^(2))),
                                 expression(paste("A(3) = 1     ", z^(3)))))

###
### Figure 6, 7
###

dataZmeta.unc <- data.frame(Zscore = c(Zmeta1.A1notA2,  # almost 64 million samples, beware in histogram
                                       Zmeta2.A2notA3, Zmeta3.A3),
                            GoldRush = c(rep("Zmeta1.A1notA2", times = length(A1notA2)),
                                         rep("Zmeta2.A2notA3", times = length(A2notA3)),
                                         rep("Zmeta3.A3", times = numSim.3series)))


ggplot(dataZmeta.unc[sample(1:nrow(dataZmeta.unc), size = 3*numSim.3series), ]) + # histogram << 64 million
  geom_histogram(aes(x = Zscore, y = ..count.., fill = GoldRush),
                 alpha = 0.2, bins = 120, position = "identity") +
  scale_fill_discrete(name = "Gold Rush A(t)",
                      labels = c(expression(paste("A(1) = 1, A(2) = 0     ", z^(1))),
                                 expression(paste("A(2) = 1, A(3) = 0     ", z^(2))),
                                 expression(paste("A(3) = 1     ", z^(3)))))


###
### Figure 10, 11
###

numSim.study <- 64000  # we're not plotting histograms, so a smaller simulation will do

Z1 <- rnorm(numSim.study)
Z2 <- rnorm(numSim.study)
Z3 <- rnorm(numSim.study)

A1notA2 <- which(Z1 <= 1.96)
A2notA3 <- which((Z1 > 1.96) & (Z2 <= 1.96))
A3 <- which((Z1 > 1.96) & (Z2 > 1.96))

calcLR <- function(Zs) {
  prod(dnorm(Zs, mean = 1)/dnorm(Zs, mean = 0))
}

LR1.A1notA2 <- sapply(A1notA2, function(i) calcLR(Z1[i]))
LR1.A2notA3 <- sapply(A2notA3, function(i) calcLR(Z1[i]))
LR1.A3 <- sapply(A3, function(i) calcLR(Z1[i]))
LR2.A2notA3 <- sapply(A2notA3, function(i) calcLR(c(Z1[i], Z2[i])))
LR2.A3 <- sapply(A3, function(i) calcLR(c(Z1[i], Z2[i])))
LR3.A3 <- sapply(A3, function(i) calcLR(c(Z1[i], Z2[i], Z3[i])))

dataLR.unc <- data.frame(t = c(rep(0:1, each = length(A1notA2)),
                               rep(0:2, each = length(A2notA3)), 
                               rep(0:3, each = length(A3))),
                         LR = c(rep(1, times = length(A1notA2)), LR1.A1notA2,
                                rep(1, times = length(A2notA3)), LR1.A2notA3, LR2.A2notA3,
                                rep(1, times = length(A3)), LR1.A3, LR2.A3, LR3.A3),
                         series = c(rep(A1notA2, times = 2), 
                                    rep(A2notA3, times = 3),
                                    rep(A3, times = 4)),
                         GoldRush = c(rep("A1notA2", times = length(A1notA2)*2),
                                      rep("A2notA3", times = length(A2notA3)*3),
                                      rep("A3", times = length(A3)*4)))
ggplot(dataLR.unc) +
  geom_line(aes(x = t, y = LR, group = series, colour = GoldRush)) +
  scale_color_discrete(name = "Gold Rush A(t)",
                       labels = c(expression(paste("A(1) = 1, A(2) = 0, A(3) = 0     ", LR[10]^(t))),
                                  expression(paste("A(1) = 1, A(2) = 1, A(3) = 0     ", LR[10]^(t))),
                                  expression(paste("A(1) = 1, A(2) = 1, A(3) = 1     ", LR[10]^(t)))))

###
### Figure 12
###

typeIerrorLR <- mean(c(LR1.A1notA2, 
                       pmax(LR1.A2notA3, LR2.A2notA3), 
                       pmax(LR1.A3, LR2.A3, LR3.A3))
                     > 20)
typeIerrorLR

typeIerrorLR <- mean(c(LR1.A1notA2, 
                       LR2.A2notA3, 
                       LR3.A3)
                     > 20)
typeIerrorLR



###
### Figure 13
###

alpha <- 0.025

mu1.A1notA2 <- mean(Zmeta1.A1notA2)
mu1.A1notA2
mu2.A2notA3 <- mean(Zmeta2.A2notA3)
mu2.A2notA3
mu3.A3 <- mean(Zmeta3.A3)
mu3.A3

mean(dataZmeta.unc$Zscore)
mu1.A1notA2*(1 - alpha) + mu2.A2notA3*alpha*(1 - alpha) + mu3.A3*alpha^2