### Author: Judith ter Schure ### E-mail: schure@cwi.nl ### Institute: CWI - Machine Learning group ### CWI is the national research institute ### for mathematics and computer science in the Netherlands ### Science Park 123, 1098 XG Amsterdam, NETHERLANDS ### Date: 16 November 2020 ### Licence: CC0 ### This code is part of the blog post: ### Ter Schure (2020) Accumulation Bias: How to handle it ALL-IN library(ggplot2) #### #### Figure 1 & 2 #### # numSim.study = number of simulated first studies # you need 1/(0.025*0.025) = 1600 first studies for each series starting with two significant studies # 40000 series, so 64 milion studies for smooth plot (takes ~2 minutes for simulation + plotting) numSim.study <- 64000000 Z1 <- rnorm(numSim.study) Z2 <- rnorm(numSim.study) Z3 <- rnorm(numSim.study) # selection based on Gold Rush accumulation bias A(3) = 1 A3 <- which((Z1 > 1.96) & (Z2 > 1.96)) numSim.3series <- length(A3) calcZmeta <- function(Zs) { t <- length(Zs) 1/sqrt(t)*sum(Zs) } # meta Zscores for a random sample of 3-study series Zmeta3 <- sapply(sample(1:numSim.study, size = numSim.3series), function(i) calcZmeta(c(Z1[i], Z2[i], Z3[i]))) # meta Zscores for a biased sample of 3-study series, biased by GoldRush A(3) = 1 Zmeta3.A3 <- sapply(A3, function(i) calcZmeta(c(Z1[i], Z2[i], Z3[i]))) dataZmeta.cond <- data.frame(Zscore = c(Zmeta3, Zmeta3.A3), GoldRush = c(rep("Zmeta3", times = numSim.3series), rep("Zmeta3.A3", times = numSim.3series))) ggplot(dataZmeta.cond) + geom_histogram(aes(x = Zscore, y = ..density.., fill = GoldRush), alpha = 0.2, bins = 120, position = "identity") + scale_fill_discrete(name = "Gold Rush A(t)", labels = c(expression(z^(3)), expression(paste("A(3) = 1 ", z^(3))))) #### #### Figure 3 #### typeIerror.pink <- mean(Zmeta3 > 1.96) typeIerror.pink typeIerror.blue <- mean(Zmeta3.A3 > 1.96) typeIerror.blue ### ### Figure 4, 5 ### A1notA2 <- which(Z1 <= 1.96) A2notA3 <- which((Z1 > 1.96) & (Z2 <= 1.96)) # meta Zscores for a biased sample of 1-study series, biased by GoldRush A(1) = 1 and A(2) = 0 # meta Zscore of a single study is its study Zscore Zmeta1.A1notA2 <- Z1[A1notA2] # meta Zscores for a biased sample of 2-study series, biased by GoldRush A(2) = 1 and A(3) = 0 Zmeta2.A2notA3 <- sapply(A2notA3, function(i) calcZmeta(c(Z1[i], Z2[i]))) dataZmeta.cond <- rbind(dataZmeta.cond, data.frame(Zscore = c(sample(Zmeta1.A1notA2, size = numSim.3series), sample(Zmeta2.A2notA3, size = numSim.3series)), GoldRush = c(rep("Zmeta1.A1notA2", times = numSim.3series), rep("Zmeta2.A2notA3", times = numSim.3series)))) dataZmeta.cond$GoldRush <- factor(dataZmeta.cond$GoldRush, levels = c("Zmeta3", "Zmeta1.A1notA2", "Zmeta2.A2notA3", "Zmeta3.A3")) ggplot(dataZmeta.cond) + geom_histogram(aes(x = Zscore, y = ..density.., fill = GoldRush), alpha = 0.2, bins = 120, position = "identity") + scale_fill_discrete(name = "Gold Rush A(t)", labels = c(expression(z^(3)), expression(paste("A(1) = 1, A(2) = 0 ", z^(1))), expression(paste("A(2) = 1, A(3) = 0 ", z^(2))), expression(paste("A(3) = 1 ", z^(3))))) ### ### Figure 6, 7 ### dataZmeta.unc <- data.frame(Zscore = c(Zmeta1.A1notA2, # almost 64 million samples, beware in histogram Zmeta2.A2notA3, Zmeta3.A3), GoldRush = c(rep("Zmeta1.A1notA2", times = length(A1notA2)), rep("Zmeta2.A2notA3", times = length(A2notA3)), rep("Zmeta3.A3", times = numSim.3series))) ggplot(dataZmeta.unc[sample(1:nrow(dataZmeta.unc), size = 3*numSim.3series), ]) + # histogram << 64 million geom_histogram(aes(x = Zscore, y = ..count.., fill = GoldRush), alpha = 0.2, bins = 120, position = "identity") + scale_fill_discrete(name = "Gold Rush A(t)", labels = c(expression(paste("A(1) = 1, A(2) = 0 ", z^(1))), expression(paste("A(2) = 1, A(3) = 0 ", z^(2))), expression(paste("A(3) = 1 ", z^(3))))) ### ### Figure 10, 11 ### numSim.study <- 64000 # we're not plotting histograms, so a smaller simulation will do Z1 <- rnorm(numSim.study) Z2 <- rnorm(numSim.study) Z3 <- rnorm(numSim.study) A1notA2 <- which(Z1 <= 1.96) A2notA3 <- which((Z1 > 1.96) & (Z2 <= 1.96)) A3 <- which((Z1 > 1.96) & (Z2 > 1.96)) calcLR <- function(Zs) { prod(dnorm(Zs, mean = 1)/dnorm(Zs, mean = 0)) } LR1.A1notA2 <- sapply(A1notA2, function(i) calcLR(Z1[i])) LR1.A2notA3 <- sapply(A2notA3, function(i) calcLR(Z1[i])) LR1.A3 <- sapply(A3, function(i) calcLR(Z1[i])) LR2.A2notA3 <- sapply(A2notA3, function(i) calcLR(c(Z1[i], Z2[i]))) LR2.A3 <- sapply(A3, function(i) calcLR(c(Z1[i], Z2[i]))) LR3.A3 <- sapply(A3, function(i) calcLR(c(Z1[i], Z2[i], Z3[i]))) dataLR.unc <- data.frame(t = c(rep(0:1, each = length(A1notA2)), rep(0:2, each = length(A2notA3)), rep(0:3, each = length(A3))), LR = c(rep(1, times = length(A1notA2)), LR1.A1notA2, rep(1, times = length(A2notA3)), LR1.A2notA3, LR2.A2notA3, rep(1, times = length(A3)), LR1.A3, LR2.A3, LR3.A3), series = c(rep(A1notA2, times = 2), rep(A2notA3, times = 3), rep(A3, times = 4)), GoldRush = c(rep("A1notA2", times = length(A1notA2)*2), rep("A2notA3", times = length(A2notA3)*3), rep("A3", times = length(A3)*4))) ggplot(dataLR.unc) + geom_line(aes(x = t, y = LR, group = series, colour = GoldRush)) + scale_color_discrete(name = "Gold Rush A(t)", labels = c(expression(paste("A(1) = 1, A(2) = 0, A(3) = 0 ", LR[10]^(t))), expression(paste("A(1) = 1, A(2) = 1, A(3) = 0 ", LR[10]^(t))), expression(paste("A(1) = 1, A(2) = 1, A(3) = 1 ", LR[10]^(t))))) ### ### Figure 12 ### typeIerrorLR <- mean(c(LR1.A1notA2, pmax(LR1.A2notA3, LR2.A2notA3), pmax(LR1.A3, LR2.A3, LR3.A3)) > 20) typeIerrorLR typeIerrorLR <- mean(c(LR1.A1notA2, LR2.A2notA3, LR3.A3) > 20) typeIerrorLR ### ### Figure 13 ### alpha <- 0.025 mu1.A1notA2 <- mean(Zmeta1.A1notA2) mu1.A1notA2 mu2.A2notA3 <- mean(Zmeta2.A2notA3) mu2.A2notA3 mu3.A3 <- mean(Zmeta3.A3) mu3.A3 mean(dataZmeta.unc$Zscore) mu1.A1notA2*(1 - alpha) + mu2.A2notA3*alpha*(1 - alpha) + mu3.A3*alpha^2