#=============================================================================== # # Computing emulators and History Matching using dgpsi and tinydancer # Potential new central procedure of the HighTune software # Original HTune software developed during coding sprints of 2017 # Authors : HighTune team, 2017 # Adaptations for novel emulation and history matching software January 2024 # Author: Daniel Williamson # # Main changes : # * Changed emulation to dgpsi - gp as default. Capacity for dgp and as_gp's # within package # * Changed History matching approach to use tinydancer for small space sampling # Author: Daniel Williamson - Jan 2024 # #=============================================================================== # PREAMBLE, NEEDS TO HAPPEN ONCE ON INSTALL AND NOT HERE, AS WITH LOADING MOGP # UPDATE dgpsi AND tinydancer #devtools::install_github("mingdeyu/dgpsi-R") #devtools::install_github("BayesExeter/tinydancer") #Set up the conda environment for the first time. This should move to install. #Later in this script we load dgpsi and call init_py as if the software is #already installed. #MUST RUN ONCE THE FIRST TIME, AGAIN CAN HAPPEN ON INSTALL NOT HERE #library(dgpsi) #init_py(reinstall=TRUE) ############# # Najda notes # la fonction init_py de la lib dgpsi-R fait deux choses : # - installe, via conda, la lib python dgpsi (dont dpgsi-R est l'interface) et ses dépendances # - initialise l'environnement, via reticulate # dans htexplo, # l'installation a été faite via le script setup.sh, avec pip install # l'initialisation de l'environnement est fait par les lignes ci dessous : library(reticulate) pkg_env <- get("pkg.env", envir = asNamespace("dgpsi")) assign('dgpsi', reticulate::import("dgpsi"), pkg_env) assign('py_buildin', reticulate::import_builtins(), pkg_env) assign('np', reticulate::import("numpy"), pkg_env) assign('copy', reticulate::import("copy"), pkg_env) assign('py_gc', reticulate::import("gc"), pkg_env) assign('dill', reticulate::import("dill"), pkg_env) assign('base64', reticulate::import("base64"), pkg_env) pkg_env$thread_num <- pkg_env$dgpsi$get_thread() #================================================================================ # Begin script-specific code twd <- getwd() source('htune_convert.R') #=============================================================================== # default values for optional arguments #=============================================================================== WAVEN=1 # Wave number of history matching (HM) tau=0 # number of metrics than can fail in the HM cutoff=3 # cut-off on the Implausibility sample_size=1000 # size of emulated sample (Xp) (this no longer needs to be large #It can in fact be as small as sample_size_next_design) sample_size_next_design=80 # sample size for true simulations for the next wave max_sample_size=3000000 # Maximum LH size (above this limit, Xp is splitted) sub_sample_size=300000 # size for splitting Xp for Implausibility computation maxlLHS=50 # Maximum number of lLHS npixels=40 # number of pixel in x and y for implausibility matrix 15 -> 40 print_level=0 # "debug" pour activer des prints diagnostics #meanFun="fitted" # mean function type for gaussian process [fitted|constant] DEPRECIATED # if print = "debug", recommend sample_size=20, sample_size_next_design=5 #=============================================================================== # Diagnostics functions : size and time #=============================================================================== htsize <- function(object,name) { print(paste(name,", type:",typeof(object),", length:",length(object),", memory:",format(object.size(object), units = "Mb"))) } prev_time <- Sys.time() httime <- function(where) { new_time <- Sys.time() print(paste("Time elapses in ",where," : ",new_time-prev_time)) } #=============================================================================== # Reading line args #=============================================================================== args = commandArgs(trailingOnly=TRUE) if (length(args)%%2!=0) { print(paste("Bad number of arguments",length(args),"in htune_EmulatingMultiMetric.R")); q("no",1) ; } if (length(args)>0) { for (iarg in seq(1,length(args),by=2)) { if (args[iarg]=="-wave") { WAVEN=as.numeric(args[iarg+1]) } else if (args[iarg]=="-tau") {tau=as.numeric(args[iarg+1]) } else if (args[iarg]=="-cutoff") {cutoff=as.numeric(args[iarg+1]) } else if (args[iarg]=="-sample_size") {sample_size=as.numeric(args[iarg+1]) } else if (args[iarg]=="-sample_size_next_design") {sample_size_next_design=as.numeric(args[iarg+1]) } # else if (args[iarg]=="-meanFun") {meanFun=args[iarg+1] } else if (args[iarg]=="-seed") { set.seed(as.numeric(args[iarg+1])) } else { print(paste("Bad argument",args[iarg],"in htune_Emulating_Multi_Metric_Multi_LHS.R")) ; q("no",1) ; } # quit with error code 1 } } #if ((meanFun != "fitted") & (meanFun != "constant")) { # print("Bad choice for meanFun [constant|fitted]") # q("no",1) #} #ABOVE IS DEPRECATED. EMULATORS NO LONGER HAVE A meanFun. #=============================================================================== # Reading results of a series of SCM simulations #=============================================================================== print(paste("Arguments : -wave",WAVEN,"-cutoff",cutoff,"-tau",tau,sep=" ")) source("ModelParam.R") load(file=paste("WAVE",WAVEN,"/Wave",WAVEN,"_SCM.Rdata",sep="")) load(file=paste("WAVE",WAVEN,"/Wave",WAVEN,"_REF.Rdata",sep="")) if ( nmetrique == 1) { tau=0 } cands <- names(tData)[1:(nparam+1)] # Selects columns with the parameters metric <- names(tData)[(nparam+2):(nparam+1+nmetrique)] # select columns with the metrics print("History Matching for parameters: ") print(cands[1:(nparam+1)]) print(c("nmetrique,tau=",nmetrique,tau)) #FIRST ISSUE TO CONSIDER, HOW TO HANDLE TOO MANY PARAMETERS. ARD/DESIGN/AS ALL OPTIONS #=============================================================================== print(" ======== Scatter plot metrics=f(param) ======== ") #=============================================================================== # Plotting for each parameter, the metric as a function of parameter values: # directly from the SCM outputs # Organizing sub-windows as a function of the number of plots # For plotting with real parameters on axes #=============================================================================== file = paste("WAVE",WAVEN,"/Plots_Metrics.pdf",sep="") pdf(file=file) param_SCM <- DesignConvert(tData[,1:nparam]) logs=array(1:nparam) ; for ( iparam in 1:nparam ) { logs[iparam]<-"" } logs[which.logs] <- "x" if(nparam*nmetrique<2){ par(mfrow = c(1, 1), mar=c(4, 4, 1, 1)) } else if(nparam*nmetrique <=4){ par(mfrow = c(2, 2), mar=c(4, 4, 1, 1)) } else if(nparam*nmetrique<=9){ par(mfrow = c(3, 3), mar=c(4, 4, 1, 1)) } else if(nparam*nmetrique<=16){ par(mfrow = c(4, 4), mar=c(4, 4, 1, 1)) } else if(nparam*nmetrique<=25){ par(mfrow = c(5,5), mar=c(4, 4, 1, 1)) } for (j in 1:nmetrique) { print(paste("Metrique [",j,"]",metric[j]," obs : ",tObs[j]," err : ",tObsErr[j])) } for ( iparam in 1:nparam ) { for (j in 1:nmetrique) { ymin=tObs[j]-sqrt(tObsErr[j]) ymax=tObs[j]+sqrt(tObsErr[j]) plot(param_SCM[,iparam],tData[,nparam+j+1],col=2,xlab=cands[iparam],ylab=metric[j],log=logs[iparam],ylim=range(tData[,nparam+j+1],ymin,ymax)) abline(v=param.defaults[iparam],col='blue',lty=2) abline(h=c(tObs[j],ymin,ymax ), col = 'blue', lty =2, lwd=c(1,3,3)) } } dev.off() print(paste("OK0")) print(paste("Created figure file: ", file, sep="")) #=============================================================================== print(" ====== Reading emulators of previous waves ======== ") #=============================================================================== #mogp_dir <- "./mogp_emulator" #source("BuildEmulator/BuildEmulator.R") library(dgpsi) #init_py() library(tinydancer) source("tinydancer_plots.R") EMULATOR.LIST <- list() tau_vec = c() cutoff_vec = c() taufile="tau-cutoff.Rdata" #Change type of objects loaded and how # We now have an emulator per metric stored in a dgpsi bundle object for (i in 1:WAVEN) { prefix=paste("WAVE",i,"/EMULATOR_MULT_METRIC_wave",i,sep="") if(file.exists(paste(prefix,".pkl",sep=""))){ tmp <- dgpsi::read(prefix) EMULATOR.LIST[[i]] = ifelse(names(tmp)[1]=="id", tmp, unpack(tmp)) rm(tmp) print(paste("An emulator has been loaded from ",prefix,sep="")) } } if (file.exists(taufile)) { load(taufile) } if (length(EMULATOR.LIST) == WAVEN ) { # We shall overwrite the cutoff or tau for the last wave tau_vec[WAVEN]=tau cutoff_vec[WAVEN]=cutoff } else{ #=============================================================================== print(paste(" ====== Computing emulator for wave ",WAVEN," ======== ")) #=============================================================================== #DW dgpsi implementation. Trial only with gp for now and all params ################################################################# # Building emulator for the new WAVE (if not already done) ################################################################# # variables 1:nparam correspond to input parameters #======================================================== # First fitting metrics (columns nparam+2:nparam+1+nmetrique of tData from file SCM.Rdata) #======================================================== #=============================================================================== print(" ============== Building emulators ============== ") #=============================================================================== tData <- as.matrix(tData) XX <- tData[,1:nparam,drop=F] YY <- tData[,-c(1:(nparam+1)),drop=F] vecchia_setting <- ifelse(nrow(XX)>400, TRUE, FALSE) gplist <- lapply(1:nmetrique, function(ii) { gp_ii <- gp(XX, YY[,ii,drop=F], nugget_est=T, vecchia=vecchia_setting, verb=F) if((nparam>12) || (nparam>floor(nrow(XX)/10))){#temporary ARD limit on emulator dims max_num <- min(c(12, floor(nrow(XX)/10))) least_active <- which(order(gp_ii$specs$lengthscales) > max_num) gp_ii <- gp(XX[,-least_active], YY[,ii,drop=F], nugget_est=T, vecchia=vecchia_setting, verb=F) } return(gp_ii) }) gplist <- lapply(gplist, function(e) validate(e)) file = paste("WAVE",WAVEN,"/Plots_LOO.pdf",sep="") pdf(file=file) #DIAGNOSTICS : verifying that the original SCMdata are well reproduced when eliminating the point from the emulator for (i in 1:nmetrique) { print(paste(" ====== Plotting LOO for metrique ", i, " ====== ")) if (!is.null(gplist[[i]])) { print(plot(gplist[[i]])) dev.flush() } else { print(paste("Warning: gplist[[", i, "]] is NULL, skipping plot.")) } print(paste("RMSE: ", gplist[[i]]$loo$rmse, "\t mean (emulator std-deviation for each prediction)", mean(gplist[[i]]$loo$std), sep="")) print(paste("NRMSE: ", gplist[[i]]$loo$nrmse * 100, "%", sep="")) } dev.off() #=============================================================================== print(" ============== Saving emulators ============== ") #=============================================================================== EMULATOR.LIST[[WAVEN]] = gplist tau_vec[WAVEN] = tau #SAVE EMULATOR FOR HISTORY MATCHING cutoff_vec[WAVEN] = cutoff #for (i in 1:WAVEN) { # prefix=paste("WAVE",i,"/EMULATOR_MULT_METRIC_wave",i,sep="") # save_ExUQmogp(EMULATOR.LIST[[i]], filename=prefix) # print(paste("An emulator has been saved under: ",prefix,sep="")) #} #DW: only need to save WAVEN prefix = paste("WAVE",i,"/EMULATOR_MULT_METRIC_wave",i,sep="") to_save <- ifelse(length(gplist)==1, gplist[[1]], dgpsi::pack(gplist)) dgpsi::write(to_save, prefix) save(tau_vec, cutoff_vec, file = taufile) } #=============================================================================== #=============================================================================== print(" ============== Beginning history matching ============== ") #=============================================================================== #=============================================================================== print(paste("Generating ",sample_size,"samples varying", nparam, "input parameters")) Disc = rep(0, nmetrique) # An implausibility function. Code can be moved to a code file single_wave_implausibility <- function(NewData, Emulators, Obs, Disc, ObsErr,valmax){ if(!is.null(names(Emulators))) stop("Emulator should be a list of emulators, even if only containing 1 emulator") n_em <- length(Emulators)#number of metrics timps <- sapply(1:n_em, function(kk){ subData <- NewData[which(colnames(tData) %in% colnames(Emulators[[kk]]$data$X))] tEm <- predict(Emulators[[kk]],subData)$results abs(Obs[kk] - tEm$mean)/sqrt(tEm$var + Disc[kk] + ObsErr[kk]) }) return(sort(timps)[n_em-(valmax-1)]) } #Below is a multi-points version for rejection sampling, not used. single_wave_implausibility_manypoints <- function(NewData, Emulators, Obs, Disc, ObsErr,valmax){ if(!is.null(names(Emulators))) stop("Emulator should be a list of emulators, even if only containing 1 emulator") n_em <- length(Emulators)#number of metrics timps <- sapply(1:n_em, function(kk){ subData <- NewData[,which(colnames(tData) %in% colnames(Emulators[[kk]]$data$X)),drop=F] tEm <- predict(Emulators[[kk]],subData)$results abs(Obs[kk] - tEm$mean)/sqrt(tEm$var + Disc[kk] + ObsErr[kk]) }) return(sort(timps)[n_em-(valmax-1)]) } #A multiwave implausibility function multi_wave_imp <- function(x, targetLevel, waves){ levels <- length(targetLevel) ans <- rep(Inf, levels) waveFail <- FALSE this.level <- 1 wave.num <- 1 Timp <- NA while((this.level <= levels) & !waveFail){ Timp <- single_wave_implausibility(NewData=x, Emulators = EMULATOR.LIST[[wave.num]], Obs = tObs, Disc=Disc, ObsErr = tObsErr,valmax=tau_vec[wave.num]+1) wave.num <- wave.num + 1 if ((Timp > targetLevel[this.level]) & (wave.num <= waves)){ waveFail <- TRUE } if((!waveFail) & (wave.num > waves)){ ans[this.level:levels] <- Timp this.level <- levels + 1 } else{ ans[this.level] <- Timp this.level <- this.level + 1 } } return(ans) } # Control parameters for tinydancer to sample small spaces. I will code this up for the # reliable and slower sampler, but the subset_sim verison is as easy to implement (see example on git) control_list <- list( volume_ratio = 0.1, num_mutations = 10, num_iterations = 100, box_limits = cbind(rep(0, nparam), rep(1, nparam)), #We should change parameter conversion codes to [0,1] estimate_volume=TRUE, print_every = 500, one_per_level = TRUE ) new_ladder <- construct_temperature_ladder( implausibility = function(x, target_level){multi_wave_imp(x, target_level, WAVEN)}, dims = nparam, target_levels = cutoff_vec, control_list = control_list ) wave_samples <- sample_slice_ptmcmc( new_ladder, n_iter = sample_size, implausibility = function(x, target_level){multi_wave_imp(x, target_level, WAVEN)}, control_list = control_list ) #With one_per_level = TRUE, there is a sample for each wave in one of the chains #Find the NROY chains for each wave NROYchains <- sapply(1:WAVEN, function(kk) dplyr::first(which(lapply(new_ladder$imp_levels, function(j) j[kk]) <= cutoff_vec[kk]))+1) #Draw WAVEN implausibility matrix. Some notes: #1. Simply substitute WAVEN below with any k <= WAVEN to draw a wave k plot from the samples, #2. The meaning of the density plots has changed. The samples are Unif(NROY), so the plot is # an estimate (at each pixel) of the (marginal for nparam>2) density of NROY in the d dimensions of X. ImpData <- wave_samples$uniform_sample_list[[NROYchains[WAVEN]]][,c(1:nparam, nparam+WAVEN)] colnames(ImpData) <- c(paste("X",1:nparam,sep=""),"I1")#Replace with actual parameter names ImpList01 <- impList_tinydancer(which_vars = 1:nparam, var_names = paste("X", 1:nparam, sep=""), ImpData=ImpData, resolution = c(npixels, npixels) ) #Not sure about this param.defaults.norm=unlist(DesignantiConvert1D(param.defaults)) imp.layout01(ImpList01, VarNames = paste("X", 1:nparam, sep=""), VariableDensity=F, newPDF=TRUE, the.title = paste0("InputSpace_wave", WAVEN, ".pdf"), Points=matrix(param.defaults.norm, ncol=nparam)) mtext(paste("Remaining space after wave",WAVEN,": ", new_ladder$volume_estimate, sep="")) dev.off() print(paste("Created figure file: ", "InputSpace_wave",WAVEN,".pdf", sep="")) #=============================================================================== # Creation of Wave WAVEN+1 sample #=============================================================================== param_SCM <- DesignConvert(ImpData[,1:nparam]) UFILE=paste("param_after_wave",WAVEN,sep="") head(param_SCM) logs=array(1:nparam) ; for ( iparam in 1:nparam ) { logs[iparam]<-"linear" } logs[which.logs] <- "log" for (i in 1:nparam) { line<-paste(names(tData)[i], min(param_SCM[,i]), max(param_SCM[,i]),param.defaults[i],logs[i],sep=" ") print(line) base::write(line, file=UFILE,append=TRUE) } print('OK') print(which.logs) #design.waveNext <- sample(nrow(XpNext), samplesz, rep = F) #Replace the above as we already have uniform samples it is pointless resampling now. #As the slice sampler is an MCMC (it is already thinned), we can thin some more if we have generated # sample_size >> sample_size_next_design points. WaveNext <- param_SCM[which((1:sample_size)%%ceiling(sample_size/sample_size_next_design)==1),] # THE OLD LINE BELOW LOOKS WRONG. IT SETS THE DESIGN IN THE TRANSFORMED SPACE AND NOT IN THE MODEL SPACE # NEED DISCUSSION #WaveNext <- XpNext[design.waveNext, ] save(WaveNext,file=paste("Wave",strtoi(WAVEN, base = 0L)+1,".RData",sep="")) print(paste("Next wave design has been saved under: ","Wave",strtoi(WAVEN, base = 0L)+1,".RData",sep=""))