Skip to content
Snippets Groups Projects
Commit 7ca2971a authored by Jan Kieseler's avatar Jan Kieseler
Browse files

now working smoothly

parent 35749a48
Branches
No related tags found
No related merge requests found
......@@ -5,4 +5,8 @@ import numpy as np
if os.path.isfile('evolution.npy'):
evolution = np.load('evolution.npy').tolist()
print('EVOLUTION:',evolution)
\ No newline at end of file
for e in evolution:
for i in e:
#print without line break and round to two decimal places
print("{:.2f}".format(i), end=' ')
print()
\ No newline at end of file
import multiprocessing
from generator import Generator, Parameters, SurrogateDataset
from reconstruction import Reconstruction
from surrogate import Surrogate
......@@ -7,19 +11,26 @@ from matplotlib import pyplot as plt
import torch
import numpy as np
if __name__ == "__main__":
multiprocessing.set_start_method('spawn')
#so far these are only layer thicknesses
box = np.array([0.2, 0.2,
0.2, 0.2,
box = np.array([0.1, 0.2,
0.1, 0.2,
0.1, 0.2,
0.1, 0.2,
0.5, 1.])
# optimisation loop
parameters = np.array([0.5, 2.5,
0.5, 4.,
parameters = np.array([0.5, 3.,
3.5, 4.,
1.5, 3.,
3.5, 5.,
5.5, 10.])
gen = Generator(box,
Parameters(len(box)),
n_vars = 30,
n_events_per_var = 1000)
n_events_per_var = 200)#0)
print("Number of detector parameters: ", gen.n_parameters)
print("Number of parameters per sensor: ", gen.n_parameters_per_sensor)
......@@ -34,24 +45,24 @@ reco_model = Reconstruction(gen.n_parameters,
surrogate_model = Surrogate(gen.n_parameters,
gen.n_target_parameters + gen.n_context_parameters, #context
gen.n_target_parameters, #reco
n_time_steps=100)
n_time_steps=400)
optimizer = Optimizer(gen, surrogate_model, reco_model, parameters)
n_epochs_pre = 100
n_epochs_main = 100
n_epochs_pre = 40
n_epochs_main = 60
evolution = [parameters]
evolution = [[parameters,0.]]
#if saved evolution exists, load it
import os.path
if os.path.isfile('evolution.npy'):
if False and os.path.isfile('evolution.npy'):
evolution = np.load('evolution.npy').tolist()
print('EVOLUTION:',evolution)
parameters = evolution[-1]
def save_evolution(evolution):
np.save('evolution.npy', np.array(evolution))
np.save('evolution.npy', np.array([p[0] for p in evolution])) #only save parameters
save_evolution(evolution)
#exit()
......@@ -59,49 +70,57 @@ def make_check_plot(surr_out, reco_out, true_in, evolution ):
'''
make response plots for all three and save them, use evolution size as step index for output file name
'''
reco_resp = reco_out.detach().cpu().numpy() - true_in.detach().cpu().numpy() - 1.
surr_resp = surr_out.detach().cpu().numpy() - true_in.detach().cpu().numpy() - 1.
reco_resp = (reco_out.detach().cpu().numpy() - true_in.detach().cpu().numpy()) / true_in.detach().cpu().numpy()
surr_resp = (surr_out.detach().cpu().numpy() - true_in.detach().cpu().numpy()) / true_in.detach().cpu().numpy()
range = np.min([np.min(reco_resp), np.min(surr_resp)]), np.max([np.max(reco_resp), np.max(surr_resp)])
#range = np.min([np.min(reco_resp), np.min(surr_resp)]), np.max([np.max(reco_resp), np.max(surr_resp)])
plt.figure(figsize=(10,10))
plt.hist(reco_resp.flatten(), bins=21, range=range, alpha = 0.5, label='reco')
plt.hist(surr_resp.flatten(), bins=21, range=range, alpha = 0.5, label='surr')
plt.yscale('log')
plt.hist(reco_resp.flatten(), bins=21, range=(-2,2), alpha = 0.5, label='reco')
plt.hist(surr_resp.flatten(), bins=21, range=(-2,2), alpha = 0.5, label='surr')
plt.text(0.15, 0.8, 'step '+str(len(evolution)), transform=plt.gca().transAxes)
plt.text(0.15, 0.7, 'o_loss '+str((evolution[-1][1])), transform=plt.gca().transAxes)
plt.legend()
plt.savefig('response_'+str(len(evolution))+'.png')
plt.savefig('response_'+str(len(evolution))+'_lin.png')
plt.yscale('log')
plt.savefig('response_'+str(len(evolution))+'_log.png')
plt.close()
for i in range(100): #just an example
ds = gen.create_torch_dataset(parameters)
print("Reconstruction training")
if not i: #train a bit more the first time
reco_model.train_model(ds, batch_size=32, n_epochs= n_epochs_pre//2, lr=0.01)
reco_model.train_model(ds, batch_size=128, n_epochs= n_epochs_pre//2, lr=0.01)
reco_model.train_model(ds, batch_size=128, n_epochs= n_epochs_pre//2, lr=0.001)
reco_model.to('cuda')
reco_model.train_model(ds, batch_size=256, n_epochs= n_epochs_main, lr=0.001)
reco_result = reco_model.apply_model_in_batches(ds, batch_size=128)
reco_model.to('cpu') #free up gpu memory
#create the surrogate dataset
surrogate_dataset = SurrogateDataset(ds, reco_result.detach().cpu().numpy())#important to detach here
if not i: #train a bit more the first time
surrogate_model.train_model(surrogate_dataset, batch_size=128, n_epochs= n_epochs_pre//2, lr=0.01)
surrogate_model.train_model(surrogate_dataset, batch_size=128, n_epochs= n_epochs_pre//2, lr=0.001)
surrogate_model.train_model(surrogate_dataset, batch_size=512, n_epochs= n_epochs_pre//2, lr=0.001)
print("Surrogate model")
surrogate_model.train_model(surrogate_dataset, batch_size=512, n_epochs= n_epochs_main, lr=0.001)
print("Surrogate training")
surrogate_model.train_model(surrogate_dataset, batch_size=2048, n_epochs= n_epochs_main, lr=0.001)
## check if things worked
surr_out, reco_out, true_in = surrogate_model.apply_model_in_batches(surrogate_dataset, batch_size=1024)
make_check_plot(surr_out, reco_out, true_in, evolution)
print("Optimization")
#apply the surrogate model
updated_detector_parameters, optimal = optimizer.optimize(surrogate_dataset, batch_size=128, n_epochs=100, lr=0.01)
updated_detector_parameters, optimal, o_loss = optimizer.optimize(surrogate_dataset, batch_size=1024, n_epochs=100, lr=0.001)
print("Updated detector parameters: ", updated_detector_parameters, "; not interrupted", optimal, "; change", updated_detector_parameters - parameters)
parameters = np.abs(updated_detector_parameters)+1e-3
evolution.append(parameters)
evolution.append([parameters, o_loss])
print('EVOLUTION:',evolution)
save_evolution(evolution)
......
from G4Calo import GeometryDescriptor, G4System
import random
import sys
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
import time
from multiprocessing import Pool
import multiprocessing
import os
# make geant quiet
......@@ -79,7 +79,7 @@ class Parameters(object):
assert len(parameters) == self.N, "Wrong number of parameters!"
self.parameters = parameters
def get_descriptor(self, parameters : np.ndarray = None):
def get_descriptor(self, parameters : np.ndarray = None, desc_class = None):
'''
Returns a GeometryDescriptor from the given parameters.
Strictly takes a dict as input to ensure that the parameter names are consistent.
......@@ -89,7 +89,7 @@ class Parameters(object):
# get dict representation
parameters = self.mapper.map_to_dict(parameters) #takes a few ms, but worth it for clarity
cw = GeometryDescriptor()
cw = desc_class()
for i in range(self.N//2):
cw.addLayer(parameters['thickness_absorber_'+str(i)],"G4_Pb", False)
cw.addLayer(parameters['thickness_scintillator_'+str(i)],"G4_POLYSTYRENE",True,1)
......@@ -241,23 +241,73 @@ class Generator(object):
#create para list
para_list = [(self.random_parameters(parameters), i) for i in range(self.n_vars)]
#run self.generate_single in parallel
with Pool() as p:
dfs = p.map(self.generate_single, para_list)
#custom made process handling because of Geant4 hang-ups
processes = []
queues = []
results = []
for i in range(self.n_vars):
queue = multiprocessing.Queue()
p = multiprocessing.Process(target=self.generate_single, args=(self.random_parameters(parameters), i, queue))
time.sleep(.1)
p.start()
processes.append(p)
queues.append(queue)
done = [False for i in range(self.n_vars)]
done_after = [-1. for _ in range(self.n_vars)]
start_time = time.time()
avg_time_per_job = 1e6
while not all(done):
sth_happened = False
for i, process in enumerate(processes):
queue = queues[i]
try:
# Wait for the process to put data in the queue
result = queue.get(timeout=.1)
results.append(result)
time.sleep(0.1)
process.terminate() #just kill it because of geant being weird
done[i] = True
done_after[i] = time.time() - start_time
sth_happened = True
except multiprocessing.queues.Empty:
pass
if any(done):
avg_time_per_job = np.mean(np.array(done_after)[np.array(done_after) > 0.])
#if one job takes longer than 10 times the average, kill it
if not done[i] and time.time() - start_time > 10. * avg_time_per_job:
print('killing job', i)
process.terminate()
done[i] = True
sth_happened = True
if sth_happened:
print('done', np.sum(np.array(done)), done, done_after)
time.sleep(.1)
#dangerous but keep it for now
os.system("rm -f *_t0.pkl")#no problem with race here are not needed
os.system("rm -f *_t0.root")#no problem with race here are not needed
#concatenate the dataframes
return pd.concat(dfs)
#only select results with valid done_after time
results = [results[i] for i in range(len(results)) if done_after[i] > 0.]
return pd.concat(results)
def generate_single(self, i_in):
def generate_single(self, parameters, index, output_queue):
'''
generates a dataframe with one set of parameters.
this should probably be moved to the Parameters class, given that there are explicit mapping dependencies
'''
# sleep index ms
parameters, index = i_in
time.sleep(index/1000)
cw = self.par_desc.get_descriptor(parameters)
# import only in fork
time.sleep(index/10) #this is because the random seed in geant is based on the time in ms
from G4Calo import G4System, GeometryDescriptor
cw = self.par_desc.get_descriptor(parameters, GeometryDescriptor)
G4System.init(cw)
#make it more quiet; doesn't seem to work...
......@@ -274,15 +324,17 @@ class Generator(object):
df2 = G4System.run_batch(self.n_events_per_var, 'gamma', .1, 10) #DEBUG SET TO GAMMA FOR TESTING
df2 = df2.assign(true_pid=np.array( len(df2) * [2.11], dtype='float32')) #pid is scaled by 100
# just a dangerous workaround
del cw
import os
os.system("rm *_t0.pkl")#no race problem here; these files are not needed
# concatenate the data frames
df = pd.concat([df, df2])
tiled_pars = len(df)*[np.array(parameters) ] # np.tile(pars[np.newaxis, ...], [len(df), 1])
# add the array as columns to the dataframe
df = df.assign(detector_parameters=tiled_pars)
return df
output_queue.put(df)
print('done with generating #', index)
del G4System # this is important to force a reload and make it strictly local
return True
......@@ -304,6 +356,7 @@ class Generator(object):
# infer the lengths
df = self.generate(*args, **kwargs)
print('done with generating')
# create simple array data out of dataframe (will sit in memory, can be improved at some point)
# this is the input to the data loader
......
......@@ -34,7 +34,6 @@ class Optimizer(object):
def to(self, device):
self.device = device
self.surrogate_model.to(device)
# self.reconstruction_model.to(device) #not needed
def other_constraints(self, dataset):
# keep the size of the detector within 3m
......@@ -63,7 +62,9 @@ class Optimizer(object):
# create a dataloader for the dataset, this is the surrogate dataloader
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
# loop over the batches
mean_loss = 0
for epoch in range(n_epochs):
mean_loss = 0 #only use last epoch
for batch_idx, (_, true_inputs, true_context, reco_result) in enumerate(data_loader):
true_inputs = true_inputs.to(self.device)
true_context = true_context.to(self.device)
......@@ -79,20 +80,24 @@ class Optimizer(object):
loss.backward()
#print('gradient',self.detector_parameters.grad, 'should have', self.detector_parameters.requires_grad)
self.optimizer.step()
mean_loss += loss.item()
#check if the parameters are still local otherwise stop
if not self.generator.is_local(self.detector_parameters.detach().cpu().numpy()):
print("Parameters not local anymore. Stopping.")
self.clamp_parameters()
return self.detector_parameters.detach().cpu().numpy(), False
return self.detector_parameters.detach().cpu().numpy(), False, mean_loss / (batch_idx+1)
if batch_idx % 20 == 0:
print('current parameters: ', self.detector_parameters.cpu().numpy())
print('current parameters: ', self.detector_parameters.detach().cpu().numpy())
print('Optimizer Epoch: {} \tLoss: {:.8f}'.format(
epoch, loss.item()))
self.clamp_parameters()
return self.detector_parameters.detach().cpu().numpy(), True
mean_loss /= len(data_loader)
return self.detector_parameters.detach().cpu().numpy(), True, mean_loss
def get_optimum(self):
return self.detector_parameters.detach().cpu().numpy()
......
......@@ -181,16 +181,17 @@ class Surrogate(torch.nn.Module):
epoch, loss.item()))
self.eval()
def apply_model_in_batches(self, dataset, batch_size):
def apply_model_in_batches(self, dataset, batch_size, oversample=1):
self.to()
self.eval()
# create a dataloader for the dataset
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
# create a tensor to store the results
results = torch.zeros(len(dataset), self.n_reco_parameters)
reco = torch.zeros(len(dataset), self.n_reco_parameters)
true = torch.zeros(len(dataset), self.n_reco_parameters)
results = torch.zeros(oversample * len(dataset), self.n_reco_parameters)
reco = torch.zeros(oversample * len(dataset), self.n_reco_parameters)
true = torch.zeros(oversample * len(dataset), self.n_reco_parameters)
for i_o in range(oversample):
# loop over the batches
for batch_idx, (detector_parameters, true_inputs, true_context, reco_inputs) in enumerate(data_loader): # the reco is not needed as it is generated here
detector_parameters = detector_parameters.to(self.device)
......@@ -200,8 +201,10 @@ class Surrogate(torch.nn.Module):
# apply the model
reco_surrogate = self.sample_forward(detector_parameters, true_inputs, true_context)
# store the results
results[batch_idx * batch_size : (batch_idx + 1) * batch_size] = reco_surrogate
reco[batch_idx * batch_size : (batch_idx + 1) * batch_size] = reco_surrogate
true[batch_idx * batch_size : (batch_idx + 1) * batch_size] = reco_inputs
start_inject_index = i_o * len(dataset) + batch_idx * batch_size
end_inject_index = i_o * len(dataset) + (batch_idx + 1) * batch_size
results[start_inject_index : end_inject_index] = reco_surrogate
reco [start_inject_index : end_inject_index] = reco_inputs
true [start_inject_index : end_inject_index] = true_inputs
return results, reco, true
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment