STEER/TFndMonitorMan.cxx

00001 // @(#)fROOT/STEER:$Name:  $:$Id: TFndMonitorMan.cxx,v 1.19 2007/09/24 07:32:41 Diego_Faso Exp $
00002 // Author: Diego Faso <mailto:faso@to.infn.it>, 2006/06/01
00003 
00005 //                         TFndMonitorMan                          //
00006 //                                                                 //
00007 // Implementation of all methods needed in order to control the    //
00008 //  online monitoring process                                      //
00009 //                                                                 //
00010 // NOTE: all GUI methods are collected in the TFndMonitorGUI class //
00011 //                                                                 //
00013 
00014 #include "TROOT.h"
00015 #include "TApplication.h"
00016 
00017 #include "TFndRun.h"
00018 #include "TThread.h"
00019 #include "TFndMonitorMan.h"
00020 
00021 ClassImp(TFndMonitorMan)
00022 
00023 //_______________________________
00024 TFndMonitorMan::TFndMonitorMan():
00025   fStartProdThread(),fCheckProdThread(),fProducerCommand()
00026 {
00027   gROOT->Info("TFndMonitorMan::TFndMonitorMan","called");
00028   if(!fndrun) new TFndRun("MonitorGUI","Finuda");
00029 }
00030 
00031 //_______________________________
00032 TFndMonitorMan::~TFndMonitorMan(){
00033 
00034   StopCheckRunningThread();
00035 
00036   if(fStartProdThread){
00037     if(fStartProdThread->GetState() == TThread::kRunningState){
00038       Printf("killing \"Start new producer thread\"");
00039       fStartProdThread->Kill();
00040       TThread::Delete(fStartProdThread);
00041       delete fStartProdThread;
00042     }
00043     else Printf("Start new producer thread found, but not running"); 
00044   }
00045   else Printf("Start new producer thread not found");
00046 }
00047 
00048 //______________________
00049 void TFndMonitorMan::AutoRestartOnlineProd(Int_t mode){
00050   // mode value:
00051   //            0: clear and restart
00052   //            1: restart and recovery
00053 
00054   KillCurrentProducer();
00055   if(mode==0)  RemoveMapFiles();
00056   // ---
00057   if(fStartProdThread){
00058     if(fStartProdThread->GetState() == TThread::kRunningState){
00059       Printf("killing \"Start new producer thread\"");
00060       fStartProdThread->Kill();
00061       TThread::Delete(fStartProdThread);
00062       delete fStartProdThread;
00063     }
00064   }
00065   // ---
00066   fStartProdThread = new TThread("Monitor_producer",RunProdThread,(void *)this);
00067   
00068   gROOT->Info("SendProducerCommand","\"%s\"",fProducerCommand.Data());
00069   Printf("Sending command: \"%s\"",fProducerCommand.Data());
00070   fStartProdThread->Run();
00071   Printf("...done");
00072   
00073 }
00074 
00075 //______________________
00076 void TFndMonitorMan::StopCheckRunningThread(){
00077 
00078   if(fCheckProdThread){
00079     if(fCheckProdThread->GetState() == TThread::kRunningState){
00080       Printf("killing \"Check producer thread\"");
00081       fCheckProdThread->Kill();
00082       TThread::Delete(fCheckProdThread);
00083       delete fCheckProdThread;
00084       fCheckProdThread = 0;
00085     }
00086     else Printf("Start new producer thread found, but not running"); 
00087   }
00088   else Printf("Start new producer thread not found");
00089   
00090 }
00091 
00092 //______________________
00093 void* TFndMonitorMan::CheckRunningProdThread(void *arg){
00094   // the producer must always run in online mode.
00095   // This thread will check the geb2hdt process:
00096   //  - If online producer is terminated abnormally,
00097   //     it will restart automatically.
00098   //  - There is also limit to the maximum duration:
00099   //     too long processes will be restarted
00100 
00101   TThread::SetCancelOff();
00102   TThread::SetCancelDeferred();
00103 
00104   TDatime start_time;
00105   start_time.Set();
00106   UInt_t stim = start_time.Convert();
00107 
00108   UInt_t MaxDurationSec = 3600; // 1 hours
00109   //  UInt_t MaxDurationSec = 7200; // 2 hours
00110   //  UInt_t MaxDurationSec = 10800; // 3 hours
00111   //  UInt_t MaxDurationSec = 55; // test line
00112 
00113   while(1){
00114     TThread::CancelPoint();
00115 
00116     TDatime cur_time;
00117     cur_time.Set();
00118     UInt_t eltim = cur_time.Convert() - stim;
00119     if(eltim < 5){
00120       usleep(700000);
00121       continue;
00122     }    
00123 
00124     // --- check proces status
00125     enum pr_st{
00126       pr_st_unknown = 0,
00127       pr_st_running = 1,
00128       pr_st_stopped = 2,
00129       pr_st_broken = 3
00130     };
00131     pr_st pr_st_val = pr_st_unknown;
00132 
00133     Int_t gsys_res = gSystem->Exec("pgrep geb2hdt > /dev/null");
00134 
00135     Bool_t is_stopfile_present = kTRUE;
00136     if(gSystem->AccessPathName(".froot_onlstopped") ) is_stopfile_present = kFALSE;
00137     
00138     TString p_status = "unknown status";
00139     if(gsys_res == 0){
00140       pr_st_val = pr_st_running;
00141       p_status = "running";
00142     }
00143     else  if(gsys_res != 0 && is_stopfile_present){
00144       pr_st_val = pr_st_stopped;
00145       p_status = "stopped";
00146     }
00147     else if(gsys_res != 0 && !is_stopfile_present){
00148       pr_st_val = pr_st_broken;
00149       p_status = "broken";
00150     }
00151     
00152     // ---
00153     TString msg = "";
00154     msg.Form("*** CHECKING ONLINE PRODUCER: %s",p_status.Data());
00155     TString msg1 = "";
00156     msg1.Form(" (elapsed time: %u s)",eltim);
00157     if( pr_st_val != pr_st_stopped ) msg+=msg1;
00158     else{ // reset time evaluation if stopped
00159       start_time.Set();
00160       stim = start_time.Convert();            
00161     }
00162     
00163     if(pr_st_val != pr_st_running) Printf(msg);
00164     if(eltim > MaxDurationSec) Printf("*** Online Process too long: restarting it.");
00165     if(pr_st_val == pr_st_broken || eltim > MaxDurationSec){
00166       Printf("\n\n\n Restarting online producer (automatic check is running)");
00167       gSystem->Exec("date");
00168       ((TFndMonitorMan *)arg) -> AutoRestartOnlineProd(1);
00169       start_time.Set();
00170       stim = start_time.Convert();
00171     }
00172     //    
00173     usleep(10000000); // check timing
00174   }
00175 }
00176 
00177 //______________________
00178 void* TFndMonitorMan::RunProdThread(void *arg){
00179   // used only to start the new producer within
00180   // a separate thread
00181 
00182   Printf("executing command...");
00183   gSystem->Exec(((TFndMonitorMan *)arg)->GetProducerCommand());
00184   return 0;
00185 }
00186 
00187 
00188 //______________________
00189 void TFndMonitorMan::SendProducerCommand(TString appl,TString db_host,TString data_path,TString run_type,Int_t run_num,Int_t n_events){
00190 
00191   TString run_name = BuildRunName(run_type,run_num);
00192 
00193   fndrun->SetRunType(run_type);
00194   if(run_type.CompareTo("ONLM")) fndrun->SetRunNumber(run_num);
00195   else{
00196     fndrun->SetRunNumber(0);
00197     run_name = run_type;
00198   }
00199   
00200   //  (depracated macro) froot 'geb2hdt.C++("localhost","$RDT","FINU02181",100) ' > FINU02181.dat
00201   fProducerCommand.Form("%s %s %s %s %d %d > %s.dat" ,appl.Data(),
00202                         db_host.Data(),
00203                         data_path.Data(),
00204                         run_type.Data(),
00205                         run_num,
00206                         n_events,
00207                         run_name.Data());
00208 
00209   fStartProdThread = new TThread("Monitor_producer",RunProdThread,(void *)this);
00210 
00211   gROOT->Info("SendProducerCommand","\"%s\"",fProducerCommand.Data());
00212   Printf("Sending command: \"%s\"",fProducerCommand.Data());
00213   fStartProdThread->Run();
00214   Printf("...done");
00215 
00216   // --- check thread for producer running (online mode only!)
00217   if(run_type.CompareTo("ONLM") == 0){
00218     if(!fCheckProdThread) fCheckProdThread = new TThread("Monitor_controller",CheckRunningProdThread,(void *)this);
00219     if(fCheckProdThread->GetState() != TThread::kRunningState){
00220       Printf("Starting control for running producer");
00221       fCheckProdThread->Run();
00222       Printf("...done");
00223     }
00224   }
00225 }
00226 
00227 //______________________________________
00228 // void TFndMonitorMan::SendProducerCommand(TString appl,TString db_host,TString data_path,TString run_name,Int_t n_events){
00229 //   // the command line within the froot shell is:
00230 //   // .x geb2hdt.C++("fnddiskfe.lnf.infn.it","$RDT","FINU02181",100)
00231 //   //
00232 //   // from bash shell the command is:
00233 //   //  (depracated macro) froot 'geb2hdt.C++("localhost","$RDT","FINU02181",100) ' > FINU02181.dat
00234 //   //  (depracated macro) geb2hdt localhost $RDT FINU02181 100 > FINU02181.dat
00235   
00236 //   //#define GEB2HDT_MACRO // uncomment this line to use the (deprecated) geb2hdt.C macro
00237 //   //  RemoveMapFiles();
00238 
00239 //   // #ifdef GEB2HDT_MACRO
00240 //   //   fProducerCommand.Form("froot \'%s/%s.C++(\"%s\",\"%s\",\"%s\",%d) \' > %s.dat",FROOT::ExpandPathName("$FROOTSYS/mcr_called").Data(),appl.Data(),db_host.Data(),data_path.Data(),run_name.Data(),n_events,run_name.Data());
00241 //   // #else
00242 //   fProducerCommand.Form("%s %s %s %s %d > %s.dat" ,appl.Data(),db_host.Data(),data_path.Data(),run_name.Data(),n_events,run_name.Data());
00243 //   // #endif
00244   
00245 //   fStartProdThread = new TThread("Monitor_producer",RunProdThread,(void *)this);
00246 
00247 //   gROOT->Info("SendProducerCommand","\"%s\"",fProducerCommand.Data());
00248 //   Printf("Sending command: \"%s\"",fProducerCommand.Data());
00249 //   fStartProdThread->Run();
00250 //   Printf("...done");
00251 
00252 // }
00253 
00254 //_________________________________________
00255 void TFndMonitorMan::KillCurrentProducer(){
00256   // NOTE: not only current process, but all
00257   // active processes with the same name are killed
00258 
00259   Printf("Killing current process...");
00260 
00261   TString pid_kill = TString("kill -9 ");
00262 
00263   FILE *f = fopen(ExpandPathName(".froot_pid"),"r");
00264   Char_t file[200][200];
00265   if(!f){
00266     gROOT->Warning("TFndMonitorMan::KillCurrentProducer()","PID file not found");
00267     return;
00268   }
00269 
00270   TString cur_line = TString();
00271   int l=0;
00272   const Char_t *prod_pid = 0;
00273   while (fgets(file[l],200,f)) { 
00274     Char_t *line=file[l++];
00275     if(strlen(line)<2) continue;
00276     line[strlen(line)-1]='\0';
00277     cur_line = line;
00278     Int_t SpacePos = (Int_t) cur_line.Index(" ");
00279     if(!strncmp(cur_line.Data(),"prod:",SpacePos)){ 
00280       prod_pid = ((TSubString)(cur_line.SubString("",SpacePos+1))).Data(); // read content after space
00281     }
00282   }
00283   pid_kill += prod_pid;
00284   Printf(" ******* Killing current producer: sending command: \"%s\" *******",(Char_t *)(pid_kill.Data()));
00285   gSystem->Exec(pid_kill.Data());
00286   Printf("                        ******* current producer killed *******");
00287   fclose(f);
00288   //  delete f; // do not delete f
00289 
00290   //  gSystem->Exec("pkill \'geb2hdt fnddiskfe.lnf.infn.it $PWD ONLM\'");
00291 
00292 }
00293 
00294 //______________________________________________________________________________
00295 void TFndMonitorMan::KillOtherProducers(){
00296   // all active geb2hdt in background are killed
00297   // they must have been executed in thgis way:
00298   // froot -q -b geb2hdt.C
00299 
00300   gSystem->Exec("pkill froot,geb2");
00301 
00302 }
00303 
00304 //______________________________________________________________________________
00305 void TFndMonitorMan::KillYourself(){
00306   // To be completed
00307 
00308   //  HandleMenu(M_CLOSE_ALL); // close consumers (displays)
00309   cout << "Killing application" << endl;
00310   gApplication->Terminate(0);
00311 }
00312 
00313 //______________________________________________________________________________
00314 void TFndMonitorMan::RemoveMapFiles(){
00315   // used shared memories (*.map) must be removed,
00316   // otherwise next time you run the online consumer
00317   // monitor you could get old histograms, since new shared
00318   // memories are not created until the socket connection is established
00319 
00320   //  gSystem->ProcessEvents();
00321   cout << "... removing used shared memories..." << endl;
00322   gSystem->Exec("rm -f $FND_SHR/*.map");
00323   gSystem->Exec("rm ./*.map");
00324   cout << "...done." << endl;
00325 }

Generated on Tue Oct 16 15:40:48 2007 by  doxygen 1.5.2