
/*
Copyright 2007, The Regents of the University of California
All Rights Reserved

Permission to use, copy, modify and distribute any part of this
program for educational, research and non-profit purposes, without fee,
and without a written agreement is hereby granted, provided that the
above copyright notice, this paragraph and the following three paragraphs
appear in all copies.

Those desiring to incorporate this work into commercial
products or use for commercial purposes should contact the Technology
Transfer & Intellectual Property Services, University of California,
San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent@ucsd.edu.

IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
OF SUCH DAMAGE.

THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
ENHANCEMENTS, OR MODIFICATIONS.  THE UNIVERSITY OF CALIFORNIA MAKES NO
REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
*/

#include "RegularRankModel.h" 
#include "FileManagement.h"
#include "DeNovoDp.h"
#include "DeNovoSolutions.h"
#include "auxfun.h"
#include "includes.h"

// Prints the supplied message followed by the program banner, the list of
// command line options and the citations, then terminates the process with
// exit status 1. This function never returns to its caller.
//
// message - text printed on the first line (may be an empty string).
void print_help(char *message)
{
	// Usage text, emitted verbatim and in order.
	// The options -pmcsqs_only, -pmcsqs_and_prm, -train_model and
	// -initial_model are parsed by main() but are not listed here.
	static const char *const help_lines[] = {
		"\nPepNovo v2.00 - de Novo peptide sequencing.\nAll rights reserved to the Regents of the University of California.\n\n",

		"Required arguments\n",
		"-model <model name>\n",
		"-file <path to input file>  - PepNovo can analyze dta,mgf and mzXML files\n",
		"   or\n",
		"-list <path to text file listing input files>\n",
		"Optional arguments: \n",
		"------------------- \n",

		"-prm_only  - only print spectrum graph nodes with scores\n",
		"-try_multiple_pms  - attempts multiple precursor mass values (for +1,+2 Da isotopic errors)\n",

		"-fragment_tolerance < 0-0.75 > - the fragment tolerance (each model has a default setting)\n",
		"-pm_tolerance < 0-4.0 > - the precursor masss tolerance (each model has a default setting)\n",

		"-PTMs   <PTM string> - seprated  by a colons (no spaces) e.g., M+16:S+80:N+1\n",
		"-digest <NON_SPECIFIC,TRYPSIN> - default TRYPSIN\n",
		"-num_solutions < number > - default 20\n",
		"-min_length < number > - minimal number of amino acids in predicted sequence (default 5)\n",
		"-max_length < number > - maximal number of amino acids in predicted sequence (default 10)\n",
		"-model_dir  < path > - directory where model files are kept (default ./Models)\n\n",

		"\nCitations:\n",
		"----------\n",
		"- Frank, A. and Pevzner, P. \"PepNovo: De Novo Peptide Sequencing via Probabilistic Network Modeling\", Analytical Chemistry 77:964-973, 2005.\n",
		"- Frank, A., Tanner, S., Bafna, V. and Pevzner, P. \"Peptide sequence tags for fast database search in mass-spectrometry\", J. Proteome Res. 2005 Jul-Aug;4(4):1287-95.\n",
		"- Frank, A.M., Savitski, M.M., Nielsen, L.M., Zubarev, R.A., Pevzner, P.A. \"De Novo Peptide Sequencing and Identification with Precision Mass Spectrometry\", J. Proteome Res. 6:114-123, 2007.\n",

		"\nPlease send comments and bug reports to Ari Frank (arf@cs.ucsd.edu).\n\n"
	};
	const size_t num_help_lines = sizeof(help_lines) / sizeof(help_lines[0]);
	size_t line_idx;

	// caller supplied message comes first, on a line of its own
	printf("%s\n",message);

	for (line_idx=0; line_idx<num_help_lines; line_idx++)
		printf("%s",help_lines[line_idx]);

	exit(1);
}

int main(int argc, char **argv) 
{ 
	RegularRankModel model;

	int i;
	char input_file[256];
	char list_file[256];
	char model_file[256];
	char initial_model[256];
	char model_dir[256];
	char PTM_string[512];
	
	bool got_input_file=false,got_model_file=false, got_list_file=false;
	bool got_model_dir=false, got_initial_model=false, got_PTM_string = false;
	bool prm_only=false;
	bool pmcsqs_only = false;
	bool pmcsqs_and_prm = false;
	bool train_flag = false;
	bool try_multiple_pms = false;
	
	int min_length = 5;
	int max_length = 10;
	int num_solutions = 20;
	int digest_type = TRYPSIN_DIGEST;
	mass_t train_tolerance;
	float min_pmcsqs_prob = -1.0;
	mass_t fragment_tolerance = -1.0;
	mass_t pm_tolerance = -1.0;

	rand_seed(112233);



	// read command line arguments
	i=1;
	while (i<argc)
	{
		if (!strcmp(argv[i],"-file"))
		{
			if (++i == argc)
				print_help("Missing file name!");

			strcpy(input_file,argv[i]);
			got_input_file=true;
		}
		else
		if (!strcmp(argv[i],"-list"))
		{
			if (++i == argc)
				print_help("Missing list name!");

			strcpy(list_file,argv[i]);
			got_list_file=true;
		} 
		else if (!strcmp(argv[i],"-model")) 
		{
			if (++i == argc)
				print_help("Missing model name!");

			strcpy(model_file,argv[i]);
			got_model_file=true;
		}
		else if (! strcmp(argv[i],"-model_dir"))
		{
			if (++i == argc)
				print_help("Missing model dir name!");

			strcpy(model_dir,argv[i]);
			got_model_dir=true;
		}
		else if (! strcmp(argv[i],"-fragment_tolerance"))
		{
			if (++i == argc)
				print_help("Missing model dir name!");

			fragment_tolerance = atof(argv[i]);
			if (fragment_tolerance<0 || fragment_tolerance>0.75)
				print_help("Error: -fragment_toelerance should be 0-0.75\n");
		}
		else if (! strcmp(argv[i],"-pm_tolerance"))
		{
			if (++i == argc)
				print_help("Missing model dir name!");

			pm_tolerance = atof(argv[i]);
			if (pm_tolerance<0 || pm_tolerance>4)
				print_help("Error: -pm_toelerance should be 0-4.0\n");
		}
		else if  (!strcmp(argv[i],"-num_solutions"))
		{
			if (++i == argc)
				print_help("Missing number of solutions!");

			num_solutions = atoi(argv[i]);
			if (num_solutions<=0 || num_solutions> 1000)
				print_help("Error: -num_solutions should be 1-1000\n");
		}
		else if (!strcmp(argv[i],"-min_length"))
		{
			if (++i == argc)
				print_help("Missing minimum length parameter!");

			min_length = atoi(argv[i]);
			if (min_length<3 || min_length>40)
				print_help("Error: -min_length value be 3-40\n");

		}
		else if (!strcmp(argv[i],"-max_length"))
		{
			if (++i == argc)
				print_help("Missing maximum length parameter!");

			max_length = atoi(argv[i]);
			if (max_length<3 || max_length>40)
				print_help("Error: -max_length value should be 3-40\n");
		}
		else if (!strcmp(argv[i],"-digest"))
		{
			if (++i == argc)
				print_help("Missing digest type parameter : NON_SPECIFIC, TRYPSIN\n");

			if (! strcmp(argv[i],"NON_SPECIFIC"))
			{
				digest_type = NON_SPECIFIC_DIGEST;
			}
			else if (! strcmp(argv[i],"TRYPSIN"))
			{
				digest_type = TRYPSIN_DIGEST;
			}
			else
			{
				printf("Error: bad digest type: %s\n",argv[i]);
				print_help("Supported digest types: NON_SPECIFIC, TRYPSIN.");
			}
		}
		else if (! strcmp(argv[i],"-try_multiple_pms"))
		{
			try_multiple_pms = true;
		}
		else if (! strcmp(argv[i],"-prm_only"))
		{
			prm_only = true;
		}
		else if (! strcmp(argv[i],"-pmcsqs_only"))
		{
			pmcsqs_only = true;
		}
		else if ( ! strcmp(argv[i],"-pmcsqs_and_prm")) 
		{
			pmcsqs_and_prm = true;
			if (++i == argc)
				print_help("Missing minimum probability parmater after -pmsqs_and_prm !");

			min_pmcsqs_prob = -1.0;
			min_pmcsqs_prob = atof(argv[i]);
			if (min_pmcsqs_prob<0.0 || min_pmcsqs_prob>1.0)
			{
				printf("The flag -pmcsqs_and_prm should be followed by a minimal probability value [0-1.0]\n");
				exit(1);
			}

		}
		else if (! strcmp(argv[i],"-train_model"))
		{
			train_flag = true;
			if (++i == argc)
				print_help("Missing training tolerance!");

			train_tolerance = atof(argv[i]);
			if (train_tolerance<0.001 || train_tolerance>1.0)
				print_help("Error: training tolerance should be in the range 0.001 - 1.0\n");
		}
		else if (! strcmp(argv[i],"-initial_model"))
		{
			got_initial_model = true;
			if (++i == argc)
				print_help("Missing initial model name!");
			strcpy(initial_model,argv[i]);
		}
		else if (! strcmp(argv[i],"-PTMs"))
		{
			got_PTM_string = true;
			if (++i == argc)
				print_help("Missing PTM list!");
			strcpy(PTM_string,argv[i]);
		}
		else
		{
			printf("Error: Unkown command line option: %s\n\n",argv[i]);
			print_help("");
			exit(0);
		}
		i++;
	}


	if (! got_model_file) 
		print_help("Error: Missing model name!");


	if ( ! got_input_file && ! got_list_file)
		print_help("Error: missing input file (either -file or -list must be used).");

	Config *config = model.get_config();

	if (got_model_dir)
	{
		config->set_resource_dir(string(model_dir));
	}


	//////////////////////////////////////////////////////////////////
	// Model Training
	if (train_flag)
	{	
		
		if (got_initial_model)
		{
			model.read_model(initial_model);
		}
		else
			config->init_with_defaults();

		if (got_PTM_string)
			config->apply_selected_PTMs(PTM_string);

		model.set_model_name(string(model_file));
		config->set_tolerance(train_tolerance);
		config->set_digest_type(digest_type);
		
		FileManager fm; 

		if (! got_list_file)
		{
			if (got_input_file)
			{
				fm.init_from_mgf(config,input_file);
			}
			else
			{
				printf("Must supply a list of annotated spectra for training!\n");
				exit(0);
			}
		}
		else
			fm.init_from_list_file(config,list_file);


		model.full_train_model(model_file,fm,train_tolerance);

		model.write_model();
		exit(0);
	}
	
	///////////////////////////////////////////////////////////////////
	// Model initializing (running some sort of de novo, need a model)
	// 
	model.read_model(model_file);

	if (got_PTM_string)
		config->apply_selected_PTMs(PTM_string);

	if (try_multiple_pms)
		config->set_try_multiple_pms(1);

	config = model.get_config();
	config->set_digest_type(digest_type);

	if (fragment_tolerance>0)
		config->set_tolerance(fragment_tolerance);

	if (pm_tolerance>0)
		config->set_pm_tolerance(pm_tolerance);
	

	//////////////////////////////////////////////////////////////////
	// read pmc sqs models
/*	if (pmcsqs_only || config->get_need_to_estimate_pm() )
	{
		if (! (pmcsqs.read_pmc_models(config) && 
			   pmcsqs.read_sqs_models(config) ) )
		{
			cout << "Error: could not find PMC and SQF models for " << config->get_model_name() << endl;
			cout << "Cannot perform precursor mass correction and charge determiniation!" << endl;
			exit(1);
		}
	}*/

 
	///////////////////////////////////////////////////////////////////
	// Make input file list
	vector<string> list_vector;
	if (got_list_file)
	{
		read_paths_into_list(list_file, list_vector);
	}
	else
		list_vector.push_back(input_file);

	int correct_benchmark =0;
	int total_benchmark =0;

	///////////////////////////////////////////////////////////////////
	// iterate on input files
	int f;
	for (f=0; f<list_vector.size(); f++) 
	{
	
		const char *spectra_file = list_vector[f].c_str();
		FileManager fm;
		FileSet fs;
		BasicSpecReader bsr;

		///////////////////////////////////////////////
		// Quick read, get all pointers to begining of spectra
		if (get_file_extension_type(list_vector[f]) != MZXML)
		{
			fm.init_from_file(config,spectra_file);
		}
		else // reads peaks 
			fm.init_and_read_single_mzXML(config,spectra_file,f);

		fs.select_all_files(fm);

		const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
		int sc;
		for (sc=0; sc<all_ssf.size(); sc++)
		{
			static vector<QCPeak> peaks;
			SingleSpectrumFile *ssf = all_ssf[sc];
			if (peaks.size()<ssf->num_peaks)
			{
				int new_size = ssf->num_peaks*2;
				if (new_size<2500)
					new_size=2500;
				peaks.resize(new_size);
			}

			int num_peaks = bsr.read_basic_spec(config,fm,ssf,&peaks[0]);
			ssf->file_idx = f;

			// convert peak list ot a spectrum with charge (if original charge ==0)
			// the spectrum gets charge 2, but the true charge is computed from the data
		
			Spectrum s;
			s.init_from_QCPeaks(config,&peaks[0],num_peaks,ssf);

			vector<SeqPath> solutions;
			solutions.clear();

	
			if ( ssf->charge > model.get_max_score_model_charge())
			{
				ssf->print_ssf_stats(config);
				cout << "# Charge " << s.get_charge() << " not supported yet..." << endl << endl;
				continue;
			}


			// Simple de novo or PRM, no filtering needed
			if (! config->get_try_multiple_pms() && 
				! config->get_need_to_estimate_pm() &&
				! pmcsqs_only && 
				! pmcsqs_and_prm &&
				ssf->charge>0 )
			{
				
				if (prm_only)
				{
					ssf->print_ssf_stats(config);
					print_prm_graph_scores(&model,&s,s.get_org_pm_with_19(),s.get_charge());
					continue;
				}
				else
				{
					generate_denovo_solutions(&model,&s,false,s.get_org_pm_with_19(),s.get_charge(),
						num_solutions,min_length,max_length,solutions);
				}
			}

			// more complicated case, need to perform some sort of filtering
			else  
			{
				vector<mass_t> pms_with_19;
				vector<int>    charges;
				pms_with_19.clear();
				charges.clear();

				if (config->get_need_to_estimate_pm() || 
					pmcsqs_only || 
					pmcsqs_and_prm ||
					ssf->charge == 0)
				{
					mass_t	mz1,mz2;
					int		charge1,charge2;
					float	prob1,prob2;s;
					
					BasicSpectrum bs;
					bs.ssf = ssf;
					bs.peaks = &peaks[0];
					bs.num_peaks = num_peaks;

					// output m/z and prob values for the different charge states
					model.get_best_mz_charge(config,bs, 
						&mz1,&charge1,&prob1,&mz2,&charge2,&prob2);

					mass_t corr_pm_with_19   =  mz1*charge1 - (charge1-1);
					mass_t second_pm_with_19 =  -1;
					if (mz2>0 && mz2<5000)
						second_pm_with_19=mz2*charge2 - (charge2-1);

					if (pmcsqs_only)
					{
						ssf->print_ssf_stats(config);
						cout << setprecision(3) << fixed;
						cout << charge1 << "\t" << prob1 << "\t" << corr_pm_with_19 << "\t";
						if (second_pm_with_19>0)
							cout << charge2 <<  "\t" << prob2 << "\t" << second_pm_with_19 << "\t";
						cout << endl << endl;

						continue;
						
					}

					// init a spectrum s according to the pmcsqs
					// calculations

					if (pmcsqs_and_prm)
					{
						if (prob1>=min_pmcsqs_prob)
						{
							ssf->print_ssf_stats(config);
							print_prm_graph_scores(&model,&s,corr_pm_with_19,charge1);
						}
						continue;
					}
					
				
				
					if (prm_only)
					{
						ssf->print_ssf_stats(config);
						print_prm_graph_scores(&model,&s,corr_pm_with_19,charge1);
						continue;
					}
				
					// calculate the optimal pm_with_19
					// this will be used for the denovo sequencing

					{
						PrmGraph prm;
						bool update_org_pm_with_19 = false;
						if (s.get_charge() != charge1 || ssf->charge==0)
						{
							s.set_charge(charge1);
							update_org_pm_with_19 = true;
						}

						model.init_model_for_scoring_spectrum(&s);
						corr_pm_with_19 = prm.find_optimal_pm_with_19_for_graph(&model,
							&s,corr_pm_with_19,s.get_charge());

						if (update_org_pm_with_19)
							s.set_org_pm_with_19(corr_pm_with_19);
					}

					pms_with_19.push_back(corr_pm_with_19);
					charges.push_back(charge1);

					if (config->get_try_multiple_pms())
					{
						pms_with_19.push_back(second_pm_with_19);
						charges.push_back(charge2);
					}
				}

				
				// use skips of +-1
				if (config->get_try_multiple_pms())
				{
					if (config->get_pm_tolerance()<0.1)
					{
						int i;
						for (i=0; i<=3; i++)
						{
							pms_with_19.push_back(s.get_org_pm_with_19()-i*1.0023);
							charges.push_back(s.get_charge());
						}
					}
					else
					{
						pms_with_19.push_back(s.get_org_pm_with_19());
						charges.push_back(s.get_charge());
					}
				}

				
				if (pms_with_19.size()==0)
				{
					pms_with_19.push_back(s.get_org_pm_with_19());
					charges.push_back(s.get_charge());
				}
			
				// by now we might have a list of several charges, M+H to examine
				// Find the pooled results
				
				generate_denovo_solutions_from_several_pms(&model,&s,
					false, // tags
					num_solutions,
					min_length,
					max_length,
					pms_with_19,
					charges,
					solutions);
			}

			////////////////////////////////////////////////////////////
			// if we are here it is only for denovo/tags
			// print results
			////////////////////////////////////////////////////////////

			
			bool had_pep = false;
			bool had_correct = false;

			ssf->print_ssf_stats(config);

			if (solutions.size() == 0)
			{
				cout << "No solutions found." << endl;
			}
			else 
			{
				cout << "#Index\tProb\tScore\tN-Gap\tC-Gap\t[M+H]\tCharge\tSequence" << endl;
				int i; 	
				for (i=0; i<solutions.size(); i++) 
				{
					mass_t c_gap=solutions[i].pm_with_19 - solutions[i].c_term_mass;
					if (c_gap<24.0)
						c_gap = 0;

					cout << setprecision(3) << fixed << i << "\t";
					cout << solutions[i].seq_prob << "\t";
					cout << solutions[i].path_score << "\t";
					cout << solutions[i].n_term_mass << "\t";
					cout << c_gap << "\t";
					cout << solutions[i].pm_with_19 << "\t";
					cout << solutions[i].charge << "\t";
					cout << solutions[i].seq_str;	

					if (ssf->peptide.get_num_aas()>2)
					{
						if (solutions[i].check_if_correct(ssf->peptide.as_string(config),config))
						{

							cout << " *";

							if (! had_correct)
							{
								correct_benchmark++;
								had_correct=true;
							}
						}
						had_pep=true;
					}
					cout << endl;
				}
			}

			if (had_pep) // for annotated spectra (benchmark)
				total_benchmark++;

			cout << endl;
		}
	}

	/////////////////////////////////////////////////////////////////
	// this part works only if the spectra are annotated (benchmark)
	/////////////////////////////////////////////////////////////////
	if (total_benchmark>0)
	{
		cout << "Correct spectra " << correct_benchmark << "/" << total_benchmark << " (" <<
			fixed << setprecision(3) << (double)correct_benchmark/(double)total_benchmark << ")" << endl;
	}

	return 0;
}



