#include "QuickClustering.h"
#include "RegularRankModel.h"
#include "PMCSQS.h"
#include "auxfun.h"

#include <cstring>
#include <new>

// The sim matrix stores, one bit per (cluster, earlier cluster) pair, whether
// the similarity computed between the two was high enough to be worth
// re-testing later. cluster_full_dataset() allocates and frees the buffer;
// cluster_spec_in_file_set() carves it into byte-aligned rows.
int     num_sim_matrix_spectra = 0;   // not referenced by the code visible in this file section
unsigned char  * sim_matrix = NULL;   // first byte of the buffer (NULL when not allocated)
unsigned char  * max_sim_addr = NULL; // set to one past the last byte of the buffer

// Spectra (and singleton clusters) whose SQS probability is below this value
// are dropped when a PMCSQS scorer is supplied.
const float MIN_SQS_PROB = 0.1;



// Writes the eight bits of `byte` to cout as '0'/'1' characters, starting
// with the least-significant bit. No separator or newline is written.
void print_byte(unsigned char byte)
{
	for (int bit = 0; bit < 8; ++bit)
	{
		const bool is_set = ((byte >> bit) & 0x1) != 0;
		cout << (is_set ? '1' : '0');
	}
}


// Clears bit number `position` of the bit array that starts at `addr`.
// Bit 0 is the least-significant bit of addr[0], bit 8 that of addr[1], etc.
void mark_bit_zero(unsigned char *addr, int position)
{
	unsigned char& target = addr[position >> 3];
	const unsigned char keep_mask =
		static_cast<unsigned char>(~(1u << (position & 0x7)));

	target &= keep_mask;
}


// Sets bit number `position` of the bit array that starts at `addr`.
// Bit 0 is the least-significant bit of addr[0], bit 8 that of addr[1], etc.
void mark_bit_one(unsigned char *addr, int position)
{
	unsigned char& target = addr[position >> 3];
	const unsigned char set_mask =
		static_cast<unsigned char>(1u << (position & 0x7));

	target |= set_mask;
}


// Returns the whole int-sized cell of a similarity row that contains bit
// `position` (cell index = position / 32). The callers in this file invoke it
// when position is the last bit of its cell (position % 32 == 31) and only
// test the result against zero, i.e. "are all 32 bits of this cell clear?".
//
// The row is addressed in units of sizeof(int) bytes from row_start. Rows are
// laid out on byte boundaries (see cluster_spec_in_file_set), so the bytes are
// copied out with memcpy rather than read through a cast int pointer, which
// would be a misaligned / aliasing-violating access.
int get_matrix_32_bits(unsigned char *row_start, int position)
{
	const int cell_off = (position >> 5);
	int cell_bits = 0;

	std::memcpy(&cell_bits,
				row_start + cell_off * static_cast<int>(sizeof(int)),
				sizeof(int));

	return cell_bits;
}

// Tests bit number `position` of the bit row starting at `row_start`.
// Returns 0 when the bit is clear; otherwise returns the bit's mask value
// within its byte (1, 2, 4, ... 128), not a normalised 1.
int get_matrix_val(unsigned char *row_start, int position)
{
	const unsigned char cell = row_start[position >> 3];
	const int bit_mask = 1 << (position & 0x7);

	return cell & bit_mask;
}


/***************************************************************************
	Creates clusters from a list of files containing spectra
	(possibly different file types).

	The spectra are sorted by m/z and processed in slices. Each slice holds
	the spectra within 2 x pm_tolerance of the slice's first m/z (bounded by
	specs_per_slice and by the peak buffer); these are clustered with
	cluster_spec_in_file_set(). Spectra up to 2.5 x pm_tolerance are then
	offered to the existing clusters with
	add_additional_spectra_to_existing_clusters(). The next slice starts right
	after the last first-stage spectrum, so the additional spectra are
	considered again.

	Clusters that pass the filters are handed to a QCOutputter (the original
	notes say: mgf files in the output dir, plus a map file per cluster file
	with the indices of the member spectra). After every slice the file
	<out_dir>/<clust_name>_<batch_idx>_lastmass.txt is rewritten with the
	slice's upper m/z bound. A summary and a cluster-size histogram are
	printed at the end.

	config                 - supplies the pm tolerance and fragment tolerance.
	list_file              - list of input files, given to the FileManager.
	out_dir, clust_name,
	batch_idx              - given to the outputter; also name the lastmass file.
	specs_per_slice        - maximal number of spectra per slice; the peak
	                         buffer and sim matrix sizes are derived from it.
	min_m_over_z,
	max_m_over_z           - m/z range given to the FileManager and outputter.
	min_similarity         - forwarded to the clustering routines.
	min_cluster_size       - clusters with fewer spectra are not output.
	verbose                - print each output cluster and its similarities.
	max_small_cluster_size - forwarded to cluster_spec_in_file_set().
	k_value                - number of top peaks per 1000 Da used for similarity.
	pmcsqs                 - opaque scorer pointer forwarded to
	                         cluster_spec_in_file_set(); when non-NULL, singleton
	                         clusters with sqs < MIN_SQS_PROB are not output.
****************************************************************************/
void cluster_full_dataset(Config *config,
							  char *list_file,
							  const string& out_dir,
							  const string& clust_name,
							  int batch_idx,
							  int specs_per_slice,
							  mass_t min_m_over_z,
							  mass_t max_m_over_z,
							  float  min_similarity, 
							  int min_cluster_size,
							  bool verbose,
							  int  max_small_cluster_size,
							  int  k_value,
							  void *pmcsqs)
{
	const mass_t pm_tolerance = config->get_pm_tolerance();
	const mass_t double_pm_tolerance = pm_tolerance * 2.0;
	const mass_t additional_pm_tolerance = pm_tolerance * 2.5;
	const mass_t tolerance = (config->get_tolerance() <= 0.01) ? config->get_tolerance() :
							  config->get_tolerance() * 0.75;
	FileManager fm;
	FileSet all_spec_fs;
	vector<QCPeak> basic_peaks;      // bulk allocation

	// for cluster histograms: upper bounds of the size bins, the extra last
	// counter holds clusters larger than the last bound
	int clust_vals[]={1,2,5,10,20,50,100,200,500};
	const int  num_clust_vals = sizeof(clust_vals)/sizeof(int);
	vector<int> clust_counts;

	clust_counts.resize(num_clust_vals+1,0);

	double avg_cluster_size=0;   // accumulates the sum of sizes, divided at the end
	int num_clusters=0;
	int total_spectra_in_clusters = 0;
	int spec_idx=0;

	QCOutputter qco;
	qco.init(batch_idx,clust_name,out_dir, min_m_over_z, max_m_over_z,
			 min_similarity, min_cluster_size);

	ostringstream oss;
	oss << batch_idx;
	string batch_str = oss.str();
	string last_good_mass_name = out_dir + "/" + clust_name + "_" + batch_str + "_lastmass.txt";

	ClusterSpectrum::init_statics(config);

	ClusterSpectrum::set_num_top_peaks_per_1000_da(k_value);

	fm.init_from_list_file(config,list_file,min_m_over_z,max_m_over_z);
	all_spec_fs.select_all_files(fm,true);
	all_spec_fs.sort_according_to_m_over_z();
	const int total_spectra = all_spec_fs.get_total_spectra();
	const vector<SingleSpectrumFile *>& all_ssf = all_spec_fs.get_ssf_pointers();

	if (all_ssf.size()==0)
	{
		cout <<"Warning: no files were selected for clusering in mass range: " <<
			min_m_over_z << " - " << max_m_over_z << endl;
		return;
	}
	
	int total_spectra_read=0;
	int total_mismatched = 0;

	// set the sizes for the static arrays of peaks, sim matrix and determine
	// the maximal slice size for the clustering
	// values are scaled relative to the defaults of:
	// 4M peaks
	// Slice of 20000 spectra per iteration

	float multiplier = (float)specs_per_slice / 20000.0;

	int num_total_peaks = (int)(4000000 * multiplier); 
	int slice_size      =   specs_per_slice;
	int sim_matrix_size =   ((specs_per_slice+7) / 8);
	int sim_matrix_bytes = sim_matrix_size * sim_matrix_size * 4 + specs_per_slice;

	cout << "Need to allocate following memory: " << endl;
	cout << "peaks            " << right << num_total_peaks << endl;
	cout << "spectrum headers " << right << slice_size << endl;
	cout << "sim matrix       " << right << sim_matrix_bytes << endl;
	cout << "Using " << k_value << " peaks per 1000 Da for similarity computations." << endl;

	// the extra 20000 entries leave room for the spectrum that crosses the
	// num_total_peaks limit in the collection loops below
	basic_peaks.resize(num_total_peaks+20000);

	// plain new throws instead of returning NULL, which would make the check
	// below unreachable; the nothrow form keeps the error message reachable
	sim_matrix = new (std::nothrow) unsigned char [sim_matrix_bytes];

	if (! sim_matrix)
	{
		cout << "Error: couldn't allocate memory for sim matrix!" << endl;
		exit(1);
	}

	max_sim_addr = sim_matrix + sim_matrix_bytes;

	// process the sorted spectra slice by slice
	while (spec_idx<total_spectra)
	{
		// upper m/z bounds of the slice (named so they do not shadow the
		// max_m_over_z parameter)
		const mass_t slice_max_m_over_z = all_ssf[spec_idx]->m_over_z + double_pm_tolerance;
		const mass_t additional_m_over_z = all_ssf[spec_idx]->m_over_z + additional_pm_tolerance;
		const int start_idx = spec_idx;
		int num_used_peaks=0;
		int end_additional_idx = -1;
		bool add_additional_spectra = false; // flag if spectra from the next mass range
											 // should be added at the later stage

		int max_spec_idx = spec_idx + slice_size;
		if (max_spec_idx>total_spectra)
			max_spec_idx = total_spectra;

		// stage one: spectra that get clustered among themselves
		while (spec_idx < max_spec_idx && 
			   num_used_peaks<num_total_peaks && 
			   all_ssf[spec_idx]->m_over_z<slice_max_m_over_z)
					num_used_peaks+=all_ssf[spec_idx++]->num_peaks;

		// Always consume at least one spectrum. Otherwise (e.g. with a
		// non-positive pm tolerance or slice size) spec_idx would be reset to
		// the same value at the end of the iteration and the loop would never end.
		if (spec_idx == start_idx)
			num_used_peaks+=all_ssf[spec_idx++]->num_peaks;

		const int end_stage_one_idx = spec_idx-1;
		const int num_peaks_first_stage = num_used_peaks;

		// stage two candidates: spectra slightly beyond the slice that may
		// still join one of its clusters
		if (spec_idx < max_spec_idx)
		{
			add_additional_spectra = true;

			while (spec_idx < max_spec_idx && 
			   num_used_peaks<num_total_peaks && 
			   all_ssf[spec_idx]->m_over_z<additional_m_over_z)
					num_used_peaks+=all_ssf[spec_idx++]->num_peaks;

			end_additional_idx = spec_idx-1;
		}

		FileSet cluster_fs, additional_fs;
		cluster_fs.init_from_another_fs(all_spec_fs,start_idx,end_stage_one_idx);

		if (add_additional_spectra)
			additional_fs.init_from_another_fs(all_spec_fs,end_stage_one_idx+1,end_additional_idx);

		vector<ClusterSpectrum> clusters;

		cout << fixed << setprecision(3) << "Clustering: " << all_ssf[start_idx]->m_over_z << " - " << 
			(add_additional_spectra ? all_ssf[end_additional_idx]->m_over_z : 
					all_ssf[end_stage_one_idx]->m_over_z )  << "  (" <<
			spec_idx - start_idx  << "  spectra,  " << num_used_peaks << " peaks)" << endl;

		total_spectra_read+= cluster_spec_in_file_set(
										config, 
										fm, 
										cluster_fs, 
										tolerance,
										&basic_peaks[0], 
										clusters, 
										min_similarity, 
										max_small_cluster_size,
										k_value,
										false,
										true,
										true,
										pmcsqs); 

		// join singletons from the next half of the clustering window;
		// their peaks are stored after the peaks of the first stage
		if (add_additional_spectra && additional_fs.get_total_spectra()>0) 
		{
			int num_added =  add_additional_spectra_to_existing_clusters(config,fm,additional_fs,tolerance,
				&basic_peaks[num_peaks_first_stage], clusters, min_similarity,
				false);

			total_spectra_read += num_added;
		} 

		// output the clusters and update the statistics
		for (size_t i=0; i<clusters.size(); i++)
		{
			// clusters that were merged into another cluster
			if (clusters[i].get_tmp_cluster_idx()<0)
				continue;

			if	(clusters[i].get_num_basic_spectra()<min_cluster_size)
				continue;

			// check if sqs is high enough
			if (pmcsqs && clusters[i].get_num_basic_spectra() == 1)
			{
				if (clusters[i].get_basic_spectrum(0).ssf->sqs<MIN_SQS_PROB )
					continue;
			}

			total_spectra_in_clusters += clusters[i].get_num_basic_spectra();

			clusters[i].set_charge();
			clusters[i].set_cluster_m_over_z();
	
			qco.output_cluster_spectrum(clusters[i]);

			if (verbose)
			{
				cout << num_clusters<< " " << clusters[i].get_num_basic_spectra() << endl;
				clusters[i].print_cluster_similarities();
				cout << endl;
			}

			// the size is already known to be >= min_cluster_size here
			const int num_spec_in_cluster = clusters[i].get_num_basic_spectra();
			num_clusters++;
			avg_cluster_size += num_spec_in_cluster;

			// add counts to histogram
			int j;
			for (j=0; j<num_clust_vals; j++)
				if (num_spec_in_cluster<= clust_vals[j])
					break;
			clust_counts[j]++;

			// check for mixed clusters
			int n_mis = clusters[i].get_num_misassigned_spectra();
			total_mismatched += n_mis;
			if (n_mis>0)
				clusters[i].print_cluster_peptides();
		}
		spec_idx = end_stage_one_idx + 1; // go back to the end of stage one

		// update the file which holds the last mass clustered
		ofstream last_mass_stream(last_good_mass_name.c_str(),ios::out);
		last_mass_stream << fixed << setprecision(3) << slice_max_m_over_z << endl;
		last_mass_stream.close();
	}

	// release the sim matrix and make sure the globals do not dangle
	delete [] sim_matrix;
	sim_matrix = NULL;
	max_sim_addr = NULL;

	cout << endl << endl << "Total spectra read and clustered: " << total_spectra_read << " (" <<
		total_spectra << ")" << endl;

	// the ratios below print 0 instead of dividing by zero
	cout << "# spectra in clusters: " << total_spectra_in_clusters << " (" <<
		setprecision(3) << 
		(total_spectra>0 ? (double)total_spectra_in_clusters / (double)total_spectra : 0.0) << 
		")" << endl;

	cout << "# clusters: " << num_clusters << "  , " << "Avg cluster size: " << 
		(num_clusters>0 ? avg_cluster_size/(double)num_clusters : 0.0) << endl;

	cout << "Total mismatched spectra: " << total_mismatched << "  (" <<
		(total_spectra_read>0 ? (double)total_mismatched/total_spectra_read : 0.0) << ")" << endl;

	// cluster histogram
	cout << "Histogram of clusters: " << endl;
	cout << "max size     count" << endl;
	int i;
	for (i=0; i<num_clust_vals; i++)
	{
		cout << setw(8) << left << clust_vals[i] << clust_counts[i] << endl;
	}
	cout << ">" << setw(7) << left << clust_vals[num_clust_vals-1] <<
			clust_counts[num_clust_vals] << endl;

}





/**********************************************************************
	Creates cluster spec for the set of basic spectra.
	First reads the spectra and copies the peaks into the bulk peak allocation
	returns number of spectra actually read (does not read spectra that
	were already assigned to a cluster).

	The clustering is done in two phases. First a tight distance threshold
	is implemented, and in the second phase it is relaxed (this way the clusters
	should be more homegneous).
***********************************************************************/
int cluster_spec_in_file_set(Config *config, 
							 const FileManager& fm, 
							 FileSet& cluster_fs,
							 mass_t tolerance,
							 QCPeak *basic_peaks, 
							 vector<ClusterSpectrum>& clusters, 
							 float min_similarity,
							 int   max_small_cluster_size,
							 int   num_top_peaks_per_1000_da,
							 bool verbose,
							 bool use_remember,
							 bool use_top7_test,
							 void *pmcsqs_ptr)
{
	// set clustering similarity thresholds
	
	vector<float> similarity_vals;
	int   num_rounds;
	if (min_similarity >= 0.9)
	{
		similarity_vals.push_back(min_similarity);
		num_rounds=1;
	}
	else
	{
		similarity_vals.push_back(0.9);
		if (min_similarity>=0.8)
		{
			similarity_vals.push_back(min_similarity);
			num_rounds=2;
		}
		else
		{
			similarity_vals.push_back((min_similarity+0.9)/2.0);
			similarity_vals.push_back(min_similarity);
			num_rounds=3;
		}
	}

	const float min_similarity_thresh = (min_similarity <0.2 ? min_similarity : 0.2); // don't test similarity if a previously recored
										    // similarity between clusters is less than this value

	PMCSQS_Scorer * pmcsqs = (PMCSQS_Scorer *)pmcsqs_ptr;

	BasicSpecReader bsr;
	const int num_spectra = cluster_fs.get_total_spectra();
	const vector<SingleSpectrumFile *>& all_ssf = cluster_fs.get_ssf_pointers();
	vector<BasicSpectrum> basic_spectra;
	int i;


	// set max_small_cluster_size
	if (max_small_cluster_size<0)
		max_small_cluster_size = 10 + (int)(0.8*log(all_ssf.size()));

//	cout << "MAX SAMLL CLUSTER SIZE: " << max_small_cluster_size << endl;


	// read all the basic spectra into a central spectra repository
	int total_peaks_read=0;
	mass_t min_m_over_z = 1E7;
	mass_t max_m_over_z = 0;
	basic_spectra.reserve(num_spectra);
	for (i=0; i<num_spectra; i++)
	{
		int num_spec_peaks = bsr.read_basic_spec(config,fm,all_ssf[i],
											 basic_peaks + total_peaks_read);
		BasicSpectrum bs;
		bs.num_peaks = num_spec_peaks;
		bs.peaks = basic_peaks + total_peaks_read;
		bs.ssf = all_ssf[i];

		if (pmcsqs)
		{
			vector<PmcSqsChargeRes> res;
			float prob = pmcsqs->get_pmcsqs_results_for_spectrum(config,bs,res);
			if (prob<MIN_SQS_PROB)
				continue; // this specturm was filtered!

			// update m/z and charge state (yes it is supposed to be const...)
			SingleSpectrumFile *ssf = bs.ssf;
			int max_charge=1;
			float max_prob=0;
			int c;
			for (c=0; c<res.size(); c++)
				if (res[c].prob1>max_prob)
				{
					max_prob=res[c].prob1;
					max_charge=c;
				}
			
			ssf->charge=max_charge;
			ssf->m_over_z = res[max_charge].mz1;
			ssf->sqs = prob;
		}
		

		basic_spectra.push_back(bs);

		total_peaks_read += num_spec_peaks;

		mass_t& m_over_z = bs.ssf->m_over_z;
		if (m_over_z<min_m_over_z)
			min_m_over_z = m_over_z;
		if (m_over_z>max_m_over_z)
			max_m_over_z = m_over_z;
	}

//	cout << "Read: " << setw(8) << min_m_over_z << " - " << setw(8) << max_m_over_z << " : " <<
//						setw(6) << basic_spectra.size() << " " << total_peaks_read << endl;


	// First stage, compare the basic spectra with clusters
	// Use high similarity threshold
	// If no cluster is found, create a new clusters for the spectrum
	// the calculated simlarities are stored is the sim matrix and can be used
	// in later stages to detect the need to re test the similarity

	const float first_stage_sim = similarity_vals[0];
	
	unsigned char * start_pos = sim_matrix;

	vector<int> idx_permutations;
	idx_permutations.resize(basic_spectra.size());
	for (i=0; i<basic_spectra.size(); i++)
		idx_permutations[i]=i;

	permute_vector(idx_permutations);

	for (i=0; i<basic_spectra.size(); i++)
	{
		const int spec_idx = idx_permutations[i];
		BasicSpectrum& spec = basic_spectra[spec_idx];
		const float spec_retention_time = spec.ssf->retention_time;

		vector<int> spec_top_idxs;
		float top_x_masses[NUM_TOP_CLUSTER_PEAKS];

		set_adjusted_inten(spec.peaks,spec.num_peaks);
		select_top_peak_idxs(spec.peaks,spec.num_peaks,spec.ssf->m_over_z,
			tolerance,spec_top_idxs, top_x_masses, ClusterSpectrum::get_num_top_peaks_per_1000_da());

	//	vector<CutProb> cp;
	//	calc_cut_prob_list(spec.peaks,spec.num_peaks,spec.ssf->org_pm_with_19,
	//		spec_top_idxs,cp);

	//	int k;
	//	for (k=0; k<cp.size(); k++)
	//		cout << cp[k].mass << " " << cp[k].prob << " , ";
	//	cout << endl;

		// compare to previous clusters
		
		int j;
		for (j=0; j<clusters.size(); j++)
		{ 
			// don't look at spectra that have to far away retention time
		/*	if (use_retention &&
				retention_window>=0 && 
				spec_retention_time>=0 && 
				clusters[j].get_retention_time()>=0 &&
				fabs(clusters[j].get_retention_time() - spec_retention_time)> retention_window)
			{
				if (use_remember)
					mark_bit_zero(start_pos,j);	
				continue;
			}*/

			if (use_top7_test && ! clusters[j].find_match_in_top_masses(top_x_masses))
			{
				if (use_remember)
					mark_bit_zero(start_pos,j);
				continue;
			}

			float sim = calc_selected_dot_prod(tolerance,
				spec.peaks,spec.num_peaks, spec_top_idxs,
				clusters[j].get_peaks_pointer(),clusters[j].get_num_peaks(), 
				clusters[j].get_top_ranked_idxs(),verbose);

		//	float sim = calc_cut_prob_dot_prod(tolerance,cp,clusters[j].get_cut_prob_list());
		//	cout << "sim = " << sim << endl;

			if (use_remember)
			{
				if (sim >= min_similarity_thresh)
				{
					mark_bit_one(start_pos,j);
				}
				else
					mark_bit_zero(start_pos,j);
			}


			// add this spectrum to an existing cluster
			if (sim > first_stage_sim)
			{
				clusters[j].add_spectrum_to_cluster(spec);
				break;
			}
		}


		if (j<clusters.size())  // we added the spectrum to an existing cluster
			continue;

	
		// create new cluster from spectrum
		
		clusters.resize(clusters.size()+1);
		ClusterSpectrum& cs = clusters[clusters.size()-1];

		cs.create_new_cluster(config, spec, clusters.size()-1);
		cs.set_top_ranked_idxs(spec_top_idxs);
		cs.set_top_masses(top_x_masses);
		cs.set_sim_matrix_row_start(start_pos);



		// round off the start position to the next byte
		unsigned char *old = start_pos;
		start_pos += ((j+7) >> 3);
	}



	// second stage try joining clusters
	// first start with the joining larger clusters
	// use lower threshold

	int round;
	for (round=1; round<num_rounds; round++)
	{
		const float round_similarity = similarity_vals[round];
		const float tighter_similarity = 1.0 - (1.0 - similarity_vals[round])/2.0;

		// join larger clusters, use tighter similaarity
		for (i=clusters.size()-1; i>0; i--)
		{
			if (clusters[i].get_tmp_cluster_idx()<0)
				continue;

			unsigned char *sim_row_start = clusters[i].get_sim_matrix_row_start();
			const int num_spec_i = clusters[i].get_num_basic_spectra();

			int j;
			for (j=i-1; j>=0; j--)
			{
				if (clusters[j].get_tmp_cluster_idx()<0 ||
					clusters[j].get_num_basic_spectra() + num_spec_i <= max_small_cluster_size) 
					continue;

				// skip 32 places if the matirx is all zeros in that area
				if (use_remember && (j % 32 == 31) &&  ! get_matrix_32_bits(sim_row_start,j))
				{
					j-=32;
					continue;
				}

				if (use_remember && ! get_matrix_val(sim_row_start,j))
					continue;

					// don't look at spectra that have too far away retention time
			/*	if (use_retention &&
					retention_window>=0 && 
					clusters[i].get_retention_time()>=0 && 
					clusters[j].get_retention_time()>=0 &&
					fabs(clusters[j].get_retention_time() - clusters[i].get_retention_time())> retention_window)
				{
					if (use_remember)
						mark_bit_zero(sim_row_start,j);

					continue;
				} */

			
				float sim = calc_selected_dot_prod(tolerance,
					clusters[j].get_peaks_pointer(),clusters[j].get_num_peaks(), 
					clusters[j].get_top_ranked_idxs(),
					clusters[i].get_peaks_pointer(),clusters[i].get_num_peaks(), 
					clusters[i].get_top_ranked_idxs());

			//	float sim = calc_cut_prob_dot_prod(tolerance,
			//		clusters[i].get_cut_prob_list(),clusters[j].get_cut_prob_list());


				if (use_remember)
				{
					if (sim >= min_similarity_thresh)
					{
						mark_bit_one(sim_row_start,j);
					}
					else
						mark_bit_zero(sim_row_start,j);
				}

				if (sim > tighter_similarity)
				{
					if (clusters[j].add_cluster(clusters[i],tighter_similarity))
					{
						clusters[i].set_tmp_cluster_idx(-1);
						break;
					}
				}
			}
		} 


		// join smaller clusters, use the round similarity
		for (i=clusters.size()-1; i>0; i--)
		{
			if (clusters[i].get_tmp_cluster_idx()<0 || 
				clusters[i].get_num_basic_spectra() > max_small_cluster_size)
				continue;

			unsigned char * sim_row_start = clusters[i].get_sim_matrix_row_start();
			
			const int num_spec_i = clusters[i].get_num_basic_spectra();
			int j;
			for (j=i-1; j>=0; j--)
			{
				if (clusters[j].get_tmp_cluster_idx()<0 ||
					num_spec_i + clusters[j].get_num_basic_spectra() > max_small_cluster_size)
					continue;

				// skip 32 places if the matirx is all zeros in that area
				if (use_remember && (j % 32 == 31) &&  ! get_matrix_32_bits(sim_row_start,j))
				{
					j-=32;
					continue;
				}

				if (use_remember && ! get_matrix_val(sim_row_start,j))
					continue;

					// don't look at spectra that have too far away retention time
			/*	if (use_retention &&
					retention_window>=0 && 
					clusters[i].get_retention_time()>=0 && 
					clusters[j].get_retention_time()>=0 &&
					fabs(clusters[j].get_retention_time() - clusters[i].get_retention_time())> retention_window)
				{
					if (use_remember)
						mark_bit_zero(sim_row_start,j);

					continue;
				} */


				float sim = calc_selected_dot_prod(tolerance,
					clusters[j].get_peaks_pointer(),clusters[j].get_num_peaks(), 
					clusters[j].get_top_ranked_idxs(),
					clusters[i].get_peaks_pointer(),clusters[i].get_num_peaks(), 
					clusters[i].get_top_ranked_idxs());

			//	float sim = calc_cut_prob_dot_prod(tolerance,
			//		clusters[i].get_cut_prob_list(),clusters[j].get_cut_prob_list());

		

				if (use_remember)
				{
					if (sim >= min_similarity_thresh)
					{
						mark_bit_one(sim_row_start,j);
					}
					else
						mark_bit_zero(sim_row_start,j);
				}



				if (sim > round_similarity)
				{
					if (clusters[j].add_cluster(clusters[i],round_similarity))
					{
						clusters[i].set_tmp_cluster_idx(-1);
						break;
					}
				}
			}
		}
	} 


	if (verbose)
	{
		
	//	for (i=0; i<clusters.size(); i++)
	//		if (clusters[i].get_basic_spectra().size()>1)
	//			clusters[i].print_cluster_alignment(config,tolerance);

		for (i=0; i<clusters.size(); i++)
			if (clusters[i].get_tmp_cluster_idx() >=0 &&
				clusters[i].get_basic_spectra().size() == 1 && 
				clusters[i].get_basic_spectra()[0].ssf->peptide.get_num_aas()>0)
				cout << clusters[i].get_basic_spectra()[0].ssf->peptide.as_string(config) << endl;
	}


	
	return num_spectra;
}



/**********************************************************************
	Tries to attach the spectra of additional_fs to the clusters that
	already exist. No new clusters are created here.

	Spectra whose header has assigned_cluster >= 0 are not read. The rest
	are read into basic_peaks, visited in a permuted order, and each one is
	added to the first valid cluster (tmp cluster idx >= 0) that shares a
	top mass with it and whose similarity to it is above
	max(min_similarity, 0.875).

	Returns the number of spectra that were added to a cluster.
***********************************************************************/
int add_additional_spectra_to_existing_clusters(
							Config *config, 
							const FileManager& fm, 
							FileSet& additional_fs, 
							mass_t tolerance, 
							QCPeak *basic_peaks, 
							vector<ClusterSpectrum>& clusters, 
							float min_similarity, 
							bool verbose)
{
	// a spectrum joins a cluster only above this similarity
	const float spectrum_join_similarity = 
		(min_similarity > 0.875f) ? min_similarity : 0.875f;

	// load the spectra that were not assigned yet, packing their peaks
	// one after the other in the bulk peak buffer
	BasicSpecReader bsr;
	const int num_spectra = additional_fs.get_total_spectra();
	const vector<SingleSpectrumFile *>& all_ssf = additional_fs.get_ssf_pointers();

	vector<BasicSpectrum> basic_spectra;
	basic_spectra.reserve(num_spectra);

	int peaks_so_far = 0;
	for (int s = 0; s < num_spectra; ++s)
	{
		SingleSpectrumFile * const header = all_ssf[s];
		if (header->assigned_cluster >= 0)
			continue;

		QCPeak * const peak_dest = basic_peaks + peaks_so_far;

		BasicSpectrum loaded;
		loaded.num_peaks = bsr.read_basic_spec(config, fm, header, peak_dest);
		loaded.peaks = peak_dest;
		loaded.ssf = header;

		basic_spectra.push_back(loaded);
		peaks_so_far += loaded.num_peaks;
	}

	// random visiting order
	vector<int> visit_order(basic_spectra.size());
	for (size_t k = 0; k < visit_order.size(); ++k)
		visit_order[k] = (int)k;

	permute_vector(visit_order);

	// try to place each spectrum in one of the clusters
	int num_added = 0;
	for (size_t k = 0; k < visit_order.size(); ++k)
	{
		BasicSpectrum& spec = basic_spectra[visit_order[k]];

		float top_x_masses[NUM_TOP_CLUSTER_PEAKS];
		vector<int> spec_top_idxs;

		set_adjusted_inten(spec.peaks, spec.num_peaks);
		select_top_peak_idxs(spec.peaks, spec.num_peaks, spec.ssf->m_over_z,
			tolerance, spec_top_idxs, top_x_masses, 
			ClusterSpectrum::get_num_top_peaks_per_1000_da());

		for (size_t c = 0; c < clusters.size(); ++c)
		{
			ClusterSpectrum& candidate = clusters[c];

			// ignore merged clusters and clusters without a common top mass
			if (candidate.get_tmp_cluster_idx() < 0 ||
				! candidate.find_match_in_top_masses(top_x_masses))
				continue;

			const float sim = calc_selected_dot_prod(tolerance,
				spec.peaks, spec.num_peaks, spec_top_idxs,
				candidate.get_peaks_pointer(), candidate.get_num_peaks(), 
				candidate.get_top_ranked_idxs());

			if (sim > spectrum_join_similarity)
			{
				candidate.add_spectrum_to_cluster(spec);
				++num_added;
				break;
			}
		}
	}

	return num_added;
}

// makes the consensus cluster from the supplied spectra
void make_single_consensus_from_mgf(char *mgf_file, Config *config)
{
	BasicSpecReader bsr;
	ClusterSpectrum cs;
	QCPeak *basic_peaks=NULL;
	vector<BasicSpectrum> basic_spectra;
	
	
	if (! read_mgf_file_into_basic_spectra(config,mgf_file,basic_peaks,basic_spectra))
	{
		cout << "Error reading MGF file!" << endl;
		exit(1);
	}

	int already_assigned=0;

	if (basic_spectra.size() == 0)
	{
		cout <<endl << "I couldn't find any spectra in your input file!" << endl;
		cout << "You wake me up and make me run for this?... Give me a break..." << endl;
		cout << "How am I supposed to create a consensus spectrum if there are NO SPECTRA TO READ?!" << endl;
		exit(1);
	}


/*	int i;
	for (i=0; i<basic_spectra.size(); i++)
	{
		basic_spectra[i].print_peaks();
		cout << endl;
	} */

	cs.set_config(config);
	cs.set_basic_spectra(basic_spectra);
	cs.set_charge();
	cs.set_cluster_m_over_z();
	cs.set_title(mgf_file);


	cs.create_cluster_by_binning_basic_spectra();
	cs.write_spectrum_to_mgf(cout);
}


/*
void qc_exp()
{
	RegularRankModel model;
	Config *config;
	FileManager fm;
	FileSet fs;

	clock_t start_t,end_t;
	start_t = clock();
	
	model.read_model("LTQ_LOW_TRYP");
	config = model.get_config();
	config->apply_selected_PTMs("C+57 M+16");
	config->set_tolerances(0.5);
	config->set_pm_tolerance(2.5);
	config->set_max_number_peaks_per_local_window(15);

	config->set_need_to_normalize(1);

//	make_single_consensus_from_mgf("C:\\Work\\ClusterAnn\\examples\\xxx.mgf",config);

//	benchmark_signal(&model,"C:\\Work\\clust_exp\\BM100.mgf",100);

//	benchmark_signal_to_noise(&model,"C:\\Work\\clust_exp\\BM100.mgf",100);
	benchmark_clustering_performance(config,"C:\\Work\\clust_exp\\ShewMGF\\BM2000_ann_list.txt",
		15);

//	benchmark_signal_to_noise(&model,"C:\\Work\\clust_exp\\ann_mgf\\CoCl2_ann100.mgf",100);
//	benchmark_large_clusters(&model,"C:\\Work\\clust_exp\\ann_mgf\\CoCl2_ann100.mgf",100);
//	create_spectra_mgf_file(&model,"C:\\Work\\clust_exp\\ann_mgf\\CoCl2_ann100.mgf",100);
//	create_file_with_rt_scores(&model,"C:\\Work\\clust_exp\\ann_mgf\\CoCl2_ann100.mgf",100);
	
	exit(0);

	make_annotated_mgf_dataset_from_dat(config, 
		"C:\\Work\\clust_exp\\lists\\40ul_list.txt",
		"C:\\Work\\clust_exp\\tmp\\H293_40ul_list.txt",
		"C:\\Work\\clust_exp\\BM_results\\sing_short.txt",
		"C:\\Work\\clust_exp\\ann_mgf", "Sings");

	exit(0);

	ann_mzXML_and_create_mgf(config, "C:\\Work\\Data\\Briggs\\Annotations\\anns99.txt", 
							 "C:\\Work\\Data\\Briggs\\Annotations\\200ug_081905.txt",
							 "C:\\Work\\Data\\Briggs\\Annotations\\ann99\\",
							 "ann99",true);

	exit(0);
//	ClusterSpectrum::init_statics(config);

//	create_spectrum_clusters(config,"C:\\Work\\msms5\\lists\\anns99rts.txt",
//			"clust_out","anns99",0,5E6,0,1E6,1000);

//	create_spectrum_clusters(config,"C:\\Work\\msms5\\lists\\LC1.txt",
//			"clust_out","MG",0,5E6,0,1E6,1250);

	benchmark_similarity_to_consensus(config,
		"C:\\Work\\clust_exp\\ann_mgf\\CoCl2_ann5.mgf",5);

//	benchmark_inter_similarity_vs_outer_similarity(config,
//		"C:\\Work\\clust_exp\\ann_mgf\\CoCl2_ann5.mgf",5);
//	benchmark_similarity_measures(config,"C:\\Work\\clust_exp\\ann_mgf\\CoCl2_ann5.mgf",5);



	exit(0);

	make_annotated_mgf_dataset_from_dat(config, 
		"C:\\Work\\clust_exp\\lists\\CoCl2_nc_list.txt",
		"C:\\Work\\clust_exp\\tmp\\CoCl2_dat_list.txt",
		"C:\\Work\\clust_exp\\Results\\CoCl2_nc\\CoCl2_ann100.txt",
		"C:\\Work\\clust_exp\\ann_mgf", "CoCl2ann100");


	exit(0);

//	vector<ClusterSpectrum> clusters;
//	read_annotated_dataset_into_clusters(config,
//		"C:\\Work\\clust_exp\\ann_mgf\\CoCl2_ann5.mgf",5,clusters);

	ann_mzXML_and_create_mgf(config, "C:\\Work\\clust_exp\\LTQ_anns1.txt", 
							 "C:\\Work\\clust_exp\\lists\\CoCl2_list.txt",
							 "C:\\Work\\clust_exp\\",
							 "LTQ_train1",true);

	ann_mzXML_and_create_mgf(config, "C:\\Work\\clust_exp\\LTQ_anns2.txt", 
							 "C:\\Work\\clust_exp\\lists\\CoCl2_list.txt",
							 "C:\\Work\\clust_exp\\",
							 "LTQ_train2",true);

	ann_mzXML_and_create_mgf(config, "C:\\Work\\clust_exp\\LTQ_anns3.txt", 
							 "C:\\Work\\clust_exp\\lists\\CoCl2_list.txt",
							 "C:\\Work\\clust_exp\\",
							 "LTQ_train3",true);

//	ann_mzXML_and_create_mgf(config, "C:\\Work\\clust_exp\\ver\\27211missed_anns.txt", 
//							 "C:\\Work\\clust_exp\\lists\\CoCl2_60_clust_list.txt",
//						//	 "C:\\Work\\msms5\\lists\\one_mzxml.txt",
//							 "C:\\Work\\clust_exp\\ver",
//							 "27211missed",true);

	exit(0);

	make_annotated_mgf_dataset_from_dat(config, 
		"C:\\Work\\clust_exp\\lists\\CoCl2_nc_list.txt",
		"C:\\Work\\clust_exp\\tmp\\CoCl2_dat_list.txt",
		"C:\\Work\\clust_exp\\Results\\CoCl2_nc\\CoCl2_nc5_anns.txt",
		"C:\\Work\\clust_exp\\ann_mgf", "CoCl2ann5");
	make_annotated_mgf_dataset_from_dat(config, 
		"C:\\Work\\clust_exp\\lists\\40ul_list.txt",
		"C:\\Work\\clust_exp\\tmp\\40ul_dat_list.txt",
		"C:\\Work\\clust_exp\\Results\\H293_40ul_no_clust\\H293_40ul_no_clust_anns.txt",
		"C:\\Work\\clust_exp\\ann_mgf", "40ul_ann");

	exit(0);


	exit(0);

//	fm.init_from_list_file(config,"C:\\Work\\msms5\\lists\\aa-LTQ1.txt");

	DAT_Converter dat;
	dat.init_DAT_Converter(2000,20,262144);

	dat.convert_MZXML_to_DAT_on_the_fly(config, "C:\\Work\\msms5\\lists\\aa-LTQ1.txt",
		"C:\\Work\\Data\\Briggs\\Cluster\\LargeClusters\\","NLC");

//	dat.convert_MZXML_to_DAT(config, "C:\\Work\\msms5\\lists\\aa-LTQ1.txt",
//		"C:\\Work\\Data\\Briggs\\Cluster\\LargeClusters\\","NLD");

	end_t = clock();
	double total_time = (end_t - start_t)/(double)CLOCKS_PER_SEC;
	cout << "Total time: " << total_time << endl;
	return;
	


	






//	ann_mzXML_and_create_mgf(config, "C:\\Work\\Data\\Briggs\\Cluster\\short_idx.txt", 
//							 "C:\\Work\\msms5\\lists\\short_mzXML.txt",
//							 "C:\\Work\\Data\\Briggs\\Cluster\\LargeClusters\\",
//							  "LargeClusters",true);
	 

//	ann_mzXML_and_create_mgf(config, "C:\\Work\\Data\\Briggs\\Cluster\\anns_idx.txt", 
//							 "C:\\Work\\msms5\\lists\\all_mzXML.txt",
//							 "C:\\Work\\Data\\Briggs\\Cluster\\LargeClusters\\",
//							  "LC",true);
	 

//	ann_mzXML_and_create_mgf(config, "C:\\Work\\Data\\Briggs\\Annotations\\anns99.txt", 
//							 "C:\\Work\\Data\\Briggs\\Annotations\\anns99_mzXML_list.txt",
//							 "C:\\Work\\Data\\Briggs\\Annotations\\only_anns99\\",
//							  "only_anns99",true);

//	return;
//	DAT_Converter dat;
//	dat.init_DAT_Converter(2000,20,262144);

//	dat.convert_MZXML_to_DAT_on_the_fly(config, "C:\\Work\\msms5\\lists\\all_mzXML.txt",
//		"C:\\Work\\Data\\Briggs\\Cluster\\LargeClusters\\","NLC");


//	dat.convert_MZXML_to_DAT_only_annotated(config, "C:\\Work\\msms5\\lists\\all_mzXML.txt",
//				"C:\\Work\\Data\\Briggs\\Cluster\\anns_idx.txt","C:\\Work\\Data\\Briggs\\Cluster\\LargeClusters\\",
//				"LC");


		
//	dat.convert_MZXML_to_DAT(config,"C:\\Work\\msms5\\lists\\H293b-total-try-2nd-digest-abd.txt",
//			"C:\\Work\\msms5\\PepNovoHQ\\clust_out2","h293a_dat");

//	create_spectrum_clusters(config,"C:\\Work\\msms5\\PepNovoHQ\\clust_out2\\h293a_dat_list.txt","clust_out","h29s",0,5E6,0,1E6);

}


*/




