#ifndef __QUICKCLUSTERING_H__
#define __QUICKCLUSTERING_H__

#include "FileManagement.h"
#include "includes.h"


// Buffer sizes for XML parsing; XML_BUFFER_HALF_SIZE is exactly half of
// XML_BUFFER_SIZE. NOTE(review): units are presumably bytes -- confirm at the
// point of use.
#define XML_BUFFER_SIZE 102400
#define XML_BUFFER_HALF_SIZE 51200

// NOTE(review): LARGISH_CLUSTER_SIZE and NUM_CLUSTERS_PER_FILE are not used in
// this header; their exact meaning should be confirmed where they are used.
#define LARGISH_CLUSTER_SIZE 6
#define NUM_CLUSTERS_PER_FILE 10000
// Default value of DAT_Converter::dat_buff_size.
#define DAT_BUFF_SIZE 10000000
// Default value of DAT_Converter::mass_increment.
#define DAT_FILE_INCREMENT 25.0


// Length of the optional top_x_masses array taken by select_top_peak_idxs().
#define NUM_TOP_CLUSTER_PEAKS 5


// Values at which the cluster should be remade.
const int cluster_reset_values[] = { 2, 3, 4, 6, 9, 15, 30, 50, 100, 200 };

// Number of entries in cluster_reset_values, derived from the array itself.
const int num_cluster_reset_values =
	sizeof(cluster_reset_values) / sizeof(cluster_reset_values[0]);


// A mass together with a rank. Ordering considers the rank only.
struct MassRankPair {

	// True when this pair's rank is strictly smaller than rhs's rank;
	// the mass does not take part in the comparison.
	bool operator< (const MassRankPair& rhs) const
	{
		return (rhs.rank > rank);
	}

	mass_t mass;
	int    rank;
};


// A single spectrum peak plus the bookkeeping fields that travel with it.
struct QCPeak {
	// Default state: mass -1, all intensities 0, one occurrence,
	// max_num_occurences 0 and source_spec_idx -1.
	QCPeak()
		: mass(-1),
		  intensity(0),
		  scaled_intensity(0),
		  adjusted_inten(0),
		  num_occurences(1),
		  max_num_occurences(0),
		  source_spec_idx(-1)
	{
	}

	mass_t      mass;
	intensity_t intensity;
	intensity_t scaled_intensity;
	float       adjusted_inten;      // adjusted value of the peak intensity
	                                 // (presumably filled by set_adjusted_inten() -- confirm)
	int         num_occurences;
	int         max_num_occurences;
	int         source_spec_idx;     // -1 until assigned
};


// Lightweight view of a spectrum: raw pointers to peak arrays plus a few
// cached values. The default constructor sets every pointer to NULL, both peak
// counts and squared_adjusted_inten to -1, and signal_level to 0.
struct BasicSpectrum {
	BasicSpectrum()
		: peaks(NULL),
		  prm_peaks(NULL),
		  num_peaks(-1),
		  num_prm_peaks(-1),
		  ssf(NULL),
		  squared_adjusted_inten(-1),
		  signal_level(0)
	{
	}

	// Writes the spectrum to the given stream; seq is optional.
	void output_to_mgf(ostream& mgf, Config *config, const char *seq=NULL) const;

	// Returns the number of peaks that match the given masses.
	int get_number_of_matching_peaks(mass_t tolerance, const vector<mass_t>& masses) const;

	// Non-const: NOTE(review) presumably also stores the result in
	// signal_level -- confirm against the implementation.
	float calc_signal_level();

	// Fills iso_levels (output) using the given mass tolerance.
	void  calc_peak_isotope_levels(mass_t tolerance, vector<float>& iso_levels) const;

	// Fills indicators (output) based on the supplied iso_levels.
	void  select_strong_peak_idxs(const vector<float>& iso_levels, vector<bool>& indicators) const;

	void print_peaks() const;

	QCPeak *peaks;
	QCPeak *prm_peaks;

	int     num_peaks;
	int     num_prm_peaks;
	SingleSpectrumFile *ssf;
	float   squared_adjusted_inten;
	float   signal_level;
};





// Similar to FileSet but has minimum overhead and works with BasicSpectra
// Similar to FileSet but has minimum overhead and works with BasicSpectra.
// Keeps the most recently used MGF / mzXML / MS2 streams and DAT file as
// members, together with the index of the file each one refers to.
class BasicSpecReader {
public:

	// All stream pointers start as NULL and all "current file" indices as -1.
	// ms2_stream is initialised here as well: it used to be left
	// indeterminate, so any NULL check on it read garbage.
	BasicSpecReader() : max_peak_list_size(1000), 
						mgf_stream(NULL), current_mgf_file_idx(-1),
						mzxml_stream(NULL), current_mzxml_file_idx(-1),
						current_dat_file_idx(-1),
						ms2_stream(NULL), current_ms2_file_idx(-1) {}

	// Closes whichever C streams are still open. ms2_stream is released the
	// same way as the MGF and mzXML streams so it is no longer leaked.
	~BasicSpecReader() { if (mgf_stream) fclose(mgf_stream);
						 if (mzxml_stream) fclose(mzxml_stream);
						 if (ms2_stream) fclose(ms2_stream); }
						 

	// Reads the important info from the single spectrum
	// pretty much does what the get_next_spectrum() does with the FileSet
	// but without much of the overhead. Returns number of peaks read (after
	// joining close adjacent peaks)
	int read_basic_spec(Config *config, const FileManager& fm, 
						SingleSpectrumFile *ssf, QCPeak* peaks,
						bool override_file_idx = false);

private:

	int max_peak_list_size;

	FILE *mgf_stream;          // the current MGF file being scanned (its open stream)
	int current_mgf_file_idx;  // the file index of the current mgf that is open

	FILE *mzxml_stream;          // the current MZXML file being scanned (its open stream)
	int current_mzxml_file_idx;  // the file index of the current MZXML that is open

	ifstream dat_file;           // the current DAT file being scanned
	int current_dat_file_idx;    // the file index of the current DAT file

	FILE *ms2_stream;            // the current MS2 file being scanned (its open stream)
	int current_ms2_file_idx;    // the file index of the current MS2 that is open

	vector<QCPeak> peak_list;      // used for temporary storage of a spectrum's peak list

	// these functions just extract the peak list from the spectrum file, return the actual
	// number of peaks (after joining)
	int get_peak_list_from_DTA(const char* dta_name);
	int get_peak_list_from_MGF(FILE *mgf_stream);
	int get_peak_list_from_MZXML(FILE *mzxml_stream);
	int get_peak_list_from_DAT(ifstream& dat_file, QCPeak *peaks);
	int get_peak_list_from_MS2(FILE *ms2_stream);
};



// Plain (pointer, count) pair describing a peak array. No constructor is
// declared, so both members are uninitialised unless the caller sets them.
struct PeakListPointer {
	QCPeak *peaks;   // first peak of the list
	int num_peaks;   // number of peaks reachable through `peaks`
};

// A (mass, probability) pair. Ordering considers the mass only.
struct CutProb {
	// True when this entry's mass is strictly smaller than rhs's mass;
	// prob does not take part in the comparison.
	bool operator< (const CutProb& rhs) const
	{
		return (rhs.mass > mass);
	}

	float mass;
	float prob;
};


// Fills `indicators` (output) for the `num_peaks` entries of `peaks`; the peak
// array itself is not modified.
// NOTE(review): judging by the name and parameters, a peak is presumably marked
// when it is among the `num_peaks_per_window` strongest peaks of some window of
// width `window_size` -- confirm against the implementation.
void mark_top_peaks_with_sliding_window(const QCPeak *peaks, int num_peaks, mass_t window_size, 
					    int num_peaks_per_window, vector<bool>& indicators);

/////////////////////////////////////////////////////////////////////////////
// For mzXML parsing
// One (mass, intensity) sample, stored as two consecutive floats.
struct MassInten {
	float mass;
	float intensity;
};


// Takes `num_org_peaks` input peaks in `org_peaks` and writes to `new_peaks`.
// NOTE(review): the float arrays presumably hold interleaved (mass, intensity)
// pairs as in MassInten, and the return value is presumably the number of peaks
// written to `new_peaks` -- confirm against the implementation.
int join_and_filter_peak_list(Config *config, mass_t m_over_z, 
							  float *org_peaks, int num_org_peaks, 
							  float *new_peaks);
/////////////////////////////////////////////////////////////////////////////


/************************************************************
// Functions for the dot product similarity distance
*************************************************************/
// Fills `top_ranked_peak_idxs` (output) with indices into `peaks`.
// `top_x_masses` is optional (defaults to NULL); when supplied it must have room
// for NUM_TOP_CLUSTER_PEAKS floats. `top_peaks_per_100da` defaults to 20.
// NOTE(review): the selection rule itself lives in the implementation and is
// not visible from this declaration.
void select_top_peak_idxs(QCPeak *peaks, int num_peaks, 
						  mass_t m_over_z, mass_t tolerance, 
						  vector<int>& top_ranked_peak_idxs,  
						  float top_x_masses[NUM_TOP_CLUSTER_PEAKS]=NULL,
						  int top_peaks_per_100da = 20);


// Sets the adjusted intensity of the peaks.
// Uses 1/1+log(rank).
// NOTE(review): presumably this means 1/(1+log(rank)) -- confirm against the
// implementation.
void set_adjusted_inten(QCPeak *peaks, int num_peaks);

// Sets the adjusted intensity of the peaks that are in top_ranked_idxs.
// The spectrum is reduced to only these peaks, so the intensity given to each
// peak is the ratio I/I_total, where I_total is the total over all the peaks in
// top_ranked_idxs. Results are returned through top_ranked_peak_inten.
void set_top_ranked_peak_inten(QCPeak *peaks, int num_peaks, const vector<int>& top_ranked_idxs, 
							   vector<float>& top_ranked_peak_inten);


// Overload over all `num_peaks` peaks.
// NOTE(review): by its name, presumably returns the sum of squared
// adjusted_inten values -- confirm against the implementation.
float calc_sum_adjusted_inten_squared(const QCPeak *peaks, int num_peaks);

// Overload that additionally takes a list of peak indices.
// NOTE(review): presumably the sum is restricted to the peaks listed in
// `top_ranked_peak_idxs` -- confirm against the implementation.
float calc_sum_adjusted_inten_squared(const QCPeak *peaks, int num_peaks, 
									  const vector<int>& top_ranked_peak_idxs);




// Dot product between two peak lists: (pa, na, peak_idxs_a) describes the first
// spectrum and (pb, nb, peak_idxs_b) the second. `verbose` defaults to false.
// NOTE(review): presumably only the peaks named by the index vectors take part,
// and peaks are matched when their masses differ by at most `tolerance` --
// confirm against the implementation.
float calc_selected_dot_prod(mass_t tolerance, 
							 const QCPeak *pa, int na, const vector<int>& peak_idxs_a,
	  					     const QCPeak* pb, int nb, const vector<int>& peak_idxs_b,
							 bool verbose = false);


// NOTE(review): `list_file` is presumably the path of a file listing the
// spectra files to process -- confirm against the implementation.
void collect_dot_product_stats(char *list_file);


// NOTE(review): takes no arguments and returns nothing; by its name presumably
// an experiment driver for the dot-product code -- confirm against the
// implementation.
void dot_prod_exp();







// In-memory write buffer associated with one DAT file path.
// DAT_Converter is a friend and may access the private state directly.
class DAT_FileBuff {
	friend class DAT_Converter; 
public:

	// Starts with no buffer (all pointers NULL), flagged as "first write"
	// and "not initialized".
	DAT_FileBuff()
		: buff(NULL),
		  max_pos(NULL),
		  pos(NULL),
		  ind_first_write(1),
		  ind_was_initialized(0)
	{
	}

	~DAT_FileBuff();

	// Initializes and allocates buffer memory.
	void init(string& _path, int buff_size);

	// Copies the spectrum (header values plus the peaks in peak_buff) to the DAT file.
	void add_spec_to_DAT_file(mass_t m_over_z, int charge, int mzxml_file_idx,
							  int scan_number, float retention_time, 
							  float precursor_intensity, int num_peaks, char *peak_buff);

	// Writes buff to file.
	void flush_buff();
private:

	string path;
	char *buff;
	char *max_pos;
	char *pos;

	int  ind_first_write;      // is this the first write (if not, append)
	int  ind_was_initialized;  
};


class DAT_Converter {
public:

	DAT_Converter() : max_m_over_z((mass_t)2000.0), mass_increment((mass_t)DAT_FILE_INCREMENT),
		dat_buff_size(DAT_BUFF_SIZE), max_dat_file_idx(-1), ind_was_initialized(0),
		batch(0) {};

	void init_DAT_Converter(mass_t _max_m_over_z, mass_t _mass_increment, 
			  int dat_buff_size);

	void DAT_Converter::convert_files_to_DAT_on_the_fly(Config* config, char *file_list, 
							char * _out_dir, char * _name, int _batch, 
							mass_t min_m_over_z, mass_t max_m_over_z, int file_start_idx);

	void convert_MZXML_to_DAT(Config *config, char *file_list, 
							  char * _out_dir, char * _name, int _batch=0,
							  mass_t min_m_over_z =0.0, mass_t max_m_over_z = 10000.0,
							  int file_start_idx =0 );


	void convert_MZXML_to_DAT_on_the_fly(Config *config, char *file_list, 
							  char * _out_dir, char * _name, int _batch=0,
							  mass_t min_m_over_z =0.0, mass_t max_m_over_z = 10000.0,
							  int file_idx_start = 0);

	int convert_single_non_MZXML_file_to_DAT(Config* config, string file, 
							mass_t min_m_over_z, mass_t max_m_over_z, int file_idx);

	void convert_spectra_to_DAT(Config* config, char *file_list, 
							char * _out_dir, char * _name, int _batch, 
							mass_t min_m_over_z, mass_t max_m_over_z,
							int file_idx_start = 0);


	// creates a file with the list of DAT files
	void create_list_file() const;

private:
	string out_dir;
	string name;

	mass_t max_m_over_z;
	mass_t mass_increment;
	int    dat_buff_size;
	int    max_dat_file_idx;
	int    batch;

	int    ind_was_initialized;

	vector<DAT_FileBuff> dat_buffs;

	int parse_single_MZXML_file(Config *config, string& mzxml_name, int file_idx,
								mass_t min_m_over_z, mass_t max_m_over_z);
};











#endif

