

#ifndef __FASTA_DB_H__
#define __FASTA_DB_H__

#include "../../util.h"
#include "Config.h"
#include "includes.h"
#include "BasicDataStructs_add.h"
#include "AnnotatedSpectrum_add.h"
#include "AllScoreModels.h"
#include "../../simplesuffixtree/src/SuffixTree.h"
#include "stdio.h"
#include <list>

using namespace std;


// Binary predicates over PeptideLocation / PeptideAdd. Only the prototypes
// appear here; the bodies are not part of this header section.
// NOTE(review): the names suggest an ordering by amino-acid sequence, with the
// "Eq" variant also accepting equality -- confirm against the definitions.
// NOTE(review): PeptideLocation is defined further down in this header, so
// these prototypes rely on an earlier declaration coming from one of the
// included headers -- confirm.
bool ComparePepLocBySeqSimple(const PeptideLocation& pl_1, const PeptideLocation& pl_2);
bool ComparePeptidePepLocBySeqSimple(const PeptideAdd& pep, const PeptideLocation& pl);
bool ComparePeptidePepLocBySeqEqSimple(const PeptideAdd& pep, const PeptideLocation& pl);

// Forward declaration: several classes below store a FastaDB* before FastaDB
// itself is defined near the end of this header.
class FastaDB;

// One modification record: a position, a mass offset and a direction flag.
// Plain data holder; the fields are not initialised by the class itself.
// NOTE(review): PeptideCounterList::add_fixed_mod(pos, mass_offset, is_forward)
// takes matching arguments, so `direction` presumably means "forward" when
// true -- confirm against its definition.
class ModifType
{
public:
	int   pos;
	float mass_offset;
	bool  direction;
};


// Sequence of modifications; PeptideCounter::apply_mod_list walks it from the
// first element to the last.
typedef vector<ModifType> ModifList;

// Plain record tying a scan / spectrum / method index triple to a parent mass
// and a charge. The fields are not initialised by the class itself.
class SpectrumMethod
{
public:
	int   scan_index;
	int   spectrum_index;
	int   method_index;
	float parent_mass;
	float charge;    // stored as float, not int
};

// Location of a peptide: an offset into the whole sequence database, the
// protein it falls in, the offset inside that protein, and its length.
// The fields are not initialised by the class itself.
class PeptideLocation
{
public:
	int      aa_loc;          // in the whole sequence database
	int      protein_idx;
	int      loc_in_protein;
	int      length;
	FastaDB *fdb;             // non-owning as far as this header shows

	void print();             // body is not part of this header section
};


// Shorthand for a vector of AnnotatedSpectrum objects.
typedef vector<AnnotatedSpectrum> AnnotatedSpectrumList;



class PeptideCounter: public PeptideAdd {
	public:
		void set_scan_list_index(int sli) { scan_index = sli;}
		void set_mass_error(float me) { mass_error = me;}
		void set_counter(int num) { counter = num;}
		int get_counter() const {return counter;}
		void set_protein_index(int pidx) {protein_idx = pidx;}
		void set_position(int p) {position = p;}
		void set_counter_length(int len) {length = len;}
		int get_counter_length() const {return length;}
		void set_loaded() {is_loaded = true;}
		void set_not_loaded() {is_loaded = false;}
		bool get_loaded() {return is_loaded;}
		void apply_mod(ModifType mod);
		void apply_mod_list(ModifList mod_list) {
			for(int ind = 0; ind < mod_list.size(); ind++)
				apply_mod(mod_list[ind]);
		}
    float pval;
		int counter;
		int num_cys;
		float pepnovo_score;
		float inspect_score;
		int protein_idx;
    int seq_index;
		int spectrum_ind;
		int scan_index;
		int position;
		int length;
    int position_1, position_2;
    int length_1, length_2;
    int protein_index_1, protein_index_2;
		int score;
		int scoring_method;
		int scoring_method_index;
		float norm_score;
		float mass_error;
		bool is_loaded;
		FastaDB *fdb;
		void set_fasta_db(FastaDB *input_fdb) {fdb = input_fdb;}
		void load_peptide(int comp_init = 1);
		void load_peptide(Peptide& pep, int comp_init = 1) const;
		void set_scoring(int scr_met) {scoring_method = scr_met;}
		void set_scoring_index(int scr_met_ind) {scoring_method_index = scr_met_ind;}
		void set_spectrum_index(int spec_ind) {spectrum_ind = spec_ind;}
		void load_score(AnnotatedSpectrumAdd& spectrum, vector<float> acc_thresh, Offset offset, int score_type);
		int get_scoring() {return scoring_method;}
};

// Binary predicates over two PeptideCounter objects. Only the prototypes
// appear here; the bodies are not part of this header section.
// NOTE(review): the names suggest comparison keys of mass, amino-acid
// sequence, `counter` and `pepnovo_score` respectively; the sort direction is
// not visible here -- confirm against the definitions.
bool ComparePeptideCounterByMass(const PeptideCounter& pep_1, const PeptideCounter& pep_2);
bool ComparePeptideCounterByAmino(const PeptideCounter& pep_1, const PeptideCounter& pep_2);
bool ComparePeptideCounterByCounter(const PeptideCounter& pep_1, const PeptideCounter& pep_2);
bool ComparePeptideCounterByPepNovoScore(const PeptideCounter& pep_1, const PeptideCounter& pep_2);

// Holds six three-dimensional tables (flag, num_pep, sum_score,
// sum_square_score, mean, std), each dimensioned
// max_length x num_scoring_methods x num_spectra by initialize().
//
// Usage order visible from this header: set the sizes with the set_*()
// methods, then call initialize(), which allocates and clears the tables.
//
// NOTE(review): the tables are allocated with new[] and this class declares no
// destructor, copy constructor or assignment operator. Nothing here frees
// them, calling initialize() twice drops the previous tables, and copies share
// the same storage. A destructor was deliberately not added: the object is
// held by value in PeptideCounterList, so freeing here without also defining
// copy semantics could double-free.
class ScoreStatistics {
	public:
		int min_req;
		int num_scoring_methods;
		int num_spectra;
		int min_length;
		int max_length;
		// First index runs over [0, max_length), second over
		// [0, num_scoring_methods), third over [0, num_spectra).
		bool ***flag;
		int ***num_pep;
		int ***sum_score;
		int ***sum_square_score;
		float ***mean;
		float ***std;
		void set_num_scoring_methods(int num_scr_met) {num_scoring_methods = num_scr_met;}
		void set_num_spectra(int num_spec) {num_spectra = num_spec;}
		void set_max_length(int max_len) {max_length = max_len;}
		void set_min_length(int min_len) {min_length = min_len;}

		// Bodies of the following declarations are not part of this header section.
		void insert_psm(PeptideCounter& pep);
		void finalize();
		void print(int spectrum_index);
		void normalize_score(PeptideCounter& pep);

		// min_req defaults to 10. All sizes start at 0 and all table pointers
		// at NULL, so the object is in a defined (empty) state until the
		// setters and initialize() have been called.
		ScoreStatistics() :
			min_req(10),
			num_scoring_methods(0), num_spectra(0),
			min_length(0), max_length(0),
			flag(NULL), num_pep(NULL), sum_score(NULL), sum_square_score(NULL),
			mean(NULL), std(NULL) {}

		// Allocates all six tables using the current max_length,
		// num_scoring_methods and num_spectra, and clears every cell
		// (false / 0 / 0.0). The sizes must be set before this is called.
		void initialize(){
			flag             = new_table<bool>(false);
			num_pep          = new_table<int>(0);
			sum_score        = new_table<int>(0);
			sum_square_score = new_table<int>(0);
			mean             = new_table<float>(0.0f);
			std              = new_table<float>(0.0f);
		}

	private:
		// Allocates one max_length x num_scoring_methods x num_spectra table
		// and sets every cell to `fill`.
		template <class T>
		T ***new_table(T fill) const {
			T ***table = new T **[max_length];
			for(int len = 0; len < max_length; len++) {
				table[len] = new T *[num_scoring_methods];
				for(int scr_met_ind = 0; scr_met_ind < num_scoring_methods; scr_met_ind++) {
					table[len][scr_met_ind] = new T[num_spectra];
					for(int scan_ind = 0; scan_ind < num_spectra; scan_ind++)
						table[len][scr_met_ind][scan_ind] = fill;
				}
			}
			return table;
		}
};


// A list of PeptideCounter hits together with spectrum / scan indices and
// summary score fields. The extra fields are not initialised by this class;
// PeptideCounterList::set_num_spectra fills them in for its own entries.
class PeptideSpectrumHits : public list<PeptideCounter>
{
public:
	int   spectrum_ind;
	int   scan_ind;
	float min_norm_score;
	float max_norm_score;
	float max_pepnovo_score;

	// Stores si in spectrum_ind.
	void set_spectrum_ind(int si) { this->spectrum_ind = si; }

	// Body is not part of this header section.
	void score(AnnotatedSpectrumCounter& spectrum, vector<float> acc_thresh, Offset offset, int scoring_method, int score_type);
};


// Binary predicates over two PeptideSpectrumHits objects. Only the prototypes
// appear here; the bodies are not part of this header section.
// NOTE(review): the names suggest the keys are max_norm_score and
// max_pepnovo_score; the sort direction is not visible here -- confirm.
bool ComparePeptideSpectrumHitsByMaxNormScore(const PeptideSpectrumHits& psh_1, const PeptideSpectrumHits& psh_2);
bool ComparePeptideSpectrumHitsByMaxPepNovoScore(const PeptideSpectrumHits& psh_1, const PeptideSpectrumHits& psh_2);

class PeptideCounterList {
public:
	PeptideCounterList() : num_top_hits(10), min_mass_filter_val(0.0), max_mass_filter_val(10000.0), min_length_filter_val(0), max_length_filter_val(100), print_flag(false) {
		mod_list.clear();
	};
	ModifList mod_list;
	ScoreStatistics statistics;
	vector<PeptideSpectrumHits> top_hits;
	vector<int> scan_list;
  	AllScoreModels model;
	void set_model(AllScoreModels& mdl) {model = mdl;}
  	// ofstream log_file;
	int num_top_hits;
	FastaDB *fdb;
	bool print_flag;
	float min_mass_filter_val;
	float max_mass_filter_val;
	int min_length_filter_val;
	int max_length_filter_val;
	int num_spectra;
	// int SearchPeptide(Peptide& pep);
	// int SearchPeptideTmp(int first, int last, Peptide& pep);
	vector<int> scoring_methods;
	vector<float> offsets;
  void score(vector<AnnotatedSpectrumCounter>& spectrum_vector, vector<float>acc_thresh, Offset offset, int scoring_method, int score_type);
	void set_fasta_db(FastaDB *input_fdb) {fdb = input_fdb;}
	void apply_max_mass_filter(int l) {max_mass_filter_val = l;}
	void apply_min_mass_filter(float m) {min_mass_filter_val = m;}
	void apply_max_length_filter(int l) {max_length_filter_val = l;}
	void apply_min_length_filter(int l) {min_length_filter_val = l;}
	void set_scan_list(vector<int> sl) { scan_list = sl;}
	void insert_psm_norm_score(PeptideCounter& pep);
	void sort_by_pepnovo();
	void set_minimum_required_matchs(int min_req) {statistics.min_req = min_req;}
	void score_pepnovo(vector<AnnotatedSpectrumAdd>& spectrum_vector, string log_file);
	void sort_pepnovo();
	void add_fixed_mod(int pos, float mass_offset, bool is_forward);
  	void set_num_top_hits(int nth) {num_top_hits = nth;}
	void set_num_spectra(int num_spec) {
		num_spectra = num_spec;
		top_hits.resize(num_spectra);
		for(int ind = 0; ind<num_spectra; ind++) {
			top_hits[ind].clear();
			top_hits[ind].min_norm_score = 0;
			top_hits[ind].max_norm_score = 0;
			top_hits[ind].max_pepnovo_score = 0;
			top_hits[ind].scan_ind = ind;
			top_hits[ind].spectrum_ind = scan_list[ind];
		}
	}
	void do_print() {print_flag = true;}
};


class SearchResults {
public:
    SearchResults() : max_num_spectra(-1), max_num_top_hits(-1), native_information_flag(false){  
      enforce_cys_flag = false;
      enforce_mass_flag = false;
      pval_flag = false;
    };
    Config* config;
    bool pval_flag;
    bool enforce_cys_flag;
    bool enforce_mass_flag;
    float syn_thresh;
    int syn_num_peptide;
    SpecPairListList spec_pair_list_list;
    vector<string> protein_names;
    int num_cys;
    bool native_information_flag;
    int num_spectra;
    int max_num_spectra;
    int max_num_top_hits;
    vector<int> scan_list;
    vector<int> num_hits;
    vector<int> spectrum_index;
    vector<int> spectrum_charge;
    vector<int> spectrum_peak_num;
    vector<float> spectrum_mass;
    vector< vector<float> > peptide_mass;
    vector< vector<string> > all_peptide_str;
    vector< vector<int> > all_protein_indices;
    vector< vector<int> > all_protein_positions;
    vector< vector<int> > all_protein_lengths;
    vector< vector <int> > score;
    vector< vector <float> > pval_vector;
    vector< vector <float> > norm_score;
    vector< vector <float> > mass_error;
    vector< vector <int> > all_structs;
    vector<int> spectrum_index_native;
    vector<float> spectrum_mass_native;
    vector<float> spectrum_charge_native;
    vector<int> spectrum_peak_num_native;
    vector<int> total_shared_peaks;
    vector<int> zero_shared_peaks;
    vector<int> delta_shared_peaks;
    
    void enforce_cys() {enforce_cys_flag = true;}
    void enforce_mass(float m) {syn_thresh = m; enforce_mass_flag = true;}
    void set_config(Config* conf) {config = conf;}
    void set_num_peptide(int n) {syn_num_peptide = n;}
    void calculate_pval(vector<AnnotatedSpectrumCounter>& spectrum_vector, PeptideCounterList& pep_list, vector<float> acc_thresh, Offset offset, int score_type);
    void set_protein_names(vector<string> pt) {protein_names = pt;}
    void print_html(string out_file_string);
    void set_scan_list(vector<int> sl) {scan_list = sl;}
    void parse_from_pep_list(PeptideCounterList& pep_list, AnnotatedSpectrumCounterList& spectrum_vector);
    void print_data(string res_file_string);
    void print(string out_file_string);
    void print_xls(string out_file_string);
    void set_max_num_spectra(int mns) {max_num_spectra = mns;}
    void set_max_num_top_hits(int mnth) {max_num_top_hits = mnth;}
    void parse_from_file(string in_file_string);
    void add_native_information(SpecPairListList& spll, int nc) {native_information_flag = true; spec_pair_list_list = spll; num_cys = nc;}
};

// Handle to a run of tag locations: where the run starts and how many
// entries it holds. FastaDB::get_tag_locations uses list_start_idx as an index
// into a per-tag-length location vector and returns num_locations.
struct TagListPointer
{
	int num_locations;
	int list_start_idx;

	// An empty handle: no locations, start index -1.
	TagListPointer()
		: num_locations(0),
		  list_start_idx(-1)
	{
	}
};

// Ordered map from an int key to a TagListPointer. FastaDB::get_tag_locations
// looks entries up by the value returned from FastaDB::calc_tag_index.
typedef map< int , TagListPointer, less<int> > INT2TLP_MAP;

// One result of a peptide search: protein index, position index, sequence
// index, a direction flag and a length. Plain data holder; the fields are not
// initialised by the class itself.
class PeptideSearchRes
{
public:
	int  protein_index;
	int  pos_index;
	int  seq_index;
	bool direction;
	int  length;
};

// A tag: a peptide, its length, and the database search results for it.
class Tag
{
public:
	PeptideAdd               pep;
	int                      length;    // not initialised by this class
	vector<PeptideSearchRes> psr_vec;

	void print();                       // body is not part of this header section
};

// A vector of Tag objects with an added print() member.
class TagVector : public vector<Tag>
{
public:
	void print();   // body is not part of this header section
};

// A vector of TagVector objects together with the largest and smallest tag
// length, plus save / load / print members.
// NOTE(review): the outer index is presumably the tag length -- confirm in
// the code that fills it.
class TagVectorVector : public vector<TagVector>
{
public:
	int max_tag_len;   // not initialised by this class
	int min_tag_len;   // not initialised by this class

	// Bodies of these members are not part of this header section.
	void save(string out_file);
	void load(Config* config, string in_file);
	void print();
};

class FastaDB {
public:

	FastaDB();
	~FastaDB();

  bool load_pep_loc(PeptideLocation pl, PeptideAdd& pep) const;
	// creates all relevant data structures from the fasta file
	// includes the sequences (stored as aa - ints), protein names,
	// and tag hashes
  void set_min_tag_length(int mtl) {min_tag_len = mtl;}
  void set_max_tag_length(int mtl) {max_tag_len = mtl;}
  void update_tag_vectors();
  void update_tag_vectors(int len);
  bool match_pep(int pi, int pos, Peptide& pep, bool direction);
  bool find_mass_match(PeptideSearchRes& psr_1, PeptideSearchRes psr_2, float in_mass, float thresh, PeptideSpectrumHits& psh, int spectrum_ind = 0);
  int search_db(Peptide& peptide, vector<PeptideSearchRes>& search_res);

	void create_db_from_fasta(char *file, Config *con,
	bool create_tags = true, int min_length=3, int max_length=6);

	int get_num_cands_with_mass(float mass, float tolerance) const;

	int get_total_seq_length() const { return all_aa_seqs.size(); }

	int get_max_tag_length() const { return max_tag_length; }
	int get_min_tag_length() const { return min_tag_length; }
	// void write_peptide(Peptide& pep, int protein_ix);
	int write_peptide(PeptideAdd &pep, int protein_idx, int position, int length, int complete_initialization = 0);
	/*int  test_peptide(const Config* config, int protein_idx, int position, int length, int complete_initialization = 0);*/
	int write_peptide(PeptideAdd &pep, PeptideCounter pep_counter, int comp_init = 1) {
		// cout << "comp_init lev 2 :" << comp_init << endl;
		return write_peptide(pep, pep_counter.protein_idx, pep_counter.position, pep_counter.length, comp_init);
	}

	void load_peptide(PeptideCounter& peptide, int comp_init = 1);
  void FilterDB(PeptideCounterList& pep_list, AnnotatedSpectrumCounterList& spectrum_vector, vector<float>& threshold, vector<float> offsets, vector<int> scoring_methods, vector<float> acc_thresh, Offset offset, int score_type);
  void set_log_file_string(string lfs) {log_file_string = lfs;}
  void set_temp_file_string(string tfs) {temp_file_string = tfs;}

        // void ScoreSingleSpec(PeptideCounterList& pep_list, AnnotatedSpectrumList& spectrum_vector, int spec_ind, vector<float>& threshold, vector<float> offsets, vector<int> scoring_methods, vector<float> acc_thresh, Offset offset, int score_type, string log_file_string, string temp_file_string);

 	int SearchPeptide(PeptideCounterList& pep_list, Peptide& pep);
	void make_database_fixed_sides(vector<int> aa_list, int num, int aa_first, int aa_last);
	void write_from_name_seq(string file_name);
	
        void create_test_fasta_file(char *temp_fasta_file, Config *conf, int len, char *pep_str, bool enforce_cys = false);

	void print_stats() const;
	void print_protein_names() const;
	

	int calc_tag_index(const vector<int>& tag_aas) const 
	{	int i,idx=0;
		const vector<int>& org_aa = config->get_org_aa();
		for (i=0; i<tag_aas.size()-1; i++)
		{
			idx += aa_codes[org_aa[tag_aas[i]]];
			idx *= mult_val;
		}
		idx+=aa_codes[org_aa[tag_aas[i]]];
		return idx;
	}
	

	// gets number and list of tag locations
	int get_tag_locations(const vector<int>& tag, int **list) const
	{
		const int tag_length = tag.size();
		if (tag_length<min_tag_length || tag_length>max_tag_length)
		{
			cout << "Tag length " << tag_length << " not supported!" << endl;
			exit(0);
		}
		const int idx = calc_tag_index(tag);

		INT2TLP_MAP::const_iterator iter = tag_maps[tag_length].find(idx);

		if (iter == tag_maps[tag_length].end())
			return 0;

		*list = (int *)(&tag_locations[tag_length][(*iter).second.list_start_idx]);
		return (*iter).second.num_locations;
	}

	// returns the protein idx for an amino acid location, -1 if invalid locaiton is given
	int get_protein_number(int loc, int *idx_in_protein = NULL) const
	{
		if (loc<0 || loc>all_aa_seqs.size())
			return -1;

		INT_MAP::const_iterator it;
		it = aa_seq_starts.lower_bound(loc);
		if (it != aa_seq_starts.end())
		{
			if (idx_in_protein)
			{
				if ((*it).first == loc)
				{
					*idx_in_protein = 0;
					return (*it).second;
				}
				else
				{
					it--;
					*idx_in_protein = loc - (*it).first;
					return (*it).second;
				}
			}
			else
			{
				return  (*it).first == loc ? (*it).second : (*it).second -1;
			}
			
		}

		return -1;
	}

	// gets pointer to sequnce location
	const int * get_aa_seq_pointer(int loc=0) const
	{
		return &all_aa_seqs[loc];
	}

	vector<int> get_protein_start_location() {
		return protein_start_location;
	}
	
	vector<int> get_protein_length() {
		return protein_length;
	}

	void print_aas_at_loc(int loc_idx, int num_aas = 10) const;

	void set_config(Config* conf) {config = conf;}
	Config *get_config() { return config; }

	void read_FastaDB(const char *file_name, Config *con);
	void write_FastaDB(const char *file_name) const;
	void set_print_pep_flag(bool ppf) {print_pep_flag = ppf;}
	void set_start_scan(int sf) {scan_first = sf; set_start_flag = true;}
	void set_stop_scan(int sl) {scan_last = sl; set_stop_flag = true;}
	void set_scan_set(vector<int> ss) {scan_set = ss; set_scan_flag = true;}
	void enforce_num_cys(int nc) {
		num_cys = nc;
		enforce_cys_flag = true;
	}
  vector<string> get_protein_names() {return protein_names;}
  TagVectorVector tag_vec_vec;
  void update_db_string();
  // void update_suffix_tree() {st.addText(db_string);}
	// SuffixTree st("");
  string db_string;
private:
  int min_tag_len;
  int max_tag_len;
	int num_cys;
	bool enforce_cys_flag;
	Config *config;
	char *fasta_file;
	int min_tag_length, max_tag_length;
 	bool print_pep_flag;
	int mult_val;  // the number in which we multiply the tag indices
	int scan_first;
	int scan_last;
	vector<int> scan_set;
	bool set_start_flag;
	bool set_stop_flag;
	bool set_scan_flag;
  string temp_file_string;
  string log_file_string;

	vector<int> aa_codes;        // for each amino acid holds an integer code
	INT_MAP aa_seq_starts;      // the starting idx of each sequence
	vector< int> protein_name_starts; // the start position of each protein name

	vector<int> all_aa_seqs; // holds the concatenated aa sequences for all proteins (seq_starts points to
						 // positions in this array)

	vector<char> all_protein_names; // holds concatenated char sequences of all protein names
	                                // (protein_name_starts points to positions in this array).

	vector<INT2TLP_MAP> tag_maps; // holds a map to each tag locations

	vector<vector<int> > tag_locations; // holds for each tag length a list of locations
										// which are organized according to the order of tags
										// (the entries of tag_hashes point to places in this list).
	vector <int> protein_start_location;
	vector <int> protein_number;
	vector <int> protein_length;
	vector <string> protein_names;
	vector <string> protein_seqs;
};



#endif


