#include "db_fasta.h"
#include "batch.h"

#include <climits>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <ctime>

// Shared placeholder string stored in IDs/desc/sequences slots that do not own a
// heap buffer. reset() compares slot pointers against this array to decide which
// entries must be passed to free(), so it is matched by address, not by content.
char DB_fasta::emptyStr[] = "(empty)";

// Destructor: hands all cleanup of the per-sequence buffers over to reset().
DB_fasta::~DB_fasta()
{
	this->reset();
}

// Looks up a sequence by its identifier.
//   id - NUL-terminated identifier, compared with strcmp against every stored ID.
// Returns the sequence stored at the first matching position, or a null pointer
// when no stored ID equals id. The search is a linear scan over IDs.
char *DB_fasta::operator[](char *id) {
    unsigned int pos = 0;
    while(pos < IDs.size()) {
        if(strcmp(id, IDs[pos]) == 0)
            return sequences[pos];
        ++pos;
    }
    return (char *)0;
}

void DB_fasta::replaceAA(char prevAA, char repAA) {
	for(unsigned int seqIdx=0; seqIdx<sequences.size(); seqIdx++) {
		unsigned int aaIdx=0;
		while(sequences[seqIdx][aaIdx])
		  { if(sequences[seqIdx][aaIdx]==prevAA) sequences[seqIdx][aaIdx]=repAA; aaIdx++; }
	}
}

// Returns the cached spectrum of cumulative residue masses for sequence `index`,
// building it on first use (i.e. while masses[index] still reports size 0).
// The built spectrum has one more peak than getMassesIdx() yields values:
// peak 0 is (0,0) and peak k+1 holds the running sum of the first k+1 values
// with a second component of 0. parentMass is set to the last running sum and
// parentCharge to 1. No bounds check is performed on index.
Spectrum &DB_fasta::getMassesSpec(int index) {
    if(masses[index].size()==0) {
        vector<float> aaMasses;
        getMassesIdx(index,aaMasses);

        Spectrum &spec = masses[index];
        spec.peakList.resize(aaMasses.size()+1);
        spec[0].set((float)0,(float)0);   // running sum starts at zero
        for(unsigned int k=0; k<aaMasses.size(); k++)
            spec[k+1].set(aaMasses[k]+spec[k][0],0);
        spec.parentMass = spec[spec.size()-1][0];
        spec.parentCharge = 1;
    }
    return masses[index];
}

void DB_fasta::reset() {
	for(unsigned int seqIdx=0; seqIdx<IDs.size(); seqIdx++) {
		if(IDs[seqIdx]!=emptyStr) free(IDs[seqIdx]);
        if(desc[seqIdx]!=emptyStr) free(desc[seqIdx]);
        if(sequences[seqIdx]!=emptyStr) free(sequences[seqIdx]);
	}
}

unsigned int DB_fasta::Load(char *filename) {
	BufferedLineReader blr;
    unsigned int seqIdx, lineIdx, seqSize;
    char *token;
    list<unsigned int> seqSizes;

	if(blr.Load(filename)<=0) return 0;

	seqIdx = 0; seqSize=0;
	for(lineIdx=0; lineIdx<blr.size(); lineIdx++)
		if(blr.getline(lineIdx)[0]=='>') { seqIdx++; if(seqIdx>1) seqSizes.push_back(seqSize); seqSize=0; }
		else if(blr.getline(lineIdx)[0]!=';') seqSize+=strlen(blr.getline(lineIdx));
	seqSizes.push_back(seqSize);

    // Initialize data structures
	reset();
    IDs.resize(seqIdx);   sequences.resize(seqIdx);   desc.resize(seqIdx);   masses.resize(seqIdx);
    for(seqIdx=0; seqIdx<desc.size(); seqIdx++) { desc[seqIdx]=emptyStr; IDs[seqIdx]=emptyStr; sequences[seqIdx] = emptyStr; }
	
	// Load sequences
	seqIdx=0;
	for(lineIdx=0; lineIdx<blr.size() and blr.getline(lineIdx)[0]!='>'; lineIdx++);
	for(; lineIdx<blr.size(); lineIdx++)
		if(blr.getline(lineIdx)[0]=='>') {
			sequences[seqIdx] = (char *)malloc(seqSizes.front()+1);   if(!sequences[seqIdx]) { cerr<<"ERROR: Could not allocate "<<seqSizes.front()+1<<" bytes!\n"; return 0; }
			sequences[seqIdx][0]=0;  seqSizes.pop_front();
			
			token = strtok(&blr.getline(lineIdx)[1]," ");
			if(token) { 
				IDs[seqIdx] = (char*)malloc(strlen(token)+1);   if(!IDs[seqIdx]) { cerr<<"ERROR: Could not allocate "<<strlen(token)+1<<" bytes!\n"; return 0; }
				strcpy(IDs[seqIdx],token); 
			} else IDs[seqIdx] = emptyStr;
			token = strtok(NULL,"");
			if(token) {
				desc[seqIdx] = (char*)malloc(strlen(token)+1);  if(!desc[seqIdx]) { cerr<<"ERROR: Could not allocate "<<strlen(token)+1<<" bytes!\n"; return 0; }
				strcpy(desc[seqIdx],token); 
			} else desc[seqIdx] = emptyStr;
			seqSize=0; seqIdx++;
		} else if(seqIdx>0 and blr.getline(lineIdx)[0]!=0 and blr.getline(lineIdx)[0]!=';') {
			char *val = &(sequences[seqIdx-1][seqSize]);
			strcpy(&(sequences[seqIdx-1][seqSize]),blr.getline(lineIdx));
			seqSize+=strlen(blr.getline(lineIdx));
		}

    return(seqIdx);
}

// Writes every entry to `out` as two lines: "<ID>:<description>" followed by
// the sequence.
void DB_fasta::output(ostream &out){
	unsigned int entry = 0;
	while(entry < IDs.size()) {
		out << IDs[entry] << ":" << desc[entry] << "\n";
		out << sequences[entry] << "\n";
		entry++;
	}
}

// ***************************************************
// ***  DB_index methods  
// ***************************************************

// Table converting character codes (0-255) to amino acid indices (0-17).
// Upper- and lower-case letters share an index. Two pairs of residues are
// deliberately merged into one index: I/L (7) and K/Q (8) - presumably because
// they are (near-)isobaric; confirm against compareTags. Q previously mapped to
// 9, M's index, which left the table with a Q/M merge instead of K/Q.
// Every character that is not listed maps to 0, the same index as 'A'.
static class Text2aa {
	// Stores aaIndex for an upper-case letter and its lower-case counterpart.
	void mapLetter(char upper, char aaIndex) {
		convTable[(unsigned char)upper] = aaIndex;
		convTable[(unsigned char)(upper + ('a' - 'A'))] = aaIndex;
	}
	public:
	std::vector<char> convTable;
	Text2aa() {
		convTable.assign(256, (char)0);
		mapLetter('A', (char)0);
		mapLetter('C', (char)1);
		mapLetter('D', (char)2);
		mapLetter('E', (char)3);
		mapLetter('F', (char)4);
		mapLetter('G', (char)5);
		mapLetter('H', (char)6);
		mapLetter('I', (char)7);
		mapLetter('K', (char)8);
		mapLetter('L', (char)7);    // same index as 'I'
		mapLetter('M', (char)9);
		mapLetter('N', (char)10);
		mapLetter('P', (char)11);
		mapLetter('Q', (char)8);    // same index as 'K'
		mapLetter('R', (char)12);
		mapLetter('S', (char)13);
		mapLetter('T', (char)14);
		mapLetter('V', (char)15);
		mapLetter('W', (char)16);
		mapLetter('Y', (char)17);
	}
	// idx is reduced to an unsigned byte first, so a negative value coming from
	// a signed char (bytes above 127) selects entries 128-255 instead of reading
	// in front of the table.
	char &operator[](int idx) { return convTable[(unsigned char)idx]; }
} text2aa;


// Draws fresh random coefficients for both hash functions.
//   tagLength - number of coefficients generated per hash function
// The C random generator is re-seeded from the current time on every call, and
// each coefficient is rand() scaled into the range [0, 32767].
void DB_index::hash_init(short tagLength) {
	coeffs1.resize(tagLength);
	coeffs2.resize(tagLength);
	srand(time(NULL));

	short pos = 0;
	while(pos < tagLength) {
		coeffs1[pos] = short(32767.0*double(rand())/RAND_MAX);
		coeffs2[pos] = short(32767.0*double(rand())/RAND_MAX);
		pos++;
	}
}

// Computes one of the two index coordinates of a tag.
//   hashIdx - 1 selects coeffs1/indexDim1; any other value selects coeffs2/indexDim2
//   tag     - at least tagLength characters are read from it
// Returns sum_i((coeff[i] * aaIndex(tag[i])) % dim) % dim.
// Each character is converted to unsigned char before the table lookup so that
// bytes above 127 (negative when char is signed) cannot produce a negative index.
short DB_index::hash(short hashIdx, char *tag) {
	const bool useFirst = (hashIdx==1);
	int sum=0;
	for(int cIdx=0; cIdx<tagLength; cIdx++) {
		const int aaCode = (int)text2aa[(unsigned char)tag[cIdx]];
		if(useFirst) sum+=(coeffs1[cIdx]*aaCode)%indexDim1;
		else         sum+=(coeffs2[cIdx]*aaCode)%indexDim2;
	}
	if(useFirst) return (short)(sum%indexDim1);
	return (short)(sum%indexDim2);
}


// Constructs the index by building it immediately from db; all arguments are
// forwarded unchanged to buildIndex().
DB_index::DB_index(DB_fasta &db, short newIndexDim1, short newIndexDim2, short newTagLength)
{
	this->buildIndex(db,
	                 newIndexDim1,
	                 newIndexDim2,
	                 newTagLength);
}
	
// (Re)builds the tag index over every sequence in db.
//   db            - database whose sequences are indexed
//   newIndexDim1  - number of rows of the hash table
//   newIndexDim2  - number of columns of the hash table
//   newTagLength  - length of the indexed substrings (tags)
// Every substring of length newTagLength is hashed to a cell (c1,c2); each cell
// keeps one Tag per distinct substring together with the list of
// (protein index, start position) pairs where it occurs.
//
// Start positions are stored as short, so only positions up to SHRT_MAX are
// indexed. Positions are iterated with a wider type: the previous short
// arithmetic overflowed for proteins longer than SHRT_MAX residues.
// Cells that receive no tag are emptied as well, so rebuilding an existing
// index cannot leave tags from the previous build behind.
void DB_index::buildIndex(DB_fasta &db, short newIndexDim1, short newIndexDim2, short newTagLength) {
	vector<vector<list<Tag> > > tmpIndex;  // Temporary index (with lists instead of vectors)
	short c1,c2;                  // Index coordinates 1, 2
	pair<int,short> tmpEntry;     // Temporary pair to hold (proteinID,tagPos) values for insertion
	char *protSeq;                // Pointer to current protein sequence
	list<Tag>::iterator tagIter;  // Iterator of tags in a collision list
	Tag curTag;                   // Holder for current tag being processed

	indexDim1=newIndexDim1; indexDim2=newIndexDim2; tagLength=newTagLength;
	hash_init(newTagLength);  // Initialize hash function coefficients for current tag length

	// Build new temporary index (with lists instead of vectors)
	tmpIndex.resize(indexDim1);
	for(c1=0; c1<indexDim1; c1++) tmpIndex[c1].resize(indexDim2);
	curTag.text = (char*)malloc(sizeof(char)*(tagLength+1)); curTag.text[tagLength]=char(0);
	for(int protIdx=0; protIdx<db.size(); protIdx++) {
		protSeq = db[protIdx];
		tmpEntry.first = protIdx;

		// Last start position at which a full tag still fits; negative when the
		// sequence is shorter than a tag. Clamped to what a short can store.
		long lastTagPos = (long)strlen(protSeq)-(long)tagLength;
		if(lastTagPos>(long)SHRT_MAX) lastTagPos=(long)SHRT_MAX;

		for(long tagPos=0; tagPos<=lastTagPos; tagPos++) {
			strncpy(curTag.text,&protSeq[tagPos],tagLength);
			c1 = hash(1,curTag.text); c2 = hash(2,curTag.text);
			tmpEntry.second = (short)tagPos;

			// check if current tag is already in the collision list
			bool tagProcessed = false;
			for(tagIter=tmpIndex[c1][c2].begin(); tagIter!=tmpIndex[c1][c2].end(); tagIter++)
				if(curTag==*tagIter) { tagIter->insts.push_back(tmpEntry); tagProcessed=true;
					break;
				}
			if(!tagProcessed)
			    { curTag.insts.clear(); curTag.insts.push_back(tmpEntry); tmpIndex[c1][c2].push_back(curTag);
			    }
		}
	}

	// Build definitive index that uses vectors instead of lists
	index.resize(indexDim1);
	for(c1=0; c1<indexDim1; c1++) {
		index[c1].resize(indexDim2);
		for(c2=0; c2<indexDim2; c2++) {
			list<Tag> &bucket = tmpIndex[c1][c2];
			index[c1][c2].resize(bucket.size());   // also empties cells without tags
			unsigned int slot=0;
			for(list<Tag>::iterator iter=bucket.begin(); iter!=bucket.end(); iter++)
				index[c1][c2][slot++] = *iter;
			bucket.clear();   // release the list copy as soon as it has been transferred
		}
	}
}
	
// Unimplemented stub: the body is empty, so a call has no effect and none of
// the parameters (tag, proteinID, tagPos) is used.
void DB_index::insert(char *tag, int proteinID, short tagPos) {
	
}

bool DB_index::find(char *tag, list<pair<int,short> > **location) {
	short c1,c2;                  // Index coordinates 1, 2 
	vector<Tag>::iterator tagIter;  // Iterator of tags in a collision list

	c1 = hash(1,tag); c2 = hash(2,tag);
	for(tagIter=index[c1][c2].begin(); tagIter!=index[c1][c2].end(); tagIter++)
		if(compareTags(tag,tagIter->text)) { (*location)=&(tagIter->insts); return true; }
	(*location)=(list<pair<int,short> > *) 0; return false;
}

// Finds peptides that contain `tag` and whose flanking masses fit.
//   tag              - tag text to look up in the index
//   db               - database the index was built from
//   peptides         - output; matches are appended (existing content is kept)
//   minMatchFlanking - minimum number of flanks (prefix/suffix) that must match
//   flankPref/tolPref - mass expected in front of the tag and its tolerance
//   flankSuff/tolSuff - mass expected after the tag and its tolerance
// Returns true if peptides is non-empty on exit (including entries that were
// already present on entry), false otherwise.
//
// For every indexed occurrence of the tag, residues are added outwards from the
// tag until the accumulated mass reaches the flank mass (within tolerance +
// 0.0001), overshoots it, or the protein end is reached. A flank that does not
// match is shortened by one residue relative to where the walk stopped.
// NOTE(review): that shortening is also applied when the walk stopped at the
// protein terminus without overshooting - confirm this is intended.
bool DB_index::find(char *tag, DB_fasta &db, list<pair<int,string> > &peptides, short minMatchFlanking,
	                   float flankPref, float tolPref, float flankSuff, float tolSuff) {
	list<pair<int,short> > *location;   // peptides.clear(); // Allow addition of peptides to an existing set
	if(flankPref<-tolPref or flankSuff<-tolSuff) return false;
	if(not find(tag, &location)) return false;

	// Match flanking masses
	string curPeptide; curPeptide.reserve(128); pair<int,string> curPair;
	vector<float> protMasses;   short curMatchingFlanks;
	int aaIdx,pepStart,pepEnd;   float cumMass;
	for(list<pair<int,short> >::iterator iter=location->begin(); iter!=location->end(); iter++) {
		db.getMassesIdx(iter->first,protMasses);

		// Match prefix flanking mass
		aaIdx=iter->second;   cumMass=0;   curMatchingFlanks=0;
		while(fabs(cumMass-flankPref)>tolPref+0.0001 and cumMass<flankPref)
			if(aaIdx==0) break; else cumMass+=protMasses[--aaIdx];
		if(fabs(cumMass-flankPref)<=tolPref+0.0001) { curMatchingFlanks++; pepStart = aaIdx; }
		else { if(minMatchFlanking==2) continue; else pepStart = aaIdx+1; } // Default to modifications with positive mass offset

		// Match suffix flanking mass
		aaIdx=min(iter->second+tagLength,(int)protMasses.size());   cumMass=0;
		while(fabs(cumMass-flankSuff)>tolSuff+0.0001 and cumMass<flankSuff)
			if(aaIdx==(int)protMasses.size()) break; else cumMass+=protMasses[aaIdx++];
		if(fabs(cumMass-flankSuff)<=tolSuff+0.0001) { curMatchingFlanks++; pepEnd = aaIdx; }  // One aa past the end of the found peptide
		else pepEnd = aaIdx-1;
		if(curMatchingFlanks<minMatchFlanking) continue;

		// With both flanks shortened and a very short tag the range can become
		// inverted; a negative length must not reach string::resize().
		if(pepEnd<pepStart) continue;

		// Copy residues [pepStart, pepEnd) of the protein into the result.
		char *protSeq = db[iter->first];
		curPeptide.resize(pepEnd-pepStart);
		for(aaIdx=0; aaIdx<(int)curPeptide.size(); aaIdx++) curPeptide[aaIdx]=protSeq[pepStart+aaIdx];
		curPair.first=iter->first; curPair.second=curPeptide; peptides.push_back(curPair);
	}
	return peptides.size()>0;
}

// Greedy parsimony assignment of peptides to proteins.
//   proteinHits - in: for each peptide, the list of protein indices it matches
//                 out: for each peptide, a single-element list with the protein
//                 it was assigned to, or an empty list if it had no hits
// In every round the hits of all still-unassigned peptides are counted per
// protein; the protein whose count first exceeds all counts seen so far in scan
// order (peptide by peptide, hit by hit) is selected and all peptides hitting
// it are assigned to it. Rounds repeat until no unassigned peptide has hits.
void MaximumParsimony(std::vector<std::list<int> > &proteinHits) {
  typedef std::list<int>::iterator HitIter;

  std::vector<int> chosenProtein(proteinHits.size());

  // Determine the largest protein index that occurs in any hit list
  unsigned int highestProt = 0;
  for(unsigned int pep=0; pep<proteinHits.size(); pep++) {
    chosenProtein[pep] = -1;
    for(HitIter hit=proteinHits[pep].begin(); hit!=proteinHits[pep].end(); hit++)
      if(*hit > (int)highestProt) highestProt = *hit;
  }
  std::vector<std::list<int> > peptidesOf(highestProt+1);

  // Assign peptides round by round
  while(true) {
    for(unsigned int prot=0; prot<peptidesOf.size(); prot++) peptidesOf[prot].clear();

    // Recount the hits of the peptides that are still unassigned
    unsigned int bestCount = 0, bestProt = 0;
    for(unsigned int pep=0; pep<proteinHits.size(); pep++) {
      for(HitIter hit=proteinHits[pep].begin(); hit!=proteinHits[pep].end(); hit++) {
        std::list<int> &members = peptidesOf[*hit];
        members.push_back(pep);
        if(members.size() > bestCount) {
          bestCount = members.size();
          bestProt = *hit;
        }
      }
    }
    if(bestCount == 0) break;

    // Give every peptide of the winning protein to it and drop its other hits
    std::list<int> &winners = peptidesOf[bestProt];
    for(HitIter member=winners.begin(); member!=winners.end(); member++) {
      proteinHits[*member].clear();
      chosenProtein[*member] = bestProt;
    }
  }

  // Copy the assignments back to proteinHits
  for(unsigned int pep=0; pep<proteinHits.size(); pep++) {
    if(chosenProtein[pep] < 0) continue;
    proteinHits[pep].push_back(chosenProtein[pep]);
  }
}
