/*
 * Written by Bastien Chevreux (BaCh)
 *
 * Copyright (C) 1997-2000 by the German Cancer Research Center (Deutsches
 *   Krebsforschungszentrum, DKFZ Heidelberg) and Bastien Chevreux 
 *   and Thomas Pfisterer
 * Copyright (C) 2000 and later by Bastien Chevreux
 *
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the 
 * Free Software Foundation, Inc., 
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 * 
 */

// Version-control identification string ("$Id$" keyword), compiled in
// unless 'lint' is defined. Nothing in the visible part of this file reads it.
#ifndef lint
static char vcid[] = "$Id$";
#endif /* lint */


#include "readpool.H"

#include <iostream>
#include <fstream>
#include <ctype.h>

#include "errorhandling/errorhandling.H"
#include "util/progressindic.H"

#include "io/generalio.H"
#include "io/fasta.H"
#include "io/fastq-lh.H"
#include "io/gbf.H"
#include "io/phd.H"
#include "io/ncbiinfoxml.H"

#include "gff_parse.H"

#include "boost/unordered_map.hpp" 


// Instantiates the kseq reader (kseq_t, kseq_init(), kseq_read(),
// kseq_destroy(), all used by loadDataFromFASTQ() below) for zlib gzFile
// handles, with gzread() as the read function.
// NOTE(review): the macro is presumably provided via "io/fastq-lh.H" -- confirm.
KSEQ_INIT(gzFile, gzread)


/*************************************************************************
 *
 * The complete body of this function is pulled in textually from
 *  "stdinc/foolcompiler.C".
 * NOTE(review): judging by the name, this presumably only exists to keep
 *  the compiler quiet about otherwise unused file-level symbols (e.g.
 *  vcid) -- confirm against the included file.
 *
 *************************************************************************/
void ReadPool::foolCompiler()
{
#include "stdinc/foolcompiler.C"
}




/*************************************************************************
 *
 * Constructor: starts with an empty read pool, remembers the parameter
 *  vector handed in (only the pointer is stored, it may be NULL) and
 *  registers the strain "default" as first entry (Id=0) of the strain
 *  name table.
 *
 *************************************************************************/
ReadPool::ReadPool(vector<MIRAParameters> * params)
{
  FUNCSTART("ReadPool::ReadPool()");

  // keep the pointer only, the parameters themselves are not copied
  this->REP_miraparams=params;

  // no reads yet
  this->REP_thepool.clear();

  // the default strain always sits at index 0
  this->REP_strainnames.push_back("default");

  this->REP_valid=REP_VALID;

  FUNCEND();
}




/*************************************************************************
 *
 * Hands the read container and the strain name table to
 *  nukeSTLContainer() (defined elsewhere).
 * REP_miraparams is not touched.
 *
 *************************************************************************/
void ReadPool::discard()
{
  FUNCSTART("ReadPool::discard()");

  // the reads go first, the strain names afterwards
  nukeSTLContainer(REP_thepool);
  nukeSTLContainer(REP_strainnames);

  FUNCEND();
}



/*************************************************************************
 *
 * Destructor: all cleanup work is delegated to discard().
 *
 *************************************************************************/
ReadPool::~ReadPool()
{
  FUNCSTART("ReadPool::~ReadPool()");

  // nothing else to do here besides emptying the containers
  discard();

  FUNCEND();
}


//void ReadPool::setMIRAParameters(MIRAParameters * params)
//{
//  FUNCSTART("void ReadPool::setMIRAParameters(MIRAParameters * params);");
//
//  REP_miraparams=params;
//
//  FUNCEND();
//}



/*************************************************************************
 *
 * Returns sizeof(ReadPool) plus whatever estimateMemoryUsageOfContainer()
 *  reports for the read container and for the strain name table.
 *
 *************************************************************************/

size_t ReadPool::estimateMemoryUsage() const
{
  FUNCSTART("size_t ReadPool::estimateMemoryUsage()");

  // the two container estimates, taken in the same order as before
  const size_t poolbytes=estimateMemoryUsageOfContainer(REP_thepool,false);
  const size_t strainbytes=estimateMemoryUsageOfContainer(REP_strainnames,false);

  FUNCEND();
  return sizeof(ReadPool)+poolbytes+strainbytes;
}


/*************************************************************************
 *
 * makes template IDs for the reads in the pool
 *
 * returns 
 *  true if there are usable templates (fewer templates than valid reads)
 *  false otherwise
 *
 *************************************************************************/

//struct eqstr
//{
//  bool operator()(const char* s1, const char* s2) const
//  {
//    return strcmp(s1, s2) == 0;
//  }
//}; 
//
//typedef hash_multimap<const char*, int, hash<const char *>, eqstr> stringhash_t; 
//typedef stringhash_t::value_type stringhash_entry_t;

bool ReadPool::makeTemplateIDs(bool verbose)
{
  FUNCSTART("void ReadPool::makeTemplateIDs()");

// TODO: bla auf backbone und rails eingehen

  if(verbose){
    rpDateStamp();
    cout << endl;
  }

  vector<int32> tidcounter;
  vector<int32> tid_firstpartner;
  tidcounter.resize(size(), 0);
  tid_firstpartner.resize(size(), -1);

  // will we need to check template ends?
  // rationale: if no info about template ends was available in the
  //  ancillary data, the no_te_check will ensure that we'll then still use
  //  the template information
  bool notecheck=true;
  for(uint32 i=0; i<size();i++){
    if(REP_thepool[i].getTemplateEnd()!='N') {
      notecheck=false;
    }
  }


  typedef boost::unordered_map<std::string, int32> strintmap;
  strintmap tnmap;
  strintmap::iterator tnI;

  int32 acttid=0;
  int32 validreads=0;

  for(uint32 i=0; i<size();i++){
      //cout << "acttid: " << acttid << "\t";
    if(REP_thepool[i].hasValidData()==false) continue;
    validreads++;

    tnI=tnmap.find(REP_thepool[i].getInternalTemplateName());
    if(tnI!=tnmap.end()){
      REP_thepool[i].setTemplateID(tnI->second);
      tidcounter[tnI->second]++;
        
      int32 firstpartner=tid_firstpartner[tnI->second];
      REP_thepool[i].setTemplatePartnerID(firstpartner);
      REP_thepool[firstpartner].setTemplatePartnerID(i);
    }else{
      REP_thepool[i].setTemplateID(acttid);

      tid_firstpartner[acttid]=i;

      tidcounter[acttid]++;
      tnmap[REP_thepool[i].getInternalTemplateName()]=acttid;
      acttid++;
    }
  }


  bool faultfound=false;
  for(uint32 i=0; i<size(); i++){
    CEBUG("Checking template " << i << "\t" << tidcounter[i] << endl);
    if(tidcounter[i]>2){
      acttid--;
      faultfound=true;
      cerr << "The reads ";
      for(uint32 j=0; j<size(); j++){
	if(REP_thepool[j].getTemplateID()==static_cast<int32>(i)){
	  // TODO: make configurable
	  cerr << REP_thepool[j].getName() << " ";
	  REP_thepool[j].setTemplateID(-1);
	  REP_thepool[j].setTemplatePartnerID(-1);
	}
      }
      cerr << "have the same template and MIRA was unable to differentiate between different trials. Template information not used there.\n";
    } else {
      //Read::setCoutType(Read::AS_TEXT);
      //cout << REP_thepool[i];
    }
  }

  if(verbose){
    cout << "Generated " << acttid << " unique template ids for " << validreads << " valid reads.\n";
  }

  if(acttid==validreads){
    FUNCEND();
    return false;
  }

  FUNCEND();
  return true;
}


/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/


void ReadPool::makeStrainIDs(bool verbose)
{
  FUNCSTART("void ReadPool::makeStrainIDs()");

  if(verbose){
    rpDateStamp();
    cout << endl;
  }

  //vector<int32> tidcounter;
  //tidcounter.resize(size(), 0);

  typedef boost::unordered_map<std::string, int8> strintmap;
  strintmap sidmap;
  strintmap::iterator sidI;

  // fill in the default strain (Id=0)
  REP_strainnames.clear();
  sidmap["default"]=REP_strainnames.size();
  REP_strainnames.push_back("default");

  
  // go through readpool in two rounds, first just looking at Solexa reads,
  //  then at all remaining reads
  // reason: let Solexa have the low strain IDs 0-7, so that they can
  //  be mapped with merge option in the contig

  for(uint32 round=0; round < 2;++round){
    for(uint32 rid=0; rid<size();++rid){
      if(REP_thepool[rid].hasValidData()==false) continue;
      if(REP_thepool[rid].isRail()) continue;
      if(REP_thepool[rid].getStrain().size()==0) continue;
      if(round==0 && REP_thepool[rid].getSequencingType()!=Read::SEQTYPE_SOLEXA) continue;
      if(round>0 && REP_thepool[rid].getSequencingType()==Read::SEQTYPE_SOLEXA) continue;
      sidI=sidmap.find(REP_thepool[rid].getStrain());
      if(sidI!=sidmap.end()){
	REP_thepool[rid].setStrainID(sidI->second);
      }else{
	if(REP_strainnames.size()==127){
	  MIRANOTIFY(Notify::FATAL, "More than 127 strains encountered? Sorry, not possible. Strain " << REP_thepool[rid].getStrain() << " in read " << REP_thepool[rid].getName() << "\n");
	}
	REP_thepool[rid].setStrainID(REP_strainnames.size());
	if(verbose){
	  cout << "Seeing strain " << REP_strainnames.size() << ": \"" << REP_thepool[rid].getStrain() << "\"" << endl;
	}
	sidmap[REP_thepool[rid].getStrain()]=REP_strainnames.size();
	REP_strainnames.push_back(REP_thepool[rid].getStrain());
      }
      //cout << "Read " <<REP_thepool[i].getName() << " has strain " << REP_thepool[i].getStrain() << endl;
    }
  }

  if(verbose){
    cout << "Generated " << REP_strainnames.size()-1 << " unique strain ids for " << size() << " reads.\n";
  }
  return;
}




/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/

void ReadPool::dumpStrainIDSummary()
{
  FUNCSTART("void ReadPool::dumpStrainIDSummary()");
  vector<int32> tidcounter;
  tidcounter.resize(REP_strainnames.size(), 0);
  for(uint32 i=0; i<size();i++){
    if(REP_thepool[i].isRail()) continue;
    tidcounter[REP_thepool[i].getStrainID()]++;
  }

  for(uint32 sid=0; sid<REP_strainnames.size();sid++){
    cout << "Strain \"" << REP_strainnames[sid] << "\" has " << tidcounter[sid] << " reads.\n";
  }

  FUNCEND();
}



/*************************************************************************
 *
 * Looks for strainname in the strain name table (linear search).
 *
 * returns
 *  true and the index of the first match in sid if the name is present
 *  false and -1 in sid otherwise
 *
 *************************************************************************/

bool ReadPool::getStrainIDOfStrain(const string & strainname, int32 & sid) const
{
  // sid itself is the running index, like before
  sid=0;
  while(static_cast<size_t>(sid)<REP_strainnames.size()){
    if(REP_strainnames[sid]==strainname){
      return true;
    }
    ++sid;
  }

  // not in the table
  sid=-1;
  return false;
}




/*************************************************************************
 *
 * Returns the name stored in the strain name table for strain ID sid.
 * An ID outside of the table is caught by BUGIFTHROW.
 *
 *************************************************************************/
const string & ReadPool::getStrainOfStrainID(uint32 sid) const
{
  FUNCSTART("const string & ReadPool::getStrainOfStrainID(uint32 sid)");

  BUGIFTHROW(sid >= REP_strainnames.size(),"sid >= REP_strainnames.size()");

  // the ID is nothing else than the index into the table
  const string & strainname=REP_strainnames[sid];

  FUNCEND();

  return strainname;
}




/*************************************************************************
 *
 *
 * loadaction:
 *   //  0 = count only
 *   //  1 = count, adjust readpool capacity and load
 *   //  2 = load only
 *
 *
 *************************************************************************/

size_t ReadPool::loadEXPs(const string & fofn, const uint8 loadaction, uint32 & longestread, const uint8 readtype, void (*callback)(ReadPool &))
{
  FUNCSTART("size_t ReadPool::loadEXPs(const string & fofn, const uint8 loadaction, uint32 & longestread, const uint8 readtype)");

  cout << "Loading file of filenames: " << fofn << endl;

  // Load the file of filenames 
  ifstream fin;
  fin.open(fofn.c_str(), ios::in|ios::ate);
  if(!fin){
    MIRANOTIFY(Notify::FATAL, "File not found: " << fofn);
  }

  uint32 len_fofn=fin.tellg();
  if(len_fofn==1){
    MIRANOTIFY(Notify::FATAL, "Zero length file: " << fofn);
  }
  fin.seekg(0, ios::beg);

  vector<string> names;
  {
    string filename, dummy;
    while(GeneralIO::readKeyValue(fin, filename, dummy)){
      names.push_back(filename);
    }
  }
  fin.close();

  bool stopprocessing=false;
  {
    //stringhash_t M;
    //pair<stringhash_t::const_iterator, stringhash_t::const_iterator> p;

    typedef boost::unordered_map<std::string, uint32> strintmap;
    strintmap namemap;
    strintmap::iterator nI;

    for(uint32 i=0; i< names.size(); i++){
      nI=namemap.find(names[i]);
      if(nI!=namemap.end()){
	cout << "WARNING: file " << names[i] << " is present more than once in your file of filenames." << endl;
	names[i].clear();
	stopprocessing=true;
      }else{
	namemap[names[i]]=i;
      }
    }
  }
  cout << "done." << endl;

  if(stopprocessing){
    MIRANOTIFY(Notify::FATAL, "Some reads lead to unrecoverable error: duplicate names. Aborting, see log above for further information.");
  }

  rpDateStamp();
  if(loadaction==0){
    if(longestread==0) return names.size();

    // if loadaction == 0 and longestread >0 : must load reads but not 
    //  store, only give back length of longest read
    
    cout << "Must analyse reads to find out about longest read length:\n";
    ProgressIndicator<int32> P(0, names.size()-1);

    longestread=0;
    Read dummy;
    for(uint32 i=0; i<names.size(); i++){
      P.progress(i);
      if(names[i].empty()) continue;
      try{
	if(REP_miraparams!=NULL) {
	  if((*REP_miraparams)[0].getDirectoryParams().dir_exp.size()!=0 ||
	     (*REP_miraparams)[0].getDirectoryParams().dir_scf.size()!=0 ) {
	    dummy.setDirectories((*REP_miraparams)[0].getDirectoryParams().dir_exp,
				 (*REP_miraparams)[0].getDirectoryParams().dir_scf
	      );
	  }
	}
	dummy.loadDataFromEXP(names[i]);
	longestread=max(longestread,dummy.getLenClippedSeq());
      }
      catch(Flow){
      }
      catch(Notify n){
	stopprocessing=true;
	n.handleError(THISFUNC);
      }
    }
    rpDateStamp();
    return names.size();
  }

  // Now load all these files.
  size_t no_files_ok=0;
  longestread=0;
  {
    uint32 pooli=REP_thepool.size();
    if(loadaction==1) REP_thepool.reserve(REP_thepool.size()+names.size()+10);
    REP_thepool.resize(REP_thepool.size()+names.size());
    
    ProgressIndicator<int32> P(0, names.size()-1);
    
    cout << "Loading EXP files: " << endl;
    
    //string completename;
    for(uint32 i=0; i<names.size(); i++, pooli++){
      P.progress(i);
      if(names[i].empty()) continue;
      try{
	if(REP_miraparams!=NULL) {
	  if((*REP_miraparams)[0].getDirectoryParams().dir_exp.size()!=0 ||
	     (*REP_miraparams)[0].getDirectoryParams().dir_scf.size()!=0 ) {
	    REP_thepool[pooli].setDirectories((*REP_miraparams)[0].getDirectoryParams().dir_exp,
					      (*REP_miraparams)[0].getDirectoryParams().dir_scf
					  );
	  }
	}
	REP_thepool[pooli].setSequencingType(readtype);
	REP_thepool[pooli].loadDataFromEXP(names[i]);
	no_files_ok++;
	longestread=max(longestread, REP_thepool[pooli].getLenClippedSeq());

	if(REP_miraparams!=NULL) {
	  // Set standard insert size if not given in EXP
	  // TODO: adapt to different read types
	  if(REP_thepool[pooli].getInsizeFrom() == -1
	     && REP_thepool[pooli].getInsizeTo() == -1){
	    REP_thepool[pooli].setInsize(
	      (*REP_miraparams)[0].getContigParams().con_insertsize_minimum,
	      (*REP_miraparams)[0].getContigParams().con_insertsize_maximum);
	  }
	}
      }
      catch(Flow){
      }
      catch(Notify n){
	stopprocessing=true;
	n.handleError(THISFUNC);
      }
      if(callback!=NULL) {
	(*callback)(*this);
      }
    }
    P.finishAtOnce();
    
    cout << "\nDone.\n";
    
    cout << "There haven been " << names.size() << " files given, " ;
    cout << no_files_ok << " of which have loaded ok.\n";
  }

  rpDateStamp();
  cout << "Adjusting sequencing vector clips from tags:\n";
  ProgressIndicator<int32> P(0, REP_thepool.size());
    
  for(uint32 i=0; i<REP_thepool.size(); i++){
    P.progress(i);
    REP_thepool[i].transferSVTagsToClip(20,60);
  }
  P.finishAtOnce();

  cout << endl;

  FUNCEND();

  return no_files_ok;
}



/*************************************************************************
 *
 * Runs checkSCFAndLoadQual() on every read of the pool that has valid
 *  data and prints a summary to cout.
 *
 * if force != 0, then load always the qualities from the SCF
 * if force == 0, just load qualities from SCF if no other has already
 *  been loaded
 *
 * loadfailnoerror = true means that simple load errors (not found etc) do not
 *                    produce an error on stderr
 *                   hard errors (mismatch fasta <-> scf) still do!
 *
 * f1name: file (truncated on open) receiving the SCF names of reads that
 *          have no SCF data afterwards (backbones and rails excepted)
 * f2name: file (truncated on open) receiving the SCF names of reads for
 *          which a Notify was thrown
 *
 * Reads which threw a Notify are discarded if the assembly parameter
 *  as_discard_read_on_eq_error is set. Without parameters
 *  (REP_miraparams == NULL) they are kept.
 *
 *************************************************************************/
void ReadPool::loadQualitiesFromSCF(uint32 force, bool loadfailnoerror, const string & f1name, const string & f2name)
{
  FUNCSTART("void ReadPool::loadQualitiesFromSCF()");

  // The parameter pointer may be NULL (the loaders of this file check for
  //  that, too), so it must not be dereferenced blindly. Without
  //  parameters, reads with a bad SCF are kept.
  bool discardonerror=false;
  if(REP_miraparams!=NULL){
    if((*REP_miraparams)[0].getAssemblyParams().as_discard_read_on_eq_error) {
      discardonerror=true;
    }
  }

  ofstream f1out;
  f1out.open(f1name.c_str(), ios::out|ios::trunc);
  ofstream f2out;
  f2out.open(f2name.c_str(), ios::out|ios::trunc);

  ProgressIndicator<int32> P(0, REP_thepool.size()-1);

  if(force!=0){
    cout << "Checking SCF files and loading qualities:\n";
  }else{
    cout << "Checking SCF files (loading qualities only if needed):\n";
  }

  uint32 filesok=0;
  uint32 filesnotfound=0;
  uint32 filesbad=0;

  for(uint32 i=0; i<REP_thepool.size(); i++){
    P.progress(i);
    if(REP_thepool[i].hasValidData()==true){
      try{
        if(force!=0 || REP_thepool[i].hasQuality()==false){
          REP_thepool[i].checkSCFAndLoadQual(false, loadfailnoerror);
        }else{
          // just check if SCF file is there and can correctly be loaded
          REP_thepool[i].checkSCFAndLoadQual(true, loadfailnoerror);
        }
        if(REP_thepool[i].hasSCFData()==false) {
          // missing SCF data is not counted for backbones and rails
          if(!(REP_thepool[i].isBackbone() || REP_thepool[i].isRail())) {
            f1out << REP_thepool[i].getSCFName() << endl;
            filesnotfound++;
          }
        } else {
          filesok++;
        }
      }
      catch(Notify n){
        cerr << "Ouch, read " << REP_thepool[i].getName() << " produced this error:\n";
        n.handleError(THISFUNC);
        f2out << REP_thepool[i].getSCFName() << endl;
        filesbad++;
        if(discardonerror) {
          REP_thepool[i].discard();
        }
      }
    }
  }
  P.finishAtOnce();

  f2out.close();
  f1out.close();

  cout << "\nDone." << endl;
  cout << filesok << " SCF files loaded ok.\n";
  if (filesnotfound) {
    cout << filesnotfound << " SCF files were not found (see '" << f1name << "' for a list of names).\n";
  }
  if (filesbad) {
    cout << filesbad << " SCF files do not match the data in the experiment file counterpart!!!\nYou really DO want to check back why (see logfile '" << f2name << "'\nfor a list of names and the stderr on why the load failed).\n";
    if(discardonerror) {
      cout << "Those reads were discarded from assembly.\n";
    }else{
      cout << "Reads kept in assembly, but only with default quality!\n";
    }
  }
  cout << "\n\n";

  FUNCEND();
}




/*************************************************************************
 *
 * Loads reads from a (possibly gzipped) FASTQ file. The file is read
 *  twice: a first pass counts the sequences, updates longestread and
 *  tracks the smallest quality character seen; the second pass appends
 *  one read per sequence to the pool.
 *
 * generatefilenames: only if this is true AND seqtype is Sanger, the read
 *  is named via setFileNamesFromFASTAName() and gets the EXP / SCF
 *  directories set. In all other cases only the name is set.
 *
 * Sets for each read its sequencing type (seqtype) and, if parameters are
 *  present, read naming scheme, standard insert size (if none set) and
 *  template build direction.
 *
 * FASTQ quality offset: taken from the parameters if given there (!= 0),
 *  else guessed from the smallest quality character seen in the first
 *  pass (also if there are no parameters at all).
 *
 * sxa_mustconvert is not used by this function.
 *
 * callback (if not NULL) is called after each read.
 *
 * loadaction:
 *   //  0 = count only
 *   //  1 = count, adjust readpool capacity and load
 *   //  2 = load only
 *
 * Returns the number of sequences counted in the first pass.
 *
 * Throws (via MIRANOTIFY, Notify::FATAL) if the file cannot be opened, if
 *  a read has a different number of quality values than bases, if quality
 *  values are below the FASTQ offset or if a read has no name.
 *
 *************************************************************************/
size_t ReadPool::loadDataFromFASTQ(const string & filename, const uint8 loadaction, uint32 & longestread, const bool generatefilenames, const uint8 seqtype, const bool sxa_mustconvert, void (*callback)(ReadPool &))
{
  FUNCSTART("void ReadPool::loadDataFromFASTQ(const string & filename, const string & qualfilename, const bool generatefilenames, const uint8 seqtype, const uint8 loadaction)");

  bool fatalloaderror=false;

  gzFile fp;
  kseq_t *seq;

  rpDateStamp();

  // first pass: count and gather statistics
  size_t numseqsloaded=0;
  base_quality_t minqualseen=255;
  {
    fp = gzopen(filename.c_str(), "r");
    if(fp==Z_NULL) {
      MIRANOTIFY(Notify::FATAL,"Could not open FASTQ file '" << filename << "'. Is it present? Is it readable? Did you want to load your data in another format?");
    }
    seq = kseq_init(fp);

    int l;
    cout << "Counting sequences in FASTQ file: "; cout.flush();

    // well, count sequences and also try guessing the FASTQ quality offset
    while ((l = kseq_read(seq)) >= 0) {
      numseqsloaded++;

      // no need to look any further once a value <= 33 has been seen
      if(seq->qual.l && minqualseen>33){
        const char * qi = seq->qual.s;
        for(;*qi; qi++) if(*qi<minqualseen) minqualseen=*qi;
      }
      // length of the read is the number of bases (was: number of quality
      //  values, which is 0 for entries without qualities)
      longestread=max(longestread,static_cast<uint32>(seq->seq.l));
    }

    cout << "found " << numseqsloaded << " sequences." << endl;

    kseq_destroy(seq);
    gzclose(fp);
  }

  rpDateStamp();

  if(loadaction==0) {
    FUNCEND();
    return numseqsloaded;
  }

  if(loadaction==1){
    // safety margin
    REP_thepool.reserve(REP_thepool.size()+numseqsloaded+20);
  }

  // second pass: load
  {
    fp = gzopen(filename.c_str(), "r");
    // the file may have vanished in between, so check this open as well
    if(fp==Z_NULL) {
      MIRANOTIFY(Notify::FATAL,"Could not open FASTQ file '" << filename << "'. Is it present? Is it readable? Did you want to load your data in another format?");
    }
    seq = kseq_init(fp);

    bool qualerror=false;

    // an offset given in the parameters wins; without parameters (NULL
    //  pointer, checked like further down in this function) or with an
    //  offset of 0 there, the offset is guessed
    bool guessoffset=true;
    base_quality_t fqqualoffset=minqualseen;
    if(REP_miraparams!=NULL
       && (*REP_miraparams)[seqtype].getAssemblyParams().as_fastq_qualoffset != 0){
      guessoffset=false;
      fqqualoffset=(*REP_miraparams)[seqtype].getAssemblyParams().as_fastq_qualoffset;
    }
    if(guessoffset){
      if(fqqualoffset != 33 && fqqualoffset<50) {
        cout << "Unusual offset of " << static_cast<uint16>(fqqualoffset) << ", guessing this file to be a Sanger-type FASTQ format.\n";
        fqqualoffset=33;
      }
      cout << "Using calculated FASTQ quality offset: " << static_cast<int16>(fqqualoffset) << '\n';
    }else{
      cout << "Using given FASTQ quality offset: " << static_cast<int16>(fqqualoffset) << '\n';
    }
    bool fqqualoffseterror=false;

    // solexa 1.0 format must be converted specially, later formats not anymore
    bool issolexa10format=false;
    if(seqtype==Read::SEQTYPE_SOLEXA){
      if(fqqualoffset>50 && fqqualoffset < 63) {
        issolexa10format=true;
        cout << "Guessing FASTQ quality values to be in Illumina/Solexa 1.0 format.\n";
      }
    }

    // buffers reused for every read
    vector<base_quality_t> bq;
    bq.reserve(1000);
    vector<int32> bq10sxaformat;
    bq10sxaformat.reserve(1000);

    rpDateStamp();

    cout << "Loading data from FASTQ file:" << endl;
    ProgressIndicator<int32> P(0, numseqsloaded);

    int l;
    while ((l = kseq_read(seq)) >= 0) {
      P.increaseprogress();
      addNewEmptyRead();

      if(seqtype == Read::SEQTYPE_SOLEXA){
        REP_thepool.back().disallowAdjustments();
      }

      // set read naming scheme according to default
      if(REP_miraparams!=NULL) {
        REP_thepool.back().setReadNamingScheme((*REP_miraparams)[seqtype].getAssemblyParams().as_readnaming_scheme);
      }

      if(generatefilenames && seqtype==Read::SEQTYPE_SANGER) {
        REP_thepool.back().setFileNamesFromFASTAName(seq->name.s);
      }else{
        REP_thepool.back().setName(seq->name.s);
      }

      // loading goes on, the error is raised at the very end
      if(REP_thepool.back().getName().empty()){
        cout << "Ouch, there's a read without a name? This is illegal. The sequence\n  "
             << seq->seq.s
             << "\nmust have a name!\n";
        fatalloaderror=true;
      }

      if(seq->seq.l==0){
        REP_thepool.back().setValidData(false);
      }else{
        REP_thepool.back().setSequenceFromString(seq->seq.s);
        if(seq->qual.l){
          if(seq->qual.l == seq->seq.l){
            // quality characters -> quality values; characters below the
            //  offset give 0 and are reported once per read
            bq.clear();
            const char * qi = seq->qual.s;
            bool noerrormsgprinted=true;
            for(;*qi; qi++) {
              if(*qi<fqqualoffset) {
                fqqualoffseterror=true;
                if(noerrormsgprinted){
                  cout << REP_thepool.back().getName()
                       << ": some quality values in FASTQ are below the chosen FASTQ offset of "
                       << static_cast<uint16>(fqqualoffset) << '\n';
                  noerrormsgprinted=false;
                }
                bq.push_back(0);
              }else{
                bq.push_back(*qi - fqqualoffset);
              }
            }
            if(issolexa10format){
              // Convert the whole read with one single call. Formerly the
              //  conversion was called once per base from within the copy
              //  loop, i.e. on a partially filled input and writing into
              //  the very vector which controlled that loop.
              bq10sxaformat.assign(bq.begin(), bq.end());
              solexaScoreToQual(bq10sxaformat,bq);
            }
            REP_thepool.back().setQualities(bq);
          }else{
            cout << REP_thepool.back().getName()
                 << ": different number of quality values than bases?\n";
            qualerror=true;
          }
        }
        if(generatefilenames
           && seqtype==Read::SEQTYPE_SANGER
           && REP_miraparams!=NULL) {
          if((*REP_miraparams)[0].getDirectoryParams().dir_exp.size()!=0 ||
             (*REP_miraparams)[0].getDirectoryParams().dir_scf.size()!=0 ) {
            REP_thepool.back().setDirectories((*REP_miraparams)[0].getDirectoryParams().dir_exp,
                                              (*REP_miraparams)[0].getDirectoryParams().dir_scf
              );
          }
        }
      }

      REP_thepool.back().setSequencingType(seqtype);

      if(REP_miraparams!=NULL) {
        // Set standard insert size if not given otherwise
        // TODO: adapt to different read types
        if(REP_thepool.back().getInsizeFrom() == -1
           && REP_thepool.back().getInsizeTo() == -1){
          REP_thepool.back().setInsize(
            (*REP_miraparams)[seqtype].getContigParams().con_insertsize_minimum,
            (*REP_miraparams)[seqtype].getContigParams().con_insertsize_maximum);
        }

        REP_thepool.back().setTemplateBuildDirection(
          (*REP_miraparams)[seqtype].getContigParams().con_template_build_direction);
      }

      if(callback!=NULL) {
        (*callback)(*this);
      }
    }
    P.finishAtOnce();
    cout << "\n";

    kseq_destroy(seq);
    gzclose(fp);

    if(qualerror){
      MIRANOTIFY(Notify::FATAL,"Unrecoverable error while loading data from FASTQ. Fix your input please.");
    }
    if(fqqualoffseterror){
      MIRANOTIFY(Notify::FATAL,"You might want to set the FASTQ offset for this technology to a lower value (-LR:fqqo=...).");
    }
  }

  cout << "\nDone.\n";

  cout << "Loaded " << numseqsloaded << " reads, " ;

  rpDateStamp();

  if(fatalloaderror) {
    MIRANOTIFY(Notify::FATAL, "Fatal error encountered during load of data (see log), aborting.\n") ;
  }

  FUNCEND();

  return numseqsloaded;
}


/*************************************************************************
 *
 * if generatefilenames == true (default) tells the read object to generate
 *  filenames according to convention. Else does not (and saves memory)
 *  
 * Sets for each read its type
 *
 * only if SequencingType=Sanger 
 *  set filenames (regardless of generatefilenames). 
 * Saves ~ 235M in 454 project with ~1 mio reads (= ~17% of initial memory
 *  consumption)
 *
 * loadaction:
 *   //  0 = count only
 *   //  1 = count, adjust readpool capacity and load
 *   //  2 = load only
 *
 *************************************************************************/
size_t ReadPool::loadDataFromFASTA(const string & filename, const uint8 loadaction, uint32 & longestread, const bool wantsqualfiletoexist, const string & qualfilename, const bool generatefilenames, const uint8 seqtype, const bool sxa_mustconvert, void (*callback)(ReadPool &))
{
  FUNCSTART("size_t ReadPool::loadDataFromFASTA(const string & filename, const uint8 loadaction, uint32 & longestread, const bool wantsqualfiletoexist, const string & qualfilename, const bool generatefilenames, const uint8 seqtype, const bool sxa_mustconvert)");

  ifstream fin;

  bool callcallbackforeachread=true;
  bool hasqualfile=false;
  if(qualfilename.size()!=0){
    fin.open(qualfilename.c_str(), ios::in|ios::ate);
    if(!fin){
      cout << "Could not find FASTA quality file " << qualfilename.c_str();
      if(wantsqualfiletoexist){
	cout << ", aborting. If you want to work without qualities, use -LR:wqf=no\n";
	MIRANOTIFY(Notify::FATAL, "File not found: " << qualfilename);
      }else{
	cout << ", using default values for these reads.\n";
      }
    }else{
      hasqualfile=true;
      callcallbackforeachread=false;
    }
    if(!fin.tellg() && wantsqualfiletoexist){
      MIRANOTIFY(Notify::FATAL, "FASTA quality file " << qualfilename << " has zero length? Seems fishy.");
    }
    fin.close();
    fin.clear();
  }else{
    if(wantsqualfiletoexist){
      MIRANOTIFY(Notify::FATAL, "FASTA quality file expected to exist, but no quality filename given???");
    }
  }

  fin.open(filename.c_str(), ios::in|ios::ate);
  if(!fin){
    MIRANOTIFY(Notify::FATAL, "File not found: " << filename);
  }
  if(!fin.tellg()){
    MIRANOTIFY(Notify::FATAL, "Zero length file: "  << filename);
  }

  bool fatalloaderror=false;

  // these two are for the automatic conversion of Solexa base scores
  //  to quality values
  bool sxa_foundnegativevalue=false;

  streamsize fsize=fin.tellg();

  uint32 formerpoolsize=REP_thepool.size();

  // qualset looks whether the loaded fastas had quality values
  vector<bool> qualset;    
  if(!REP_thepool.empty()) qualset.resize(REP_thepool.size(),true);

  rpDateStamp();

  size_t numseqsloaded=0;
  size_t numbasesloaded=0;
  if(loadaction<2){
    cout << "Counting sequences in FASTA file:\n";

    bool noerrormsgprinted=true;

    fin.seekg(0, ios::beg);
    ProgressIndicator<streamsize> P(0, fsize,1000);
    FASTA thefasta;
    while(1){
      thefasta.loadNextSeq(fin);
      if(P.delaytrigger()) P.progress(fin.tellg());
      if(thefasta.testIfEmpty()) {
	break;
      }
      longestread=max(longestread,static_cast<uint32>(thefasta.getSequence().size()));
      if(thefasta.getSeqName().empty()){
	cout << "Ouch, there's a read without a name? This is illegal. The sequence\n  " 
	     << thefasta.getSequence()
	     << "\nmust have a name! It's read number " << numseqsloaded+1 << endl;
	noerrormsgprinted=false;
      }

      numseqsloaded++;
    }
    P.finishAtOnce();
    cout << "\nFound " << numseqsloaded << " sequences." << endl;

    if(!noerrormsgprinted){
      MIRANOTIFY(Notify::FATAL, "Fatal error encountered during load of data (see log), aborting.\n") ;
    }
  }

  rpDateStamp();
  if(loadaction==0) return numseqsloaded;

  if(loadaction==1){
    // safety margin
    REP_thepool.reserve(REP_thepool.size()+numseqsloaded+20);
  }

  numseqsloaded=0;
  {
    cout << "Loading data from FASTA file:\n";
    fin.clear();
    fin.seekg(0, ios::beg);
    ProgressIndicator<streamsize> P(0, fsize-1,1000);
    FASTA thefasta;
    while(1){
      thefasta.loadNextSeq(fin);
      if(P.delaytrigger()) P.progress(fin.tellg());
      if(thefasta.testIfEmpty()) {
	// no more sequences.
	break;
      }
      ++numseqsloaded;
      addNewEmptyRead();
      //cout << "Name: " << (char*) name.begin() << endl;
      
      if(seqtype == Read::SEQTYPE_SOLEXA){
	REP_thepool.back().disallowAdjustments();
      }

      // set read naming scheme according to default
      if(REP_miraparams!=NULL) {
	REP_thepool.back().setReadNamingScheme((*REP_miraparams)[seqtype].getAssemblyParams().as_readnaming_scheme);
      }

      if(generatefilenames && seqtype==Read::SEQTYPE_SANGER) {
	REP_thepool.back().setFileNamesFromFASTAName(thefasta.getSeqName());
      }else{
	REP_thepool.back().setName(thefasta.getSeqName());
      }

      if(thefasta.getSequence().empty()){
	REP_thepool.back().setValidData(false);
	cout << "\nWarning: " << thefasta.getSeqName() << " has no bases?! This usually points at some error in the processing of data before it arrives to MIRA.\n";
      }else{
	REP_thepool.back().setSequenceFromString(thefasta.getSequence());
	numbasesloaded+=thefasta.getSequence().size();
	if(generatefilenames 
	   && seqtype==Read::SEQTYPE_SANGER 
	   && REP_miraparams!=NULL) {
	  if((*REP_miraparams)[0].getDirectoryParams().dir_exp.size()!=0 ||
	     (*REP_miraparams)[0].getDirectoryParams().dir_scf.size()!=0 ) {
	    REP_thepool.back().setDirectories((*REP_miraparams)[0].getDirectoryParams().dir_exp,
					      (*REP_miraparams)[0].getDirectoryParams().dir_scf
	      );
	  }
	}
      }

      REP_thepool.back().setSequencingType(seqtype);

      if(REP_miraparams!=NULL) {
	// standard quality at first, may be overwritten later in load stage
	base_quality_t bdq=(*REP_miraparams)[seqtype].getAssemblyParams().as_basedefaultqual;

	if(REP_thepool.back().hasValidData()
	 && !(REP_thepool.back().isBackbone() || REP_thepool.back().isRail())) {
	  REP_thepool.back().setQualities(bdq);
	  REP_thepool.back().setQualityFlag(false);
	}

	// Set standard insert size if not given
	if(REP_thepool.back().getInsizeFrom() == -1
	   && REP_thepool.back().getInsizeTo() == -1){
	  REP_thepool.back().setInsize(
	    (*REP_miraparams)[seqtype].getContigParams().con_insertsize_minimum,
	    (*REP_miraparams)[seqtype].getContigParams().con_insertsize_maximum);
	}

	REP_thepool.back().setTemplateBuildDirection(
	  (*REP_miraparams)[seqtype].getContigParams().con_template_build_direction);
      }

      if(callcallbackforeachread && callback!=NULL) {
	(*callback)(*this);
      }
    }
    P.finishAtOnce();
    cout << "\n";
  }

  fin.close();

  rpDateStamp();


  //stringhash_t M;
  //pair<stringhash_t::const_iterator, stringhash_t::const_iterator> p;

  typedef boost::unordered_map<std::string, int32> strintmap;
  strintmap rnmap;
  strintmap::iterator rnI;

  // make a quick hash lookup of read names
  {
    bool haserror=false;
    for(uint32 i=0; i<size();i++){
      if(REP_thepool[i].hasValidData()==false) continue;
      if(REP_thepool[i].getName().size()==0) continue;
      rnI=rnmap.find(REP_thepool[i].getName());
      if(rnI!=rnmap.end()){
	//cout << "uh oh ... double?";
	haserror=true;
	rnI->second+=1;
      }else{
	rnmap[REP_thepool[i].getName()]=1;;
      }
    }

    if(haserror){
      for(rnI=rnmap.begin(); rnI!=rnmap.end(); ++rnI){
	if(rnI->second > 1){
	  cout << "Error: read name " << rnI->first << " present " << rnI->second << " times in readpool!\n";
	}
      }
      MIRANOTIFY(Notify::FATAL, "Read names not unique (either in this file or together with files loaded earlier): " << filename);
      
    }

    // now re-fill the rnmap with the read-ids
    rnmap.clear();
    cout << "rnm size: " << rnmap.size() << endl;
    for(uint32 i=0; i<size();i++){
      if(REP_thepool[i].hasValidData()==false) continue;
      if(REP_thepool[i].getName().size()==0) continue;
      rnmap[REP_thepool[i].getName()]=i;
    }

    //cout << "\n---\n";
    //rnI=rnmap.begin();
    //for(; rnI != rnmap.end(); ++rnI){
    //  cout << "rnif" << rnI->first << "\trnis: " << rnI->second << endl;
    //}
    //cout << "---\n";
  }

  //dumpAsMAF(cout);

  qualset.resize(REP_thepool.size(),false);
  int32 num_reads_qual_ok=0;
  if(hasqualfile && REP_thepool.size()) {
    FASTA thefasta;

    fin.clear();
    fin.open(qualfilename.c_str(), ios::in|ios::ate);

    if(!fin){
      cout << "Could not find FASTA quality file " << qualfilename.c_str();
      if(wantsqualfiletoexist){
	cout << ", aborting. If you want to work without qualities, use -LR:wqf=no\n";
	fatalloaderror=true;
      }else{
	cout << ", using default values for these reads.\n";
      }
    }
    if(!fin){
      cout << "Could not find FASTA quality file " << qualfilename.c_str() << " although I found it before loading the FASTA data???\nStrange ... please check what could have happened.\n";
      fatalloaderror=true;
    } else {
      cout << "Loading quality data from FASTA quality file " << qualfilename << ":\n";

      fsize=fin.tellg();
      if(fsize==0) fsize=1;
      fin.seekg(0, ios::beg);

      ProgressIndicator<streamsize> P(0, fsize-1,1000);

      while(1){
	thefasta.loadNextINTSeq(fin,255);
	if(P.delaytrigger()) P.progress(fin.tellg());
	if(thefasta.testIfEmpty()) break;
	try{
	  rnI=rnmap.find(thefasta.getQualName());
	  if(rnI!=rnmap.end()){
	    //cout << "Wanna set rnif" << rnI->first << "\trnis: " << rnI->second << endl;
	    if(seqtype==Read::SEQTYPE_SOLEXA){
	      const vector<int32> & svalues=thefasta.getINTValues();
	      vector<int32>::const_iterator sI=svalues.begin();
	      bool foundnegative=false;
	      if(!sxa_foundnegativevalue){
		for(;sI!=svalues.end(); sI++){
		  if(*sI<0) {
		    foundnegative=true;
		    sxa_foundnegativevalue=true;
		  }
		}
	      }
	      if(sxa_mustconvert){
		vector<base_quality_t> q;
		solexaScoreToQual(svalues,q);
		REP_thepool[rnI->second].setQualities(q);
	      }else{
		if(foundnegative){
		  cerr << "\n----------------------------\n";
		  cerr << thefasta.getQualName() << " has negative quality values although we are supposed to load new Solexa scores (phred, >=0). Could these be old Solexa scores instead?\n";
		  cerr << "\n----------------------------\n";
		  MIRANOTIFY(Notify::FATAL, "Found negative values in file where only values >= 0 were expected.\n") ;
		}
		REP_thepool[rnI->second].setQualities(thefasta.getQualities());
	      }
	    }else{
	      REP_thepool[rnI->second].setQualities(thefasta.getQualities());
	    }
	    qualset[rnI->second]=true;
	    num_reads_qual_ok++;
	  }else{
	    // TODO: make a WARNINGS file
	    cout << "Warning: " << thefasta.getQualName() << " has quality values, but was not present in sequence file?!\n";
	  }
	}
	catch(Notify n){
	  if(n.gravity==Notify::FATAL) fatalloaderror=true;
	  n.setGravity(Notify::WARNING);
	  n.handleError(THISFUNC);
	}
      }
      P.finishAtOnce();

      cout << "\n";

      //if(seqtype==Read::SEQTYPE_SOLEXA && !sxa_foundnegativevalue){
      //	cerr << "No negative values found in FASTA quality file of Solexa data??\n";
      //}
    }
  }else{
    cout << "No FASTA quality file given, using default qualities for all reads just loaded." << endl;
  }

  rpDateStamp();

//  if(REP_miraparams!=NULL) {
//    base_quality_t bdq=(*REP_miraparams)[seqtype].getAssemblyParams().as_basedefaultqual;
//    for(uint32 i=formerpoolsize; i<qualset.size(); i++){
//      if(qualset[i]==false
//	 && REP_thepool[i].hasValidData()
//	 && !(REP_thepool[i].isBackbone() || REP_thepool[i].isRail())) {
//	if(hasqualfile) cout << REP_thepool[i].getName() << " has no valid qualities, using default.\n";
//	REP_thepool[i].setQualities(bdq);
//	REP_thepool[i].setQualityFlag(false);
//      }
//    }
//  }

  if(REP_miraparams!=NULL) {
    for(uint32 i=formerpoolsize; i<qualset.size(); i++){
      if(qualset[i]==false
	 && REP_thepool[i].hasValidData()
	 && !(REP_thepool[i].isBackbone() || REP_thepool[i].isRail())) {
	if(hasqualfile) cout << REP_thepool[i].getName() << " has no valid qualities, using default.\n";
      }
    }
  }


  cout << "\nDone.\n";
  
  cout << "Loaded " << numseqsloaded << " reads with " 
       << numbasesloaded << " raw bases.\n"
       << num_reads_qual_ok << " reads have quality accounted for.\n";

  if(fatalloaderror) {
    MIRANOTIFY(Notify::FATAL, "Fatal error encountered during load of data (see log), aborting.\n") ;
  }

  if(!callcallbackforeachread && callback!=NULL) {
    (*callback)(*this);
  }

  FUNCEND();
  return numseqsloaded;
}


/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/
// TODO:
// ugly, slow, replace with lookup table
void ReadPool::solexaScoreToQual(const vector<int32> & s, vector<base_quality_t> & q)
{
  FUNCSTART("void ReadPool::solexaScoreToQual(const vector<int32> & s, vector<base_quality_t> & q)");

  vector<int32>::const_iterator sI=s.begin();
  q.clear();
  q.reserve(s.size());
  for(;sI!=s.end(); sI++){
    if(*sI<-9){
      q.push_back(0);
    }else if(*sI<-3){
      q.push_back(1);
    }else if(*sI>=10){
      q.push_back(*sI);
    }else if(*sI>=4){
      q.push_back(*sI+1);
    }else{
      switch(*sI){
      case -3 : 
      case -2 : {
	q.push_back(2);
	break;
      }
      case -1 : 
      case 0 :  {
	q.push_back(3);
	break;
      }
      case 1 : 
      case 2 :  {
	q.push_back(4);
	break;
      }
      case 3 :  {
	q.push_back(5);
	break;
      }
      default: {
	MIRANOTIFY(Notify::INTERNAL, "Oooops, this branch should never be run?!");
      }
      }
    }
    //cout << *sI << " == " << static_cast<uint16>(q.back()) << endl;
  }
  FUNCEND();
}


/*************************************************************************
 *
 *
 * loadaction:
 *   //  0 = count only
 *   //  1 = count, adjust readpool capacity and load
 *   //  2 = load only
 *
 *
 *************************************************************************/
size_t ReadPool::loadPHD(const string & filename, const uint8 loadaction, uint32 & longestread, void (*callback)(ReadPool &))
{
  FUNCSTART("void ReadPool::loadDataFromPHD(const string & filename, const uint8 loadaction)");

  ifstream fin;
  fin.open(filename.c_str(), ios::in|ios::ate);
  if(!fin){
    MIRANOTIFY(Notify::FATAL, "File not found: " << filename);
  }

  if(!fin.tellg()){
    MIRANOTIFY(Notify::FATAL, "Zero length file: " << filename);
  }

  std::streamoff fsize=fin.tellg();

  ProgressIndicator<std::streamoff> P(0, fsize-1,100);
  PHD thephd;



  // step 1= counting
  // 0=loading
  // -1 away
  size_t numseqsloaded=0;

  if(loadaction==0){
    cout << "Counting sequence data from PHD file:\n";
    fin.seekg(0, ios::beg);
    P.reset(0,fsize-1);
    
    while(1){
      try {
	thephd.loadNextSeq(fin);
      }
      catch (Notify n) {
	if(numseqsloaded==0) {
	  n.handleError(THISFUNC);
	}
      }
      // FIXME: check: reading tellg() seems to reset the read pointer on my home system
      if(P.delaytrigger()) P.progress(fin.tellg());
      if(thephd.testIfEmpty()) {
	// no more sequences.
	break;
      }
      longestread=max(longestread, static_cast<uint32>(thephd.getSequence().size()));
      numseqsloaded++;
    }
  }

  P.finishAtOnce();
  cout << endl;

  // ok, if we just counted, go away
  if(loadaction==0) return numseqsloaded;

  // if wanted, reserve enough space in the readpool
  if(loadaction==1){
    REP_thepool.reserve(REP_thepool.size()+numseqsloaded+10);
  }
    

  // need clear() to recover from EOF in first pass
  fin.clear();
  fin.seekg(0, ios::beg);
  P.reset(0,fsize-1);
  
  numseqsloaded=0;
  cout << "Loading sequence data from PHD file:\n";
  while(1){
    try {
      thephd.loadNextSeq(fin);
    }
    catch (Notify n) {
      if(numseqsloaded==0) {
	n.handleError(THISFUNC);
      }
    }
    // FIXME: check: reading tellg() seems to reset the read pointer on my home system
    if(P.delaytrigger()) P.progress(fin.tellg());
    if(thephd.testIfEmpty()) {
      // no more sequences.
      break;
    }
    numseqsloaded++;

    // okay, add a new read as we're in loading step
    addNewEmptyRead();
    //cout << "Name: " << thephd.getName() << endl;
    REP_thepool.back().setFileNamesFromFASTAName(thephd.getName());
    REP_thepool.back().setSequenceFromString(thephd.getSequence());
    REP_thepool.back().setQualities(thephd.getQualities());
    if(REP_miraparams!=NULL) {
      if((*REP_miraparams)[0].getDirectoryParams().dir_exp.size()!=0 ||
	 (*REP_miraparams)[0].getDirectoryParams().dir_scf.size()!=0 ) {
	REP_thepool.back().setDirectories((*REP_miraparams)[0].getDirectoryParams().dir_exp,
					  (*REP_miraparams)[0].getDirectoryParams().dir_scf
	  );
      }
      
      // Set standard insert size if not given
      // TODO: adapt to different read types
      if(REP_thepool.back().getInsizeFrom() == -1
	 && REP_thepool.back().getInsizeTo() == -1){
	REP_thepool.back().setInsize(
	  (*REP_miraparams)[0].getContigParams().con_insertsize_minimum,
	  (*REP_miraparams)[0].getContigParams().con_insertsize_maximum);
      }

    }
    //cout << REP_thepool.back();

    if(callback!=NULL) {
      (*callback)(*this);
    }
  }

  P.finishAtOnce();
  cout << endl;
  
  fin.close();
  
  cout << "\nDone.\n";
  
  cout << "There haven been " << numseqsloaded << " reads loaded from " << filename << endl;
  
  FUNCEND();
  return numseqsloaded;
}





/*************************************************************************
 *
 * GBF is one of the formats which may contain several sequences
 *  the object already loads them completely into memory
 *
 *************************************************************************/

void ReadPool::loadDataFromGBF(const string & filename)
{
  FUNCSTART("void ReadPool::loadDataFromGBF(const string & filename)");

  GBF thegbf;
  thegbf.load(filename);
  thegbf.transferGeneInfoToCDSInfo();

  REP_thepool.reserve(REP_thepool.size()+thegbf.getNumSequences()+10);
  for(uint32 i=0; i<thegbf.getNumSequences(); i++){
    addNewEmptyRead();

    //cout << "Read GBF " << i << endl;
    //cout << "\tName   : " << thegbf.getSequenceName(i) << endl;
    //cout << "\tLenseq : " << thegbf.getSequence(i).size() << endl;
    //cout << "\tNumtags: " << thegbf.getTags(i).size() << endl;

    REP_thepool.back().setFileNamesFromFASTAName(thegbf.getSequenceName(i));
    REP_thepool.back().setSequenceFromString(thegbf.getSequence(i));
    REP_thepool.back().setTags(thegbf.getTags(i));

    if(REP_miraparams!=NULL) {
      if((*REP_miraparams)[0].getDirectoryParams().dir_exp.size()!=0 ||
	 (*REP_miraparams)[0].getDirectoryParams().dir_scf.size()!=0 ) {
	REP_thepool.back().setDirectories((*REP_miraparams)[0].getDirectoryParams().dir_exp,
					  (*REP_miraparams)[0].getDirectoryParams().dir_scf
	  );
      }

      // Set standard insert size if not given
      // TODO: adapt to different read types
      if(REP_thepool.back().getInsizeFrom() == -1
	 && REP_thepool.back().getInsizeTo() == -1){
	REP_thepool.back().setInsize(
	  (*REP_miraparams)[0].getContigParams().con_insertsize_minimum,
	  (*REP_miraparams)[0].getContigParams().con_insertsize_maximum);
      }

      REP_thepool.back().setTemplateBuildDirection(
	(*REP_miraparams)[0].getContigParams().con_template_build_direction);
    }
  }

  FUNCEND();
}

/*************************************************************************
 *
 * GFF3 is one of the formats which may contain several sequences
 *  the object already loads them completely into memory
 *
 *************************************************************************/

// Loads all sequences of a GFF3 file and appends one read per sequence
//  to the pool. Name, sequence and tags come from the GFF parser object.
//  If MIRA parameters are available, the directories, the default insert
//  size and the template build direction of parameter set 0 are applied.
void ReadPool::loadDataFromGFF3(const string & filename)
{
  // this string used to name the GBF loader (copy & paste), which made
  //  diagnostics point to the wrong function
  FUNCSTART("void ReadPool::loadDataFromGFF3(const string & filename)");

  GFFParse thegff;
  thegff.loadFile(filename);

  // one reserve up front instead of repeated growth while appending
  REP_thepool.reserve(REP_thepool.size()+thegff.getNumSequences()+10);
  for(uint32 i=0; i<thegff.getNumSequences(); ++i){
    addNewEmptyRead();

    //cout << "Read GFF3 " << i << endl;
    //cout << "\tName   : " << thegff.getSequenceName(i) << endl;
    //cout << "\tLenseq : " << thegff.getSequence(i).size() << endl;
    //cout << "\tNumtags: " << thegff.getTags(i).size() << endl;

    REP_thepool.back().setFileNamesFromFASTAName(thegff.getSequenceName(i));
    REP_thepool.back().setSequenceFromString(thegff.getSequence(i));
    REP_thepool.back().setTags(thegff.getTags(i));

    if(REP_miraparams!=NULL) {
      // directories are set only if at least one of them is configured
      if((*REP_miraparams)[0].getDirectoryParams().dir_exp.size()!=0 ||
         (*REP_miraparams)[0].getDirectoryParams().dir_scf.size()!=0 ) {
        REP_thepool.back().setDirectories((*REP_miraparams)[0].getDirectoryParams().dir_exp,
                                          (*REP_miraparams)[0].getDirectoryParams().dir_scf
          );
      }

      // Set standard insert size if not given
      // TODO: adapt to different read types
      if(REP_thepool.back().getInsizeFrom() == -1
         && REP_thepool.back().getInsizeTo() == -1){
        REP_thepool.back().setInsize(
          (*REP_miraparams)[0].getContigParams().con_insertsize_minimum,
          (*REP_miraparams)[0].getContigParams().con_insertsize_maximum);
      }

      REP_thepool.back().setTemplateBuildDirection(
        (*REP_miraparams)[0].getContigParams().con_template_build_direction);
    }
  }

  FUNCEND();
}

/*************************************************************************
 *
 * loads external strain data (should these not be provided by the EXPs)
 *
 * the data must be in a key-value file
 * key can be: read name, or filename of exp read or file name of caf read
 *   (may not contain spaces, sorry)
 * value is: a string describing the strain name (may contain spaces)
 * lines with # as first nonwhitespace character are comments and are skipped
 *
 *************************************************************************/

void ReadPool::loadStrainData(const string & sdfile)
{
  FUNCSTART("void ReadPool::loadStrainData(const string & sdfile)");

  //stringhash_t M;

  typedef boost::unordered_map<std::string, uint32> strintmap;
  strintmap rnmap;
  strintmap::iterator rnI;
  
  cout << "Building hash table ... "; cout.flush();
  for(uint32 i=0; i<size();i++){
    if(!REP_thepool[i].getName().empty()) {
      rnmap[REP_thepool[i].getName()]=i;
      //cout << "Inserted1: " << REP_thepool[i].getName() << endl;
    }
  }
  cout << "done. Assigning strains to reads:\n";

  ifstream fin;
  fin.open(sdfile.c_str(), ios::in|ios::ate);
  if(!fin){
    MIRANOTIFY(Notify::FATAL, "File not found: " << sdfile);
  }
  ProgressIndicator<streamsize> P(0, fin.tellg(),5000);
  fin.seekg(0, ios::beg);

  string readname, strain;
  uint32 numvals=0;
  while(GeneralIO::readKeyValue(fin, readname, strain)){
    if(P.delaytrigger()) P.progress(fin.tellg());
    if(!strain.empty()){
      rnI=rnmap.find(readname);
      if(rnI!=rnmap.end()){
	REP_thepool[rnI->second].setStrain(strain.c_str());
	// TODO: move this output to a log file
	//cout << readname << " has id " << rnI->second << " in readpool, setting strain to " << strain << endl;
      }else{
	//cout << readname << " from straindata not found in readpool.\n";
      }
    }
    numvals++;
  }
  fin.close();
  P.finishAtOnce();

  cout << "\nRead " << numvals << " straindata lines." << endl;

  uint nostrain=0;
  for(uint32 i=0; i<size();i++){
    if(REP_thepool[i].hasValidData()==true 
       && REP_thepool[i].getStrain().size()==0
       && !REP_thepool[i].isRail()
       && !REP_thepool[i].isBackbone()){
      ++nostrain;
    }
  }

  if(nostrain){
    cout << "Warning: after loading data from strain data file \"" << sdfile << "\",\n"
	 << nostrain << " reads have still no strain. Using empty strain there.\n";
    cout << "A list of reads without strain data can be made by analysing the"
      "\ncheckpoint file of MIRA in MAF format.\n";
  }

//#endif

  FUNCEND();
}






/*************************************************************************
 *
 * loads names of reads from external file and deletes them from pool
 * if invertselection true, then delets those not in the file
 *
 * BEWARE SIDE EFFECT: deletes all reads from pool that have invalid data
 *
 * the data must be in a key (a value in one line)
 * key can be: read name, or filename of exp read or file name of caf read
 *   (may not contain spaces, sorry)
 *
 * line with # as first nonwhitespace character are comments and read over
 *
 *************************************************************************/

void ReadPool::deleteReadsByName(const string & nfile, bool invertselection)
{
  FUNCSTART("void ReadPool::InvalidateReadsByName(const string & nfile)");

  //stringhash_t M;
  typedef boost::unordered_map<std::string, uint32> strintmap;
  strintmap rnmap;
  strintmap::iterator rnI;

  for(uint32 i=0; i<size();i++){
    if(!REP_thepool[i].getName().empty()) {
      rnmap[REP_thepool[i].getName()]=i;
    }
  }

  ifstream fin;
  fin.open(nfile.c_str(), ios::in|ios::ate);
  if(!fin){
    MIRANOTIFY(Notify::FATAL, "File not found: " << nfile);
  }
  fin.seekg(0, ios::beg);

  if(invertselection){
    for(uint32 i=0; i<size();i++){
      REP_thepool[i].setValidData(false);
    }
  }

  string readname, dummy;
  uint32 numvals=0;
  while(GeneralIO::readKeyValue(fin, readname,dummy)){
    rnI=rnmap.find(readname);
    if(rnI!=rnmap.end()) {
      if(invertselection){
	REP_thepool[rnI->second].setValidData(true);
      }else{
	REP_thepool[rnI->second].setValidData(false);
      }
    }
  }
  fin.close();

  if(size()){
    for(int32 i=size()-1; i >= 0; i--){
      if(!REP_thepool[i].hasValidData()){
	vector<Read>::iterator I=REP_thepool.begin();
	advance(I,i);
	REP_thepool.erase(I);
      }
    }
  }

  FUNCEND();
}




/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/
void ReadPool::mergeXMLTraceInfo(const string & xmlfile)
{
  FUNCSTART("void ReadPool::mergeXMLTraceInfo(const string & filename)");

  rpDateStamp();

  cout << "Merging data from XML trace info file " << xmlfile << " ...";
  cout.flush();

  //make this called!

  string id454="454";
  uint32 numfound=0;

  NCBIInfoXML nix;

  list<NCBIInfoXML::ncbitraceelements_t> traces;

  try{
    nix.readXMLFile(xmlfile, traces);
  }
  catch(Notify n) {
    cout << "\n\n\nMIRA tried to load a XML TRACEINFO file containing ancillary data, but failed.\n"
      "Loading ancillary data when using FASTA files as input is\n"
      "really,\n"
      "        really,\n"
      "                REALLY encouraged, and therefore MIRA sets this as default.\n"
      "\nHowever, if you are really sure that you do not want to load ancillary data\n"
      "in TRACEINFO files, you can switch it off.\n"
      "Either use '<technology>_SETTINGS -LR:mxti=no' (e.g. SANGER_SETTING -LR:mxti=no),\n"
      "or use the '-notraceinfo' quickswitch to kill loading traceinfo files for all\n"
      "types of sequencing technologies. (place it after -fasta and -job quickswitches)\n\n\n";
    n.handleError(THISFUNC);
  }


  cout << "Num reads: " << traces.size() << endl;

  typedef boost::unordered_map<std::string, int32> strintmap;
  strintmap rnmap;
  strintmap::iterator rnI;

  cout << "Building hash table ... "; cout.flush();

  for(uint32 i=0; i<size();i++){
    if(!REP_thepool[i].getName().empty()) {
      rnmap[REP_thepool[i].getName()]=i;
    }
  }
  cout << "done." << endl;

  string acttracename;
  list<uint32>::const_iterator E;
  list<string>::const_iterator ECD;

  list<NCBIInfoXML::ncbitraceelements_t>::const_iterator T=traces.begin();

  for(;T!=traces.end(); T++) {
    rnI=rnmap.end();

    E=T->elements.begin();
    ECD=T->elements_cdata.begin();
    bool found=false;
    for(;!found && E!=T->elements.end(); E++, ECD++) {
      if((*E == NCBIInfoXML::NCBIXML_TRACE_NAME
	  || *E == NCBIInfoXML::NCBIXML_TI)
	 && !ECD->empty()){
	if(*E == NCBIInfoXML::NCBIXML_TRACE_NAME){
	  acttracename=*ECD;
	} else if(*E == NCBIInfoXML::NCBIXML_TI){
	  acttracename="gnl|ti|"+*ECD;
	}
	rnI=rnmap.find(acttracename);

	if(rnI!=rnmap.end()){
	  numfound++;
	  found=true;
	}
      }
    }
    
    if(found){
      int32 idoffound=rnI->second;
      // cout << "Found " << REP_thepool[idoffound].getName() << endl; 
      // Read::setCoutType(Read::AS_TEXTCLIPS);
      // cout << REP_thepool[idoffound];
      
      int32 insertsize=-1;
      int32 insertstdev=-1;
      //int32 inssizemin=-1;
      //int32 inssizemax=-1;
      
      uint8 seqtype=Read::SEQTYPE_SANGER;
      
      E=T->elements.begin();
      ECD=T->elements_cdata.begin();
      for(;E!=T->elements.end(); E++, ECD++) {
	switch(*E) {
	case NCBIInfoXML::NCBIXML_TRACE_NAME : {
	  break;
	}
	case NCBIInfoXML::NCBIXML_TRACE_FILE : {
	  //if(!REP_thepool[idoffound].hasSCFData()) {
	  //set only when first load attempt failed?
	  
	  string path, filename;
	  splitFullPathAndFileName(*ECD, path, filename);
	  REP_thepool[idoffound].setSCFFileName(filename);
	  //REP_thepool[idoffound].setSCFDirectory(path);
	  //}
	  break;
	}
	case NCBIInfoXML::NCBIXML_TRACE_TYPE_CODE : {
	  if(*ECD == id454){
	    seqtype=Read::SEQTYPE_454GS20;
	  }
	  break;
	}
	case NCBIInfoXML::NCBIXML_CLIP_QUALITY_LEFT  : {
	  REP_thepool[idoffound].setLQClipoff(atoi(ECD->c_str()));
	  break;
	}
	case NCBIInfoXML::NCBIXML_CLIP_QUALITY_RIGHT  : {
	  REP_thepool[idoffound].setRQClipoff(atoi(ECD->c_str()));
	  break;
	}
	case NCBIInfoXML::NCBIXML_CLIP_VECTOR_LEFT  : {
	  REP_thepool[idoffound].setLSClipoff(atoi(ECD->c_str()));
	  break;
	}
	case NCBIInfoXML::NCBIXML_CLIP_VECTOR_RIGHT  : {
	  REP_thepool[idoffound].setRSClipoff(atoi(ECD->c_str()));
	  break;
	}
	case NCBIInfoXML::NCBIXML_INSERT_SIZE  : {
	  insertsize=atoi(ECD->c_str());
	  break;
	}
	case NCBIInfoXML::NCBIXML_INSERT_STDEV  : {
	  insertstdev=atoi(ECD->c_str());
	  break;
	}
//	case NCBIInfoXML::NCBIXML_INSERT_SIZE_MIN  : {
//	  inssizemin=atoi(ECD->c_str());
//	  if(inssizemax<0) inssizemax=inssizemin;
//	  break;
//	}
//	case NCBIInfoXML::NCBIXML_INSERT_SIZE_MAX  : {
//	  inssizemax=atoi(ECD->c_str());
//	  if(inssizemin<0) inssizemin=inssizemax;
//	  break;
//	}
	case NCBIInfoXML::NCBIXML_TEMPLATE_ID  : {
	  REP_thepool[idoffound].setTemplate(ECD->c_str());
	  //cout<< *ECD << endl;
	  break;
	}
	case NCBIInfoXML::NCBIXML_TRACE_END  : {
	  if(strlen(ECD->c_str())>0){
	    switch(toupper(ECD->c_str()[0])){
	    case 'F': // fall through
	    case 'R': {
	      REP_thepool[idoffound].setTemplateEnd(toupper(ECD->c_str()[0]));
	      break;
	    }
	    case 'U' : // fall through
	    case 'N' : {
	      REP_thepool[idoffound].setTemplateEnd('N');
	      break;
	    }
	    default : {
	      MIRANOTIFY(Notify::FATAL, "Illegal trace_end, it's not one of F/R/N(U) (or empty): " << ECD->c_str(););
	    }
	    }
	  }
	  break;
	}
	case NCBIInfoXML::NCBIXML_MACHINE_TYPE  : {
	  REP_thepool[idoffound].setMachineType(ECD->c_str());
	  //cout<< *ECD << endl;
	  //cout << REP_thepool[idoffound];
	  //abort();
	  break;
	}
	case NCBIInfoXML::NCBIXML_PROGRAM_ID  : {
	  REP_thepool[idoffound].setBasecaller(ECD->c_str());
	  //cout<< *ECD << endl;
	  break;
	}
	case NCBIInfoXML::NCBIXML_STRAIN  : {
	  REP_thepool[idoffound].setStrain(ECD->c_str());
	  //cout<< *ECD << endl;
	  break;
	}
	case NCBIInfoXML::NCBIXML_BASE_FILE  :
	case NCBIInfoXML::NCBIXML_QUAL_FILE  : {
	  break;
	}
	default : {
	  // Ooooops?
	}
	}
      }
      // cout << "After:\n";
      // cout << REP_thepool[idoffound];
      
      if(insertsize>0 || insertstdev>0) {
	if(insertsize<0) {
	  cout << "Read " << REP_thepool[idoffound].getName() << ": there is 'INSERT_STDEV' but no 'INSERT_SIZE' in the XML? This is suspicious ...\n";
	  insertsize=0;
	}
	if(insertstdev<0) {
	  cout << "Read " << REP_thepool[idoffound].getName() << ": there is 'INSERT_SIZE' but no 'INSERT_STDEV' in the XML? This is suspicious ...\n";
	  insertstdev=0;
	}
	int32 diff=insertstdev*3;  // "standard" used by most people
	int32 min=insertsize-diff;
	if(min<0)min=0;
	REP_thepool[idoffound].setInsize(min,
					 insertsize+diff);
	//cout << REP_thepool[idoffound].getName() << ": " << min << "\t" << insertsize+diff << endl;
      }
      if(seqtype==Read::SEQTYPE_454GS20){
	REP_thepool[idoffound].setSequencingType(seqtype);
      }
      
      if(seqtype == Read::SEQTYPE_SOLEXA){
	MIRANOTIFY(Notify::INTERNAL, "Type Solexa needs more support 6.");
      }
      if(seqtype == Read::SEQTYPE_ABISOLID){
	MIRANOTIFY(Notify::INTERNAL, "Type ABI SOLiD needs more support 6.");
      }
    }
  }

  makeTemplateIDs();

  cout << "Done merging XML data, matched " << numfound << " reads." << endl;

  rpDateStamp();

  FUNCEND();
  return;
}





/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/
// Appends one default constructed read to the end of the pool and
//  returns a reference to it. No parameters (insert size, naming scheme,
//  default qualities) are applied here, that is left to the caller.
// Note: growing the pool may invalidate references to reads obtained
//  earlier.
Read & ReadPool::addNewEmptyRead()
{
  FUNCSTART("Read & ReadPool::addNewEmptyRead()");

  // grow by exactly one element
  REP_thepool.resize(1+REP_thepool.size());
  Read & newread=REP_thepool.back();

  FUNCEND();
  return newread;
}





/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/
// Returns the read stored at position 'index' of the pool.
// An index outside of the pool is reported via MIRANOTIFY as INTERNAL
//  error.
Read & ReadPool::getRead(uint32 index)
{
  FUNCSTART("Read & ReadPool::getRead(uint32 index)");

  const bool indexinpool=(index < REP_thepool.size());
  if(!indexinpool){
    MIRANOTIFY(Notify::INTERNAL, "error: index (" << index << ") >= REP_thepool.size() (" << REP_thepool.size() << ") ?");
  }

  FUNCEND();

  return REP_thepool[index];
}




/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/
// Returns the read with the given name. The position is determined by
//  getReadIndex(name); any error that function raises for an empty or
//  unknown name is passed on to the caller.
Read & ReadPool::getRead(const string & name)
{
  // step from the first read forward by the index of the named read
  return *(REP_thepool.begin()+getReadIndex(name));
}



/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/
// Returns the pool index of the first read whose name equals 'name'
//  (linear search through the pool).
// An empty name is reported via MIRANOTIFY as INTERNAL error, a name not
//  present in the pool as WARNING (isInPool() catches the latter, so
//  MIRANOTIFY presumably throws).
int32 ReadPool::getReadIndex(const string & name) const
{
  FUNCSTART("Read & ReadPool::getRead(const string & name)");

  if(name.empty()){
    MIRANOTIFY(Notify::INTERNAL, "tried to search empty name") ;
  }

  // iterator and running index advance together
  int32 readindex=0;
  vector<Read>::const_iterator readI=REP_thepool.begin();
  for(; readI!=REP_thepool.end(); ++readI, ++readindex){
    if(name==readI->getName()) break;
  }

  if(readI==REP_thepool.end()){
    MIRANOTIFY(Notify::WARNING,"Could not find read " << name << " in readpool.");
  }

  FUNCEND();

  return readindex;
}

/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/
// Returns the pool index of the first read whose name equals the C
//  string 'name' (linear search through the pool).
// A NULL pointer or an empty string is reported via MIRANOTIFY as
//  INTERNAL error, a name not present in the pool as WARNING.
int32 ReadPool::getReadIndex(const char * name) const
{
  FUNCSTART("Read & ReadPool::getRead(const char * name)");

  // must be "||": with "&&" a NULL pointer would be dereferenced and an
  //  empty string would never be rejected
  if(name==NULL || *name==0) {
    MIRANOTIFY(Notify::INTERNAL, "tried to search empty name") ;
  }

  vector<Read>::const_iterator I=REP_thepool.begin();

  int32 i=0;
  while(I!=REP_thepool.end()){
    if(strcmp(name,I->getName().c_str()) == 0) break;
    ++I; ++i;
  }

  if(I==REP_thepool.end()){
    MIRANOTIFY(Notify::WARNING,"Could not find read " << name << " in readpool.");
  }

  FUNCEND();

  return i;
}



/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/

// Tells whether a read with the given name is in the pool.
// Implemented on top of getReadIndex(): anything thrown by that lookup
//  (including the error for an empty name) is taken as "not in pool".
bool ReadPool::isInPool(const string & name) const
{
  bool inpool=true;
  try{
    getReadIndex(name);
  }
  catch (...){
    inpool=false;
  }
  return inpool;
}



/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/
// Writes the reads of the pool to 'ostr', in pool order.
//
//  astype:       output type, handed to Read::setCoutType() before any
//                read is written
//  alsoinvalids: if true, reads without valid data are written, too
void ReadPool::dumpAs(ostream & ostr, uint8 astype, bool alsoinvalids) const
{
  FUNCSTART("void ReadPool::dumpAs(ostream & ostr, uint8 astype, bool alsoinvalids) const)");

  Read::setCoutType(astype);

  for(uint32 readid=0; readid<REP_thepool.size(); ++readid){
    // skip what was not asked for
    if(!(REP_thepool[readid].hasValidData() || alsoinvalids)) continue;
    ostr << REP_thepool[readid];
  }

  FUNCEND();
}


/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/
// Writes one line per read of the pool to 'ostr': the pool index, a tab
//  and then either the read name or, for reads without valid data, the
//  word "invalid".
void ReadPool::dumpPoolInfo(ostream & ostr) const
{
  FUNCSTART("void ReadPool::dumpPoolInfo(ostream & ostr)");

  for(uint32 readid=0; readid<REP_thepool.size(); ++readid){
    // index and tab are common to both kinds of lines
    ostr << readid << '\t';
    if(!REP_thepool[readid].hasValidData()) {
      ostr << "invalid\n";
      continue;
    }
    ostr << REP_thepool[readid].getName() << '\n';
  }

  FUNCEND();
}



/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/
// Writes every read with valid data as "<readname>.exp" into the
//  directory 'dirname' (via Read::dumpAsGAP4DA()) and creates a file
//  "fofn" in the same directory listing the names of the written files,
//  one per line.
// If ensureDirectory() signals a failure for the directory, this is
//  reported via MIRANOTIFY as FATAL.
void ReadPool::dumpAsEXPs(string & dirname) const
{
  FUNCSTART("void ReadPool::dumpAsEXPs(string & dirname) const");

  if(ensureDirectory(dirname,true)){
    MIRANOTIFY(Notify::FATAL, "Could not make sure that directory '" << dirname << "' exists, aborting MIRA.");
  }

  const string fofnname=dirname+"/fofn";
  ofstream fofnout(fofnname.c_str(), ios::out | ios::trunc);

  string dummyAP="";
  for(uint32 readid=0; readid<REP_thepool.size(); ++readid){
    if(!REP_thepool[readid].hasValidData()) continue;

    // file name without directory, this is also what goes into the fofn
    const string expname=REP_thepool[readid].getName()+".exp";

    ofstream expout((dirname+"/"+expname).c_str(), ios::out | ios::trunc);
    // dumpAsGAP4DA() is called through a non-const reference although
    //  this function is const
    (const_cast<Read &>(REP_thepool[readid])).dumpAsGAP4DA(expout, dummyAP);
    expout.close();

    fofnout << expname << endl;
  }

  fofnout.close();

  FUNCEND();
}


