libMems/IntervalList.h

Go to the documentation of this file.
00001 /*******************************************************************************
00002  * $Id: GenericIntervalList.h,v 1.6 2004/03/01 02:40:08 darling Exp $
00003  * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
00004  * This file is licensed under the GPL.
00005  * Please see the file called COPYING for licensing details.
00006  * **************
00007  ******************************************************************************/
00008 
00009 #ifndef _IntervalList_h_
00010 #define _IntervalList_h_
00011 
00012 #ifdef HAVE_CONFIG_H
00013 #include "config.h"
00014 #endif
00015 
00016 #include <iostream>
00017 #include <list>
00018 #include <sstream>
00019 
00020 #include "libMems/SortedMerList.h"
00021 #include "libGenome/gnSequence.h"
00022 #include "libMems/Interval.h"
00023 #include "libMems/MemHash.h"
00024 #include "libMems/CompactGappedAlignment.h"
00025 #include "libGenome/gnSourceFactory.h"
00026 #include "libGenome/gnFASSource.h"
00027 #include "libGenome/gnSEQSource.h"
00028 #include "libGenome/gnGBKSource.h"
00029 #include "libGenome/gnRAWSource.h"
00030 
00031 namespace mems {
00032 
00038 template< class MatchType = Interval >
00039 class GenericIntervalList : public std::vector< MatchType > {
00040 public:
00041         GenericIntervalList(){};
00042         GenericIntervalList( const GenericIntervalList& ml );
00043         GenericIntervalList& operator=( const GenericIntervalList& ml );
00044         ~GenericIntervalList(){};
00045         
00050         void Clear();
00051 
00060         void ReadList( std::istream& match_stream );
00061 
00066         void WriteList( std::ostream& match_stream ) const;
00067         
00071         void WriteAlignedSequences(std::ostream& match_file) const;
00072         
00076         void WriteStandardAlignment( std::ostream& out_file ) const;
00077         
00081         void ReadStandardAlignment( std::istream& in_stream );
00082 
00087         void ReadStandardAlignmentCompact( std::istream& in_stream );
00088         
00089         std::vector<std::string> seq_filename;  
00090         std::vector<genome::gnSequence*> seq_table;     
00092         std::string backbone_filename;  
00093 protected:
00094 
00095 };
00096 
00097 
00098 typedef GenericIntervalList<> IntervalList;
00099 
00100 template< class MatchType >
00101 GenericIntervalList<MatchType>::GenericIntervalList( const GenericIntervalList<MatchType>& ml )
00102 {
00103         *this = ml;
00104 }
00105 
00106 template< class MatchType >
00107 GenericIntervalList<MatchType>& GenericIntervalList<MatchType>::operator=( const GenericIntervalList<MatchType>& ml )
00108 {
00109         std::vector< MatchType >::operator=( ml );
00110         seq_filename = ml.seq_filename;
00111         seq_table = ml.seq_table;
00112         return *this;
00113 }
00114 
00115 template< class MatchType >
00116 void GenericIntervalList<MatchType>::Clear() 
00117 {
00118         for( uint seqI = 0; seqI < seq_table.size(); seqI++ ){
00119                 if( seq_table[ seqI ] != NULL )
00120                         delete seq_table[ seqI ];
00121         }
00122         seq_filename.clear();
00123         this->clear();
00124 }
00125 
00126 template< class MatchType >
00127 void GenericIntervalList<MatchType>::ReadList(std::istream& match_file)
00128 {
00129         std::string tag;
00130         gnSeqI len;
00131         int64 start;
00132         unsigned int seq_count;
00133         uint seqI;
00134         
00135         match_file >> tag;      //format version tag
00136         if( tag != "FormatVersion" ){
00137                 Throw_gnEx(InvalidFileFormat());
00138         }
00139         match_file >> tag;      //format version
00140         if( tag != "4" ){
00141                 Throw_gnEx(InvalidFileFormat());
00142         }
00143         match_file >> tag;      //sequence count tag
00144         if( tag != "SequenceCount" ){
00145                 Throw_gnEx(InvalidFileFormat());
00146         }
00147         match_file >> seq_count;        //sequence count
00148         if(seq_count < 2){
00149                 Throw_gnEx(InvalidFileFormat());
00150         }
00151         
00152         std::vector< std::string > alignment;
00153         // read the sequence file names and lengths
00154         for( seqI = 0; seqI < seq_count; seqI++ ){
00155                 match_file >> tag;      // name tag
00156                 getline( match_file, tag );
00157                 // skip the tab character
00158                 tag = tag.substr( 1 );
00159                 seq_filename.push_back(tag);
00160 //              try{
00161 //                      gnSequence *new_seq = new gnSequence();
00162 //                      new_seq->LoadSource(tag);
00163 //                      seq_table.push_back( new_seq );
00164 //              }catch( gnException& gne );
00165                 match_file >> tag;      // length tag
00166                 match_file >> tag;      // length
00167 
00168                 alignment.push_back( "" );      // initialize alignment vector
00169         }
00170         uint interval_count;
00171         match_file >> tag;      // interval count tag
00172         match_file >> interval_count;   // interval count
00173         
00174         
00175         // read the matches
00176         std::string cur_line;
00177         Interval* cur_iv = NULL;
00178         boolean clustal_match;
00179         std::vector< AbstractMatch* > iv_matches;
00180         bool parsing = false;
00181         
00182         while( std::getline( match_file, cur_line ) ){
00183                 if( cur_line.find( "Interval" ) != std::string::npos ){
00184                         // end the old interval
00185                         if( iv_matches.size() > 0 )
00186                         {
00187                                 this->push_back( Interval(iv_matches.begin(), iv_matches.end()) );
00188 //                              for( size_t mI = 0; mI < iv_matches.size(); mI++ )
00189 //                                      delete iv_matches[mI];
00190                                 iv_matches.clear();
00191                         }
00192                         parsing = true;
00193                         continue;
00194                 }
00195                 if( !parsing )
00196                         continue;
00197                 if( cur_line.length() == 0 )
00198                         continue;
00199                 
00200                 clustal_match = false;
00201                 if( cur_line == "GappedAlignment" ){
00202                         clustal_match = true;
00203                         getline( match_file, cur_line );
00204 
00205                         std::stringstream line_stream( cur_line );
00206                         line_stream >> len;
00207                         GappedAlignment* cr = new GappedAlignment( seq_count, len );
00208                         
00209                         for( seqI = 0; seqI < seq_count; seqI++ ){
00210                                 line_stream >> start;
00211                                 cr->SetStart( seqI, start );
00212                                 std::getline( match_file, alignment[ seqI ] );
00213                                 int64 seq_len = 0;
00214                                 for( uint charI = 0; charI < alignment[ seqI ].length(); charI++ )
00215                                         if( alignment[ seqI ][ charI ] != '-' )
00216                                                 seq_len++;
00217                                 cr->SetLength( seq_len, seqI );
00218                         }
00219                         cr->SetAlignment( alignment );
00220                         iv_matches.push_back( cr );
00221                 }
00222                 else{
00223 
00224                         Match mmhe( seq_count );
00225                         Match* mhe = mmhe.Copy();
00226                         std::stringstream line_stream( cur_line );
00227                         
00228                         line_stream >> len;
00229                         mhe->SetLength(len);
00230 
00231                         for( seqI = 0; seqI < seq_count; seqI++){
00232                                 line_stream >> start;
00233                                 mhe->SetStart(seqI, start);
00234                         }
00235                 
00236                         iv_matches.push_back( mhe );
00237                 }
00238         }
00239         if( iv_matches.size() > 0 )
00240                 this->push_back( Interval(iv_matches.begin(), iv_matches.end()) );
00241         if( interval_count != this->size() ){
00242                 Throw_gnEx(InvalidFileFormat());
00243         }
00244         
00245 }
00246 
00247 template< class MatchType >
00248 void GenericIntervalList<MatchType>::WriteList(std::ostream& match_file) const
00249 {
00250 
00251         unsigned int seq_count = seq_table.size();
00252         uint seqI;
00253         
00254         match_file << "FormatVersion" << '\t' << 4 << "\n";
00255         match_file << "SequenceCount" << '\t' << seq_count << "\n";
00256         for(seqI = 0; seqI < seq_count; seqI++){
00257                 match_file << "Sequence" << seqI << "File" << '\t';
00258                 if( seq_filename.size() > seqI )
00259                         match_file << seq_filename[seqI];
00260                 else
00261                         match_file << "null";
00262                 match_file << "\n";
00263                 match_file << "Sequence" << seqI << "Length" << '\t';
00264                 if( seq_table.size() > seqI )
00265                         match_file << seq_table[seqI]->length();
00266                 else
00267                         match_file << "0";
00268                 match_file << "\n";
00269         }
00270 
00271         match_file << "IntervalCount" << '\t' << this->size() << std::endl;
00272         
00273         for( uint ivI = 0; ivI < this->size(); ivI++ ){
00274                 match_file << "Interval " << ivI << std::endl;
00275                 const std::vector<AbstractMatch*>& matches = (*this)[ ivI ].GetMatches();
00276                 for( uint matchI = 0; matchI < matches.size(); matchI++ ){
00277                         const AbstractMatch* m = matches[ matchI ];
00278                         const GappedAlignment* cr = dynamic_cast< const GappedAlignment* >( m );
00279                         const Match* match = dynamic_cast< const Match* >( m );
00280                         if( match != NULL ){
00281                                 match_file << *match << std::endl;
00282                         }
00283                         else if( cr != NULL ){
00284                                 match_file << "GappedAlignment\n";
00285                                 match_file << cr->Length();
00286                                 for( seqI = 0; seqI < seq_count; seqI++ )
00287                                         match_file << '\t' << cr->Start( seqI );
00288                                 match_file << std::endl;
00289 
00290                                 const std::vector< std::string >& align_matrix = GetAlignment( *cr, seq_table );
00291                                 for( seqI = 0; seqI < seq_count; seqI++ )
00292                                         match_file << align_matrix[ seqI ] << std::endl;
00293                         }
00294                 }
00295                 match_file << std::endl;
00296         }
00297 }
00298 
00299 template< class MatchType >
00300 void GenericIntervalList<MatchType>::WriteStandardAlignment( std::ostream& out_file ) const 
00301 {
00302         if( this->size() == 0 )
00303                 return;
00304 
00305 //      unsigned int seq_count = seq_table.size();
00306         uint seqI = 0;
00307         
00308         // write out the format version
00309         out_file << "#FormatVersion Mauve1" << std::endl;
00310         
00311         // write source sequence filenames and formats
00312         // to make Paul happy
00313         boolean single_input = true;
00314         for( seqI = 1; seqI < seq_filename.size(); seqI++ ){
00315                 if( seq_filename[ 0 ] != seq_filename[ seqI ] ){
00316                         single_input = false;
00317                         break;
00318                 }
00319         }
00320         for( seqI = 0; seqI < seq_filename.size(); seqI++ ){
00321                 out_file << "#Sequence" << seqI + 1 << "File\t" << seq_filename[ seqI ] << std::endl;
00322                 if( single_input )
00323                         out_file << "#Sequence" << seqI + 1 << "Entry\t" << seqI + 1 << std::endl;
00324                 
00325                 genome::gnSourceFactory* sf = genome::gnSourceFactory::GetSourceFactory();
00326                 genome::gnBaseSource* gnbs = sf->MatchSourceClass( seq_filename[ seqI ] );
00327                 genome::gnFASSource* gnfs = dynamic_cast< genome::gnFASSource* >(gnbs);
00328                 genome::gnRAWSource* gnrs = dynamic_cast< genome::gnRAWSource* >(gnbs);
00329                 genome::gnSEQSource* gnss = dynamic_cast< genome::gnSEQSource* >(gnbs);
00330                 genome::gnGBKSource* gngs = dynamic_cast< genome::gnGBKSource* >(gnbs);
00331                 if( gnfs != NULL )
00332                         out_file << "#Sequence" << seqI + 1 << "Format\tFastA" << std::endl;
00333                 else if( gnrs != NULL )
00334                         out_file << "#Sequence" << seqI + 1 << "Format\traw" << std::endl;
00335                 else if( gnss != NULL ){
00336                         out_file << "#Sequence" << seqI + 1 << "Format\tDNAstar" << std::endl;
00337                         out_file << "#Annotation" << seqI + 1 << "File\t" << seq_filename[ seqI ] << std::endl;
00338                         out_file << "#Annotation" << seqI + 1 << "Format\tDNAstar" << std::endl;
00339                 }else if( gngs != NULL ){
00340                         out_file << "#Sequence" << seqI + 1 << "Format\tGenBank" << std::endl;
00341                         out_file << "#Annotation" << seqI + 1 << "File\t" << seq_filename[ seqI ] << std::endl;
00342                         out_file << "#Annotation" << seqI + 1 << "Format\tGenBank" << std::endl;
00343                 }
00344         }
00345 
00346         if( this->backbone_filename != "" )
00347                 out_file << "#BackboneFile\t" << this->backbone_filename << std::endl;
00348         
00349         for( uint ivI = 0; ivI < this->size(); ivI++ ){
00350                 if( (*this)[ ivI ].AlignmentLength() == 0 ){
00351                         continue;
00352                 }
00353                 std::vector<std::string> alignment;
00354                 if( seq_table.size() == 1 && seq_table.size() != (*this)[ ivI ].SeqCount() )
00355                 {
00356                         GetAlignment( (*this)[ ivI ], std::vector<genome::gnSequence*>((*this)[ ivI ].SeqCount(), seq_table[0]), alignment );
00357                 }else
00358                         GetAlignment( (*this)[ ivI ], seq_table, alignment );
00359                 for( seqI = 0; seqI < (*this)[ ivI ].SeqCount(); seqI++ ){
00360                         int64 startI = (*this)[ ivI ].Start( seqI );
00361                         gnSeqI length = (*this)[ ivI ].Length( seqI );
00362                         // if this genome doesn't have any sequence in this
00363                         // interval then skip it...
00364                         if( startI == 0 &&ivI > 0)      // kludge: write all seqs into the first interval so java parser can read it
00365                                 continue;
00366                         out_file << "> " << seqI + 1 << ":";
00367                         if( startI > 0 ){
00368                                 out_file << genome::absolut( startI ) << "-" << genome::absolut( startI ) + length - 1 << " + ";
00369                         }else if(startI == 0){
00370                                 out_file << 0 << "-" << 0 << " + ";
00371                         }else{
00372                                 out_file << genome::absolut( startI ) << "-" << genome::absolut( startI ) + length - 1 << " - ";
00373                         }
00374                         if( single_input )
00375                                 out_file << seq_filename[ 0 ];  // write the sequence filename as the seq name
00376                         else                            
00377                                 out_file << seq_filename[ seqI ];       // write the sequence filename as the seq name
00378                         out_file << std::endl;
00379                         gnSeqI cur_pos = 0;
00380                         for( ; cur_pos < alignment[ seqI ].length(); cur_pos += 80 ){
00381                                 gnSeqI cur_len = cur_pos + 80 < alignment[ seqI ].length() ? 80 : alignment[ seqI ].length() - cur_pos;
00382                                 out_file.write( alignment[ seqI ].data() + cur_pos, cur_len );
00383                                 out_file << std::endl;
00384                         }
00385                 }
00386                 out_file << "=" << std::endl;
00387         }
00388         
00389 }
00390 
00391 template< class MatchType >
00392 void GenericIntervalList<MatchType>::ReadStandardAlignment( std::istream& in_stream ) 
00393 {
00394         uint seq_count = 0;
00395         gnSeqI max_len = 0;
00396         std::string cur_line;
00397         if( !std::getline( in_stream, cur_line ) )
00398         {
00399                 Clear();        // if we can't read from the file then just return an empty interval list
00400                 return;
00401         }
00402         uint seqI = 0;
00403         std::vector< gnSeqI > lengths;
00404         std::vector< GappedAlignment* > ga_list;
00405         GappedAlignment cr;
00406         std::string empty_line;
00407         std::vector< std::string > aln_mat;
00408         uint line_count = 1;
00409         while( true ){
00410                 
00411                 while( cur_line[0] == '#' ){
00412                         // hit a comment or metadata.  try to parse it if it's a filename
00413                         std::getline( in_stream, cur_line );
00414                         line_count++;
00415                         std::stringstream ss( cur_line );
00416                         std::string token;
00417                         std::getline( ss, token, '\t' );
00418                         if( token.substr(1, 8) != "Sequence" || token.find( "File" ) == std::string::npos )
00419                                 continue;
00420                         std::getline( ss, token );
00421                         seq_filename.push_back( token );
00422                 }
00423                 
00424                 // read and parse the def. line
00425                 std::stringstream line_str( cur_line );
00426                 std::getline( line_str, cur_line, '>' );
00427                 std::getline( line_str, cur_line, ':' );
00428                 // take off leading whitespace
00429                 std::stringstream parse_str( cur_line );
00430 
00431                 parse_str >> seqI;      // the sequence number
00432                                 
00433                 int64 start, stop;
00434                 std::getline( line_str, cur_line, '-' );
00435                 parse_str.clear();
00436                 parse_str.str( cur_line );
00437                 parse_str >> start;
00438                 line_str >> stop;
00439                 std::string strand;
00440                 line_str >> strand;
00441 
00442                 std::string name;       // anything left is the name
00443                 std::getline( line_str, name );
00444 
00445                 // read and parse the sequence
00446                 while( aln_mat.size() < seqI )
00447                         aln_mat.push_back( empty_line );
00448 
00449                 gnSeqI chars = 0;
00450                 while( std::getline( in_stream, cur_line ) ){
00451                         line_count++;
00452                         if( (cur_line[ 0 ] == '>' ) || (cur_line[ 0 ] == '=' ))
00453                                 break;
00454                         for( uint charI = 0; charI < cur_line.length(); charI++ )
00455                                 if( cur_line[ charI ] != '-' )
00456                                         chars++;
00457                         aln_mat[ seqI - 1 ] += cur_line;
00458                 }
00459                 while( lengths.size() < seqI )
00460                         lengths.push_back(0);
00461 
00462                 lengths[ seqI - 1 ] = chars;
00463 
00464 // temporary workaround for file format inconsistency
00465                 if( strand == "+" )
00466                         cr.SetStart( seqI - 1, start );
00467                 else if( start < stop ){
00468                         if( chars == 0 )
00469                                 cr.SetStart( seqI - 1, 0 );
00470                         else
00471                                 cr.SetStart( seqI - 1, -start );
00472                         if( chars != stop - start + 1 && !(chars == 0 && stop - start == 1) ){
00473                                 std::cerr << "Error in XMFA file format\n";
00474                                 std::cerr << "Before line " << line_count << std::endl;
00475                                 std::cerr << "Expecting " << stop - start + 1 << " characters based on defline\n";
00476                                 std::cerr << "Actually read " << chars << " characters of sequence\n";
00477                                 Throw_gnEx(InvalidFileFormat());
00478                         }
00479                 }else{
00480                         if( chars == 0 )
00481                                 cr.SetStart( seqI - 1, 0 );
00482                         else
00483                                 cr.SetStart( seqI - 1, -stop );
00484                         if( chars != start - stop + 1 && !(chars == 0 && stop - start == 1) ){
00485                                 std::cerr << "Error in XMFA file format\n";
00486                                 std::cerr << "Before line " << line_count << std::endl;
00487                                 std::cerr << "Expecting " << start - stop + 1 << " characters based on defline\n";
00488                                 std::cerr << "Actually read " << chars << " characters of sequence\n";
00489                                 Throw_gnEx(InvalidFileFormat());
00490                         }
00491                 }
00492 
00493                 if( chars > max_len )
00494                         max_len = aln_mat[ seqI - 1 ].length();
00495                         
00496                 if( cur_line.size() == 0 )
00497                         break;
00498                 // did we finish an aligned region?
00499                 if( cur_line[ 0 ] != '>' ){
00500                         GappedAlignment *new_cr = new GappedAlignment( aln_mat.size(), max_len );
00501                         for( uint seqJ = 0; seqJ < seqI; seqJ++ ){
00502                                 new_cr->SetStart( seqJ, cr.Start( seqJ ) );
00503                                 new_cr->SetLength( lengths[ seqJ ], seqJ );
00504                                 cr.SetStart( seqJ, NO_MATCH );
00505                         }
00506                         for( uint seqJ = 0; seqJ < seqI; seqJ++ )
00507                                 aln_mat[seqJ].resize( max_len, '-' );
00508 
00509                         new_cr->SetAlignment(aln_mat);
00510                         lengths.clear();
00511                         if( seq_count < seqI )
00512                                 seq_count = seqI;
00513 
00514                         ga_list.push_back( new_cr );
00515 
00516                         max_len = 0;    // reset length for the next interval
00517                         aln_mat.clear();        // reset cr for next interval
00518 
00519                         // bail out on EOF or corruption
00520                         if( cur_line[ 0 ] != '=' )
00521                                 break;
00522                         // otherwise read up to the next def. line
00523                         while( std::getline( in_stream, cur_line ) ){
00524                                 line_count++;
00525                                 if( cur_line[ 0 ] == '>' )
00526                                         break;
00527                         }
00528                         if( cur_line[ 0 ] != '>' )
00529                                 break;
00530                 }
00531         }
00532 
00533         // now process all GappedAlignments into Intervals
00534         for( uint ivI = 0; ivI < ga_list.size(); ivI++ ){
00535                 GappedAlignment* cr = ga_list[ ivI ];
00536                 GappedAlignment* new_cr = new GappedAlignment( seq_count, cr->AlignmentLength() );
00537 
00538                 const std::vector< std::string >& align_matrix = GetAlignment( *cr, seq_table );
00539                 std::vector< std::string > new_aln_mat(seq_count);
00540                 for( seqI = 0; seqI < align_matrix.size(); seqI++ ){
00541                         new_cr->SetLength( cr->Length( seqI ), seqI );
00542                         new_cr->SetStart( seqI, cr->Start(seqI) );
00543                         new_aln_mat[ seqI ] = align_matrix[ seqI ];
00544                         if( new_aln_mat[ seqI ].length() == 0 )
00545                                 new_aln_mat[ seqI ] = std::string( new_cr->AlignmentLength(), '-' );
00546                 }
00547                 for( ; seqI < seq_count; seqI++ ){
00548                         new_cr->SetLength( 0, seqI );
00549                         new_cr->SetStart( seqI, 0 );
00550                         new_aln_mat[ seqI ] = std::string( new_cr->AlignmentLength(), '-' );
00551                 }
00552                 new_cr->SetAlignment(new_aln_mat);
00553                 delete cr;
00554                 cr = new_cr;
00555                 ga_list[ ivI ] = new_cr;
00556 
00557                 std::vector<AbstractMatch*> asdf(1, cr);
00558                 Interval iv( asdf.begin(), asdf.end() );
00559                 this->push_back( iv );
00560         }
00561 }
00562 
00563 template< class MatchType >
00564 void GenericIntervalList<MatchType>::ReadStandardAlignmentCompact( std::istream& in_stream ) 
00565 {
00566         uint seq_count = 0;
00567         gnSeqI max_len = 0;
00568         std::string cur_line;
00569         std::getline( in_stream, cur_line );
00570         uint seqI = 0;
00571         std::vector< gnSeqI > lengths;
00572         std::vector< GappedAlignment* > ga_list;
00573         GappedAlignment cr;
00574         std::string empty_line;
00575         std::vector< std::string > aln_mat;
00576         uint line_count = 1;
00577         while( true ){
00578                 
00579                 while( cur_line[0] == '#' ){
00580                         // hit a comment or metadata.  try to parse it if it's a filename
00581                         std::getline( in_stream, cur_line );
00582                         line_count++;
00583                         std::stringstream ss( cur_line );
00584                         std::string token;
00585                         std::getline( ss, token, '\t' );
00586                         if( token.substr(1, 8) != "Sequence" || token.find( "File" ) == std::string::npos )
00587                                 continue;
00588                         std::getline( ss, token );
00589                         seq_filename.push_back( token );
00590                 }
00591                 
00592                 // read and parse the def. line
00593                 std::stringstream line_str( cur_line );
00594                 std::getline( line_str, cur_line, '>' );
00595                 std::getline( line_str, cur_line, ':' );
00596                 // take off leading whitespace
00597                 std::stringstream parse_str( cur_line );
00598 
00599                 parse_str >> seqI;      // the sequence number
00600                                 
00601                 int64 start, stop;
00602                 std::getline( line_str, cur_line, '-' );
00603                 parse_str.clear();
00604                 parse_str.str( cur_line );
00605                 parse_str >> start;
00606                 line_str >> stop;
00607                 std::string strand;
00608                 line_str >> strand;
00609 
00610                 std::string name;       // anything left is the name
00611                 std::getline( line_str, name );
00612 
00613                 // read and parse the sequence
00614                 while( aln_mat.size() < seqI )
00615                         aln_mat.push_back( empty_line );
00616 
00617                 gnSeqI chars = 0;
00618                 while( std::getline( in_stream, cur_line ) ){
00619                         line_count++;
00620                         if( (cur_line[ 0 ] == '>' ) || (cur_line[ 0 ] == '=' ))
00621                                 break;
00622                         for( uint charI = 0; charI < cur_line.length(); charI++ )
00623                                 if( cur_line[ charI ] != '-' )
00624                                         chars++;
00625                         aln_mat[ seqI - 1 ] += cur_line;
00626                 }
00627                 while( lengths.size() < seqI )
00628                         lengths.push_back(0);
00629 
00630                 lengths[ seqI - 1 ] = chars;
00631 
00632 // temporary workaround for file format inconsistency
00633                 if( strand == "+" )
00634                         cr.SetStart( seqI - 1, start );
00635                 else if( start < stop ){
00636                         if( chars == 0 )
00637                                 cr.SetStart( seqI - 1, 0 );
00638                         else
00639                                 cr.SetStart( seqI - 1, -start );
00640                         if( chars != stop - start + 1 && !(chars == 0 && stop - start == 1) ){
00641                                 std::cerr << "Error in XMFA file format\n";
00642                                 std::cerr << "Before line " << line_count << std::endl;
00643                                 std::cerr << "Expecting " << stop - start + 1 << " characters based on defline\n";
00644                                 std::cerr << "Actually read " << chars << " characters of sequence\n";
00645                                 Throw_gnEx(InvalidFileFormat());
00646                         }
00647                 }else{
00648                         if( chars == 0 )
00649                                 cr.SetStart( seqI - 1, 0 );
00650                         else
00651                                 cr.SetStart( seqI - 1, -stop );
00652                         if( chars != start - stop + 1 && !(chars == 0 && stop - start == 1) ){
00653                                 std::cerr << "Error in XMFA file format\n";
00654                                 std::cerr << "Before line " << line_count << std::endl;
00655                                 std::cerr << "Expecting " << start - stop + 1 << " characters based on defline\n";
00656                                 std::cerr << "Actually read " << chars << " characters of sequence\n";
00657                                 Throw_gnEx(InvalidFileFormat());
00658                         }
00659                 }
00660 
00661                 if( chars > max_len )
00662                         max_len = aln_mat[ seqI - 1 ].length();
00663                         
00664                 if( cur_line.size() == 0 )
00665                         break;
00666                 // did we finish an aligned region?
00667                 if( cur_line[ 0 ] != '>' ){
00668                         GappedAlignment *new_cr = new GappedAlignment( aln_mat.size(), max_len );
00669                         for( uint seqJ = 0; seqJ < seqI; seqJ++ ){
00670                                 new_cr->SetStart( seqJ, cr.Start( seqJ ) );
00671                                 new_cr->SetLength( lengths[ seqJ ], seqJ );
00672                                 cr.SetStart( seqJ, NO_MATCH );
00673                         }
00674                         for( uint seqJ = 0; seqJ < seqI; seqJ++ )
00675                                 aln_mat[seqJ].resize( max_len, '-' );
00676 
00677                         new_cr->SetAlignment(aln_mat);
00678                         lengths.clear();
00679                         if( seq_count < seqI )
00680                                 seq_count = seqI;
00681 
00682                         ga_list.push_back( new_cr );
00683 
00684                         max_len = 0;    // reset length for the next interval
00685                         aln_mat.clear();        // reset cr for next interval
00686 
00687                         // bail out on EOF or corruption
00688                         if( cur_line[ 0 ] != '=' )
00689                                 break;
00690                         // otherwise read up to the next def. line
00691                         while( std::getline( in_stream, cur_line ) ){
00692                                 line_count++;
00693                                 if( cur_line[ 0 ] == '>' )
00694                                         break;
00695                         }
00696                         if( cur_line[ 0 ] != '>' )
00697                                 break;
00698                 }
00699         }
00700 
00701         // now process all GappedAlignments into Intervals
00702         //cerr << "Stuffing all GappedAlignments into Intervals" << endl;
00703         for( uint ivI = 0; ivI < ga_list.size(); ivI++ )
00704         {       
00705                 GappedAlignment* cr = ga_list[ ivI ];
00706                 uint compact_seq_count =  cr->SeqCount();
00707                 CompactGappedAlignment<>* new_cr = new CompactGappedAlignment<>(compact_seq_count, cr->AlignmentLength() );
00708                 const std::vector< std::string > align_matrix = GetAlignment( *cr, seq_table );
00709                 //cout << cr->SeqCount() << " " << seq_count << " "  << align_matrix.size() << endl;
00710                 
00711                 std::vector< std::string > new_aln_mat(compact_seq_count);
00712                 for( seqI = 0; seqI < compact_seq_count; seqI++ ){
00713                         new_cr->SetLength( cr->Length( seqI ), seqI );
00714                         new_cr->SetStart( seqI, cr->Start(seqI) );
00715                         new_aln_mat[ seqI ] = align_matrix[ seqI ];
00716                         if( new_aln_mat[ seqI ].length() == 0 )
00717                                 new_aln_mat[ seqI ] = std::string( new_cr->AlignmentLength(), '-' );
00718                 }
00719                 
00720                 for( ; seqI < compact_seq_count; seqI++ ){
00721                         new_cr->SetLength( 0, seqI );
00722                         new_cr->SetStart( seqI, 0 );
00723                         new_aln_mat[ seqI ] = std::string( new_cr->AlignmentLength(), '-' );
00724                 }
00725                 
00726                 new_cr->SetAlignment( new_aln_mat );
00727                 delete cr;
00728 
00729                 //CompactGappedAlignment<>* cga =  new_cr;
00730                 //ga_list[ ivI ] = dynamic_cast<GappedAlignment*>(cga);
00731                 Interval iv;
00732                 this->push_back( iv );
00733                 std::vector< AbstractMatch* > matches(1, new_cr);
00734                 this->back().SetMatches( matches );
00735         }
00736 }
00737 
00738 
00739 template< class MatchType >
00740 void GenericIntervalList<MatchType>::WriteAlignedSequences(std::ostream& match_file) const
00741 {
00742 
00743         unsigned int seq_count = seq_table.size();
00744         uint seqI;
00745         
00746         match_file << "mauveAligner data\n";
00747         match_file << "FormatVersion" << '\t' << 5 << "\n";
00748         match_file << "SequenceCount" << '\t' << seq_count << "\n";
00749         for(seqI = 0; seqI < seq_count; seqI++){
00750                 match_file << "Sequence" << seqI << "File" << '\t';
00751                 if( seq_filename.size() > seqI )
00752                         match_file << seq_filename[seqI];
00753                 else
00754                         match_file << "null";
00755                 match_file << "\n";
00756                 match_file << "Sequence" << seqI << "Length" << '\t';
00757                 if( seq_table.size() > seqI )
00758                         match_file << seq_table[seqI]->length();
00759                 else
00760                         match_file << "0";
00761                 match_file << "\n";
00762         }
00763 
00764         match_file << "AlignmentCount" << '\t' << this->size() << std::endl;
00765 
00766         if( this->size() == 0 )
00767                 return;
00768         
00769         for( uint ivI = 0; ivI < this->size(); ivI++ ){
00770                 
00771                 match_file << (*this)[ ivI ].AlignmentLength();
00772                 for( seqI = 0; seqI < seq_count; seqI++ )
00773                         match_file << '\t' << (*this)[ ivI ].Start( seqI );
00774                 match_file << std::endl;
00775 
00776                 std::vector<std::string> alignment;
00777                 GetAlignment( (*this)[ ivI ], this->seq_table, alignment );
00778                 for( seqI = 0; seqI < seq_count; seqI++ )
00779                         match_file << alignment[ seqI ] << std::endl;
00780                 match_file << std::endl;
00781         }
00782         
00783 }
00784 
00785 
00786 }
00787 
00788 #endif  //_IntervalList_h_

Generated on Fri Mar 14 06:01:03 2008 for libMems by doxygen 1.3.6