00001
00002
00003
00004
00005
00006
00007
00008
00009 #ifndef _MatchList_h_
00010 #define _MatchList_h_
00011
00012 #ifdef HAVE_CONFIG_H
00013 #include "config.h"
00014 #endif
00015
00016 #include <iostream>
00017 #include <list>
00018 #include "libMems/SortedMerList.h"
00019 #include "libMems/DNAFileSML.h"
00020 #include "libMems/DNAMemorySML.h"
00021 #include "libGenome/gnSequence.h"
00022 #include "libMems/Match.h"
00023 #include "libMems/gnRAWSequence.h"
00024 #include "libGenome/gnRAWSource.h"
00025 #include "libMems/Files.h"
00026 #include <sstream>
00027 #include <map>
00028 #include <ctime>
00029
00030 namespace mems {
00031
00032 template< typename MatchPtrType >
00033 class GenericMatchList : public std::vector< MatchPtrType >
00034 {
00035 public:
00036 GenericMatchList(){};
00037 GenericMatchList( const GenericMatchList& ml );
00038 GenericMatchList& operator=( const GenericMatchList& ml );
00039
00040
00051 void LoadSMLs( uint mer_size, std::ostream* log_stream, int seed_rank = 0 );
00052
00065 void CreateMemorySMLs( uint mer_size, std::ostream* log_stream, int seed_rank = 0 );
00066
00071 static uint GetDefaultMerSize( const std::vector< genome::gnSequence* >& seq_table );
00072
00077 void Clear();
00078
00083 void MultiplicityFilter( unsigned mult );
00084
00089 void LengthFilter( gnSeqI length );
00090
00097
00104
00105
00106
00107 std::vector<std::string> sml_filename;
00108 std::vector<std::string> seq_filename;
00109 std::vector<SortedMerList*> sml_table;
00110 std::vector<genome::gnSequence*> seq_table;
00112 protected:
00113
00114 };
00115
00116 typedef GenericMatchList< Match* > MatchList;
00117
00118 CREATE_EXCEPTION( InvalidArgument );
00119
00123 CREATE_EXCEPTION(InvalidFileFormat)
00124
00125
00126
00134 void ReadList( MatchList& mlist, std::istream& match_stream );
00135
00140 void WriteList( const MatchList& mlist, std::ostream& match_stream );
00141
00142 typedef void* MatchID_t;
00143
00144 template< typename MatchPtrType >
00145 GenericMatchList< MatchPtrType >::GenericMatchList( const GenericMatchList< MatchPtrType >& ml ){
00146 *this = ml;
00147 }
00148
00149 template< typename MatchPtrType >
00150 GenericMatchList< MatchPtrType >& GenericMatchList< MatchPtrType >::operator=( const GenericMatchList< MatchPtrType >& ml ){
00151 std::vector< MatchPtrType >::operator=( ml );
00152 sml_filename = ml.sml_filename;
00153 seq_filename = ml.seq_filename;
00154 sml_table = ml.sml_table;
00155 seq_table = ml.seq_table;
00156 return *this;
00157 }
00158
00166 template< typename MatchListType >
00167 void LoadSequences( MatchListType& mlist, std::ostream* log_stream ){
00168
00169 if( mlist.seq_filename.size() == 0 )
00170 return;
00171
00172 for( uint seqI = 0; seqI < mlist.seq_filename.size(); seqI++ ){
00173 genome::gnSequence* file_sequence = new genome::gnSequence();
00174
00175 try{
00176 file_sequence->LoadSource( mlist.seq_filename[ seqI ] );
00177 }catch( genome::gnException& gne ){
00178 delete file_sequence;
00179 if( gne.GetCode() == genome::FileNotOpened() )
00180 std::cerr << "Error loading " << mlist.seq_filename[ seqI ] << std::endl;
00181 else
00182 std::cerr << gne;
00183 return;
00184 }catch( std::exception& e ){
00185 delete file_sequence;
00186 std::cerr << "Unhandled exception loading " << mlist.seq_filename[ seqI ] << std::endl;
00187 std::cerr << "At: " << __FILE__ << ":" << __LINE__ << std::endl;
00188 std::cerr << e.what();
00189 return;
00190 }catch( ... ){
00191 delete file_sequence;
00192 std::cerr << "Unknown exception when loading " << mlist.seq_filename[ seqI ] << std::endl;
00193 return;
00194 }
00195
00196 mlist.seq_table.push_back( file_sequence );
00197 if( log_stream != NULL ){
00198 (*log_stream) << "Sequence loaded successfully.\n";
00199 (*log_stream) << mlist.seq_filename[ seqI ] << " " << file_sequence->length() << " base pairs.\n";
00200 }
00201 }
00202
00203 }
00204
00212 template< typename MatchListType >
00213 void LoadAndCreateRawSequences( MatchListType& mlist, std::ostream* log_stream ){
00214
00215 if( mlist.seq_filename.size() == 0 )
00216 return;
00217
00218 for( uint seqI = 0; seqI < mlist.seq_filename.size(); seqI++ ){
00219 genome::gnSequence* file_sequence = new genome::gnSequence();
00220
00221 try{
00222 file_sequence->LoadSource( mlist.seq_filename[ seqI ] );
00223 }catch( genome::gnException& gne ){
00224 delete file_sequence;
00225 if( gne.GetCode() == genome::FileNotOpened() )
00226 std::cerr << "Error loading " << mlist.seq_filename[ seqI ] << std::endl;
00227 else
00228 std::cerr << gne;
00229 return;
00230 }catch( std::exception& e ){
00231 delete file_sequence;
00232 std::cerr << "Unhandled exception loading " << mlist.seq_filename[ seqI ] << std::endl;
00233 std::cerr << "At: " << __FILE__ << ":" << __LINE__ << std::endl;
00234 std::cerr << e.what();
00235 return;
00236 }catch( ... ){
00237 delete file_sequence;
00238 std::cerr << "Unknown exception when loading " << mlist.seq_filename[ seqI ] << std::endl;
00239 return;
00240 }
00241
00242
00243 std::string tmpfilename = "rawseq";
00244 tmpfilename = CreateTempFileName("rawseq");
00245 genome::gnRAWSource::Write( *file_sequence, tmpfilename );
00246 delete file_sequence;
00247 registerFileToDelete( tmpfilename );
00248
00249 if( log_stream != NULL )
00250 (*log_stream) << "Storing raw sequence at " << tmpfilename << std::endl;
00251 genome::gnRAWSequence* raw_seq = new genome::gnRAWSequence( tmpfilename );
00252 mlist.seq_table.push_back( raw_seq );
00253 if( log_stream != NULL ){
00254 (*log_stream) << "Sequence loaded successfully.\n";
00255 (*log_stream) << mlist.seq_filename[ seqI ] << " " << raw_seq->length() << " base pairs.\n";
00256 }
00257 }
00258 }
00259
00260
00261 template< typename MatchPtrType >
00262 void GenericMatchList< MatchPtrType >::LoadSMLs( uint mer_size, std::ostream* log_stream, int seed_rank ){
00263
00264
00265 if( mer_size == 0 ){
00266 mer_size = GetDefaultMerSize( seq_table );
00267 if( log_stream != NULL ){
00268 (*log_stream) << "Using weight " << mer_size << " mers for initial seeds\n";
00269 }
00270 }
00271
00272
00273 uint64 default_seed = getSeed( mer_size, seed_rank );
00274 std::vector< uint > create_list;
00275 uint seqI = 0;
00276 for( seqI = 0; seqI < seq_table.size(); seqI++ ){
00277
00278 DNAFileSML* file_sml = new DNAFileSML();
00279 sml_table.push_back( file_sml );
00280
00281 boolean success = true;
00282 try{
00283 file_sml->LoadFile( sml_filename[ seqI ] );
00284 }catch( genome::gnException& gne ){
00285 success = false;
00286 create_list.push_back( seqI );
00287 }
00288 boolean recreate = false;
00289 if(success && (file_sml->Seed() != default_seed )){
00290 if( log_stream != NULL )
00291 (*log_stream) << "Default seed mismatch. A new sorted mer list will be created.\n";
00292 recreate = true;
00293 create_list.push_back( seqI );
00294 }
00295
00296 if( success && !recreate && log_stream != NULL )
00297 (*log_stream) << "Sorted mer list loaded successfully\n";
00298 }
00299
00300
00301 if( create_list.size() > 0 )
00302 for( seqI = 0; seqI < sml_table.size(); seqI++ ){
00303 sml_table[ seqI ]->Clear();
00304 delete sml_table[ seqI ];
00305 sml_table[ seqI ] = NULL;
00306 }
00307
00308
00309 for( uint createI = 0; createI < create_list.size(); createI++ ){
00310 if( log_stream != NULL )
00311 (*log_stream) << "Creating sorted mer list\n";
00312 try{
00313
00314 time_t start_time = time(NULL);
00315 sml_table[ create_list[ createI ] ] = new DNAFileSML( sml_filename[ create_list[ createI ] ] );
00316 sml_table[ create_list[ createI ] ]->Create( *seq_table[ create_list[ createI ] ], default_seed );
00317 time_t end_time = time(NULL);
00318 if( log_stream != NULL )
00319 (*log_stream) << "Create time was: " << end_time - start_time << " seconds.\n";
00320
00321 }catch(...){
00322 std::cerr << "Error creating sorted mer list\n";
00323 throw;
00324 }
00325 }
00326
00327
00328 if( create_list.size() > 0 ){
00329 for( seqI = 0; seqI < seq_filename.size(); seqI++ ){
00330 if( sml_table[ seqI ] != NULL )
00331 continue;
00332 sml_table[ seqI ] = new DNAFileSML( sml_filename[ seqI ] );
00333 try{
00334 ((DNAFileSML*)sml_table[ seqI ])->LoadFile( sml_filename[ seqI ] );
00335 }catch( genome::gnException& gne ){
00336 std::cerr << "Error loading sorted mer list\n";
00337 throw;
00338 }
00339 }
00340 }
00341 }
00342
00343 template< typename MatchPtrType >
00344 uint GenericMatchList< MatchPtrType >::GetDefaultMerSize( const std::vector< genome::gnSequence* >& seq_table ){
00345 gnSeqI total_len = 0;
00346 for( uint seqI = 0; seqI < seq_table.size(); seqI++ )
00347 total_len += seq_table[ seqI ]->length();
00348 return getDefaultSeedWeight( total_len / seq_table.size() );
00349 }
00350
00351
00363 template< typename MatchListType >
00364 void LoadMFASequences( MatchListType& mlist, const std::string& mfa_filename, std::ostream* log_stream ) {
00365 genome::gnSequence file_sequence;
00366
00367 try{
00368 file_sequence.LoadSource( mfa_filename );
00369 }catch( genome::gnException& gne ){
00370 if( gne.GetCode() == genome::FileNotOpened() )
00371 std::cerr << "Error loading " << mfa_filename << std::endl;
00372 else
00373 std::cerr << gne;
00374 return;
00375 }catch( std::exception& e ){
00376 std::cerr << "Unhandled exception loading " << mfa_filename << std::endl;
00377 std::cerr << "At: " << __FILE__ << ":" << __LINE__ << std::endl;
00378 std::cerr << e.what();
00379 return;
00380 }catch( ... ){
00381 std::cerr << "Unknown exception when loading " << mfa_filename << std::endl;
00382 return;
00383 }
00384
00385 mlist.seq_filename.clear();
00386 gnSeqI total_len = 0;
00387 for( uint contigI = 0; contigI < file_sequence.contigListSize(); contigI++ ){
00388 genome::gnSequence* contig_seq = new genome::gnSequence( file_sequence.contig( contigI ) );
00389 mlist.seq_filename.push_back( mfa_filename );
00390
00391 if( log_stream != NULL ){
00392 (*log_stream) << "Sequence loaded successfully.\n";
00393 (*log_stream) << mlist.seq_filename[ contigI ] << " " << contig_seq->length() << " base pairs.\n";
00394 }
00395 mlist.seq_table.push_back( contig_seq );
00396 }
00397 }
00398
00399 template< typename MatchPtrType >
00400 void GenericMatchList< MatchPtrType >::CreateMemorySMLs( uint mer_size, std::ostream* log_stream, int seed_rank )
00401 {
00402
00403 if( mer_size == 0 ){
00404 mer_size = GetDefaultMerSize( seq_table );
00405 if( log_stream != NULL ){
00406 (*log_stream) << "Using " << mer_size << "-mers for initial seeds\n";
00407 }
00408 }
00409
00410 uint64 default_seed = getSeed( mer_size, seed_rank );
00411
00412
00413 for( uint contigI = 0; contigI < seq_table.size(); contigI++ )
00414 {
00415 DNAMemorySML* contig_sml = new DNAMemorySML();
00416 boolean success = true;
00417 if( log_stream != NULL )
00418 (*log_stream) << "Creating sorted mer list\n";
00419 time_t start_time = time(NULL);
00420 contig_sml->Create( *seq_table[contigI], default_seed );
00421 time_t end_time = time(NULL);
00422 if( log_stream != NULL )
00423 (*log_stream) << "Create time was: " << end_time - start_time << " seconds.\n";
00424
00425 sml_table.push_back( contig_sml );
00426 }
00427 }
00428
00429 template< typename MatchPtrType >
00430 void GenericMatchList< MatchPtrType >::Clear() {
00431 for( uint seqI = 0; seqI < seq_table.size(); seqI++ ){
00432 if( seq_table[ seqI ] != NULL )
00433 delete seq_table[ seqI ];
00434 }
00435 for( uint seqI = 0; seqI < sml_table.size(); seqI++ ){
00436 if( sml_table[ seqI ] != NULL )
00437 delete sml_table[ seqI ];
00438 }
00439 typename std::vector<MatchPtrType>::iterator match_iter = this->begin();
00440 for(; match_iter != this->end(); match_iter++ ){
00441 (*match_iter)->Free();
00442 (*match_iter) = NULL;
00443 }
00444 seq_table.clear();
00445 sml_table.clear();
00446 this->clear();
00447 seq_filename.clear();
00448 sml_filename.clear();
00449 }
00450
00454 template< class FromType, class ToType, class MatchListType >
00455 void RemapSubsetMatchAddresses( std::map<FromType, ToType>& old_to_new_map, MatchListType& match_list );
00456
00457
00458 template< class FromType, class ToType, class MatchListType >
00459 void RemapSubsetMatchAddresses( std::map<FromType, ToType>& old_to_new_map, MatchListType& match_list )
00460 {
00461
00462 typename MatchListType::iterator match_iter = match_list.begin();
00463
00464
00465 typename std::map<FromType, ToType>::iterator map_iter;
00466 for(; match_iter != match_list.end(); ++match_iter ){
00467
00468 std::set< Match* >& subsets = (*match_iter)->Subsets();
00469 std::set< Match* > new_subsets;
00470 std::set< Match* >::iterator sub_iter = subsets.begin();
00471 for(; sub_iter != subsets.end(); ++sub_iter ){
00472 map_iter = old_to_new_map.find( (FromType)*sub_iter );
00473 new_subsets.insert( map_iter->second );
00474 }
00475 subsets = new_subsets;
00476
00477
00478 std::set< Match* >& supersets = (*match_iter)->Supersets();
00479 std::set< Match* > new_supersets;
00480 std::set< Match* >::iterator super_iter = supersets.begin();
00481 for(; super_iter != supersets.end(); ++super_iter ){
00482 map_iter = old_to_new_map.find( (FromType)*super_iter );
00483 new_supersets.insert( map_iter->second );
00484 }
00485 supersets = new_supersets;
00486 }
00487 }
00488
00489 inline
00490 void ReadList(MatchList& mlist, std::istream& match_file)
00491 {
00492 std::string tag;
00493 gnSeqI len;
00494 int64 start;
00495 unsigned int seq_count;
00496
00497 match_file >> tag;
00498 if( tag != "FormatVersion" ){
00499 Throw_gnEx(InvalidFileFormat());
00500 }
00501 match_file >> tag;
00502 if( tag != "3" ){
00503 Throw_gnEx(InvalidFileFormat());
00504 }
00505 match_file >> tag;
00506 if( tag != "SequenceCount" ){
00507 Throw_gnEx(InvalidFileFormat());
00508 }
00509 match_file >> seq_count;
00510 if(seq_count < 2){
00511 Throw_gnEx(InvalidFileFormat());
00512 }
00513
00514
00515 for( unsigned int seqI = 0; seqI < seq_count; seqI++ ){
00516 match_file >> tag;
00517 std::getline( match_file, tag );
00518
00519 tag = tag.substr( 1 );
00520 mlist.seq_filename.push_back(tag);
00521 match_file >> tag;
00522 gnSeqI seq_len;
00523 match_file >> seq_len;
00524 if( seqI < mlist.seq_table.size() )
00525 if( mlist.seq_table[ seqI ]->length() != seq_len ){
00526 std::cerr << "Warning: Genome sizes in the match list differ.\n";
00527 std::cerr << "seq_table[ " << seqI << " ]->length() " << mlist.seq_table[ seqI ]->length() << " seq_len: " << seq_len << std::endl;
00528 }
00529 }
00530
00531
00532 unsigned int match_count;
00533 match_file >> tag;
00534 match_file >> match_count;
00535
00536
00537 std::map< MatchID_t, Match* > match_map;
00538 std::string cur_line;
00539 std::getline( match_file, cur_line );
00540 while( getline( match_file, cur_line ) ){
00541 Match mhe( seq_count );
00542 std::stringstream line_stream( cur_line );
00543
00544 line_stream >> len;
00545 mhe.SetLength(len);
00546
00547 for(uint32 seqI = 0; seqI < seq_count; seqI++){
00548 line_stream >> start;
00549 mhe.SetStart(seqI, start);
00550 }
00551
00552 MatchID_t match_id;
00553 line_stream >> match_id;
00554
00555 uint sub_count;
00556 boolean bad_stream = false;
00557 line_stream >> sub_count;
00558 if(sub_count > 0)
00559 throw "Unable to read file, invalid format, cannot read subset data\n";
00560
00561 if( bad_stream )
00562 break;
00563
00564 uint sup_count;
00565 line_stream >> sup_count;
00566 if(sub_count > 0)
00567 throw "Unable to read file, invalid format, cannot read superset data\n";
00568 if( bad_stream )
00569 break;
00570
00571 Match* new_match = mhe.Copy();
00572 mlist.push_back( new_match );
00573 match_map.insert( std::map< MatchID_t, Match* >::value_type( match_id, new_match ));
00574 }
00575 if( match_count != mlist.size() ){
00576 Throw_gnEx(InvalidFileFormat());
00577 }
00578 }
00579
00580 inline
00581 void WriteList( const MatchList& mlist, std::ostream& match_file)
00582 {
00583 if( mlist.size() == 0 )
00584 return;
00585 Match* first_mem = *(mlist.begin());
00586 unsigned int seq_count = first_mem->SeqCount();
00587
00588 match_file << "FormatVersion" << '\t' << 3 << "\n";
00589 match_file << "SequenceCount" << '\t' << seq_count << "\n";
00590 for(unsigned int seqI = 0; seqI < seq_count; seqI++){
00591 match_file << "Sequence" << seqI << "File" << '\t';
00592 if( mlist.seq_filename.size() > seqI )
00593 match_file << mlist.seq_filename[seqI];
00594 else
00595 match_file << "null";
00596 match_file << "\n";
00597 match_file << "Sequence" << seqI << "Length" << '\t';
00598 if( mlist.seq_table.size() > seqI )
00599 match_file << mlist.seq_table[seqI]->length();
00600 else
00601 match_file << "0";
00602 match_file << "\n";
00603 }
00604
00605 match_file << "MatchCount" << '\t' << mlist.size() << std::endl;
00606
00607
00608 std::vector<Match*>::const_iterator match_iter;
00609 match_iter = mlist.begin();
00610 std::set<Match*> cur_set;
00611 std::set<Match*>::iterator set_iter;
00612 for(; match_iter != mlist.end(); match_iter++){
00613
00614 match_file << **match_iter << '\t';
00615
00616
00617 match_file << (MatchID_t)(*match_iter) << '\t';
00618
00619
00620 match_file << 0;
00621
00622
00623 match_file << '\t' << 0;
00624 match_file << std::endl;
00625 }
00626 }
00627
00628 template< typename MatchPtrType >
00629 void GenericMatchList< MatchPtrType >::MultiplicityFilter( unsigned mult ){
00630
00631 size_t cur = 0;
00632 for( uint memI = 0; memI < this->size(); memI++ ){
00633 if( (*this)[ memI ]->Multiplicity() == mult )
00634 (*this)[cur++] = (*this)[memI];
00635 else{
00636 (*this)[ memI ]->Free();
00637 (*this)[ memI ] = NULL;
00638 }
00639 }
00640 this->resize(cur);
00641 }
00642
00643 template< typename MatchPtrType >
00644 void GenericMatchList< MatchPtrType >::LengthFilter( gnSeqI length ){
00645
00646 size_t cur = 0;
00647 for( size_t memI = 0; memI < this->size(); memI++ ){
00648 if( (*this)[ memI ]->Length() >= length )
00649 (*this)[cur++] = (*this)[memI];
00650 else{
00651 (*this)[ memI ]->Free();
00652 (*this)[ memI ] = NULL;
00653 }
00654 }
00655 this->resize(cur);
00656 }
00657
00658 }
00659
00660 #endif //_MatchList_h_