libMems/MatchList.h

Go to the documentation of this file.
00001 /*******************************************************************************
00002  * $Id: MatchList.h,v 1.10 2004/03/01 02:40:08 darling Exp $
00003  * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
00004  * This file is licensed under the GPL.
00005  * Please see the file called COPYING for licensing details.
00006  * **************
00007  ******************************************************************************/
00008 
00009 #ifndef _MatchList_h_
00010 #define _MatchList_h_
00011 
00012 #ifdef HAVE_CONFIG_H
00013 #include "config.h"
00014 #endif
00015 
00016 #include <iostream>
00017 #include <list>
00018 #include "libMems/SortedMerList.h"
00019 #include "libMems/DNAFileSML.h"
00020 #include "libMems/DNAMemorySML.h"
00021 #include "libGenome/gnSequence.h"
00022 #include "libMems/Match.h"
00023 #include "libMems/gnRAWSequence.h"
00024 #include "libGenome/gnRAWSource.h"
00025 #include "libMems/Files.h"
00026 #include <sstream>
00027 #include <map>
00028 #include <ctime>
00029 
00030 namespace mems {
00031 
00032 template< typename MatchPtrType >
00033 class GenericMatchList : public std::vector< MatchPtrType > 
00034 {
00035 public:
00036         GenericMatchList(){};
00037         GenericMatchList( const GenericMatchList& ml );
00038         GenericMatchList& operator=( const GenericMatchList& ml );
00039 
00040 
00051         void LoadSMLs( uint mer_size, std::ostream* log_stream, int seed_rank = 0 );
00052 
00065         void CreateMemorySMLs( uint mer_size, std::ostream* log_stream, int seed_rank = 0 );
00066 
00071         static uint GetDefaultMerSize( const std::vector< genome::gnSequence* >& seq_table );
00072         
00077         void Clear();
00078         
00083         void MultiplicityFilter( unsigned mult );
00084 
00089         void LengthFilter( gnSeqI length );
00090 
00097 //      void ExactFilter( valarray< bool >& filter_spec );
00104 //      void IntersectFilter( valarray< bool >& filter_spec );
00105 
00106         
00107         std::vector<std::string> sml_filename;          
00108         std::vector<std::string> seq_filename;          
00109         std::vector<SortedMerList*> sml_table;  
00110         std::vector<genome::gnSequence*> seq_table;             
00112 protected:
00113 
00114 };
00115 
00116 typedef GenericMatchList< Match* > MatchList;
00117 
00118 CREATE_EXCEPTION( InvalidArgument );
00119 
00123 CREATE_EXCEPTION(InvalidFileFormat)
00124 
00125 
00126 
00134 void ReadList( MatchList& mlist, std::istream& match_stream );
00135 
00140 void WriteList( const MatchList& mlist, std::ostream& match_stream );
00141 
00142 typedef void* MatchID_t;
00143 
00144 template< typename MatchPtrType >
00145 GenericMatchList< MatchPtrType >::GenericMatchList( const GenericMatchList< MatchPtrType >& ml ){
00146         *this = ml;
00147 }
00148 
00149 template< typename MatchPtrType >
00150 GenericMatchList< MatchPtrType >& GenericMatchList< MatchPtrType >::operator=( const GenericMatchList< MatchPtrType >& ml ){
00151         std::vector< MatchPtrType >::operator=( ml );
00152         sml_filename = ml.sml_filename;
00153         seq_filename = ml.seq_filename;
00154         sml_table = ml.sml_table;
00155         seq_table = ml.seq_table;
00156         return *this;
00157 }
00158 
00166 template< typename MatchListType >
00167 void LoadSequences( MatchListType& mlist, std::ostream* log_stream ){
00168         
00169         if( mlist.seq_filename.size() == 0 )
00170                 return;
00171 
00172         for( uint seqI = 0; seqI < mlist.seq_filename.size(); seqI++ ){
00173                 genome::gnSequence* file_sequence = new genome::gnSequence();
00174                 // Load the sequence and tell the user if it loaded successfully
00175                 try{
00176                         file_sequence->LoadSource( mlist.seq_filename[ seqI ] );
00177                 }catch( genome::gnException& gne ){
00178                         delete file_sequence;
00179                         if( gne.GetCode() == genome::FileNotOpened() )
00180                                 std::cerr << "Error loading " << mlist.seq_filename[ seqI ] << std::endl;
00181                         else
00182                                 std::cerr << gne;
00183                         return;
00184                 }catch( std::exception& e ){
00185                         delete file_sequence;
00186                         std::cerr << "Unhandled exception loading " << mlist.seq_filename[ seqI ] << std::endl;
00187                         std::cerr << "At: " << __FILE__ << ":" << __LINE__ << std::endl;
00188                         std::cerr << e.what();
00189                         return;
00190                 }catch( ... ){
00191                         delete file_sequence;
00192                         std::cerr << "Unknown exception when loading " << mlist.seq_filename[ seqI ] << std::endl;
00193                         return;
00194                 }
00195                 
00196                 mlist.seq_table.push_back( file_sequence );
00197                 if( log_stream != NULL ){
00198                         (*log_stream) << "Sequence loaded successfully.\n";
00199                         (*log_stream) << mlist.seq_filename[ seqI ] << " " << file_sequence->length() << " base pairs.\n";
00200                 }
00201         }
00202 
00203 }
00204 
00212 template< typename MatchListType >
00213 void LoadAndCreateRawSequences( MatchListType& mlist, std::ostream* log_stream ){
00214         
00215         if( mlist.seq_filename.size() == 0 )
00216                 return;
00217 
00218         for( uint seqI = 0; seqI < mlist.seq_filename.size(); seqI++ ){
00219                 genome::gnSequence* file_sequence = new genome::gnSequence();
00220                 // Load the sequence and tell the user if it loaded successfully
00221                 try{
00222                         file_sequence->LoadSource( mlist.seq_filename[ seqI ] );
00223                 }catch( genome::gnException& gne ){
00224                         delete file_sequence;
00225                         if( gne.GetCode() == genome::FileNotOpened() )
00226                                 std::cerr << "Error loading " << mlist.seq_filename[ seqI ] << std::endl;
00227                         else
00228                                 std::cerr << gne;
00229                         return;
00230                 }catch( std::exception& e ){
00231                         delete file_sequence;
00232                         std::cerr << "Unhandled exception loading " << mlist.seq_filename[ seqI ] << std::endl;
00233                         std::cerr << "At: " << __FILE__ << ":" << __LINE__ << std::endl;
00234                         std::cerr << e.what();
00235                         return;
00236                 }catch( ... ){
00237                         delete file_sequence;
00238                         std::cerr << "Unknown exception when loading " << mlist.seq_filename[ seqI ] << std::endl;
00239                         return;
00240                 }
00241 
00242                 // now create a temporary raw sequence
00243                 std::string tmpfilename = "rawseq";
00244                 tmpfilename = CreateTempFileName("rawseq");
00245                 genome::gnRAWSource::Write( *file_sequence, tmpfilename );
00246                 delete file_sequence;
00247                 registerFileToDelete( tmpfilename );
00248 
00249                 if( log_stream != NULL )
00250                         (*log_stream) << "Storing raw sequence at " << tmpfilename << std::endl;        
00251                 genome::gnRAWSequence* raw_seq = new genome::gnRAWSequence( tmpfilename );
00252                 mlist.seq_table.push_back( raw_seq );
00253                 if( log_stream != NULL ){
00254                         (*log_stream) << "Sequence loaded successfully.\n";
00255                         (*log_stream) << mlist.seq_filename[ seqI ] << " " << raw_seq->length() << " base pairs.\n";
00256                 }
00257         }
00258 }
00259 
00260 
00261 template< typename MatchPtrType >
00262 void GenericMatchList< MatchPtrType >::LoadSMLs( uint mer_size, std::ostream* log_stream, int seed_rank ){
00263 
00264         // if the mer_size parameter is 0 then calculate a default mer size for these sequences
00265         if( mer_size == 0 ){
00266                 mer_size = GetDefaultMerSize( seq_table );
00267                 if( log_stream != NULL ){
00268                         (*log_stream) << "Using weight " << mer_size << " mers for initial seeds\n";
00269                 }
00270         }
00271 
00272         // load and creates SMLs as necessary
00273         uint64 default_seed = getSeed( mer_size, seed_rank );
00274         std::vector< uint > create_list;
00275         uint seqI = 0;
00276         for( seqI = 0; seqI < seq_table.size(); seqI++ ){
00277                 // define a DNAFileSML to store a sorted mer list
00278                 DNAFileSML* file_sml = new DNAFileSML();
00279                 sml_table.push_back( file_sml );
00280 
00281                 boolean success = true;
00282                 try{
00283                         file_sml->LoadFile( sml_filename[ seqI ] );
00284                 }catch( genome::gnException& gne ){
00285                         success = false;
00286                         create_list.push_back( seqI );
00287                 }
00288                 boolean recreate = false;
00289                 if(success && (file_sml->Seed() != default_seed )){
00290                         if( log_stream != NULL )
00291                                 (*log_stream) << "Default seed mismatch.  A new sorted mer list will be created.\n";
00292                         recreate = true;
00293                         create_list.push_back( seqI );
00294                 }
00295 
00296                 if( success && !recreate && log_stream != NULL )
00297                         (*log_stream) << "Sorted mer list loaded successfully\n";
00298         }
00299 
00300         // free up memory before creating any SMLs
00301         if( create_list.size() > 0 )
00302                 for( seqI = 0; seqI < sml_table.size(); seqI++ ){
00303                         sml_table[ seqI ]->Clear();
00304                         delete sml_table[ seqI ];
00305                         sml_table[ seqI ] = NULL;
00306                 }
00307         
00308         // create any SMLs that need to be created
00309         for( uint createI = 0; createI < create_list.size(); createI++ ){
00310                 if( log_stream != NULL )
00311                         (*log_stream) << "Creating sorted mer list\n";
00312                 try{
00313 
00314                 time_t start_time = time(NULL);
00315                 sml_table[ create_list[ createI ] ] = new DNAFileSML( sml_filename[ create_list[ createI ] ] );
00316                 sml_table[ create_list[ createI ] ]->Create( *seq_table[ create_list[ createI ] ], default_seed );
00317                 time_t end_time = time(NULL);
00318                 if( log_stream != NULL )
00319                         (*log_stream) << "Create time was: " << end_time - start_time << " seconds.\n";
00320                 
00321                 }catch(...){
00322                         std::cerr << "Error creating sorted mer list\n";
00323                         throw;
00324                 }
00325         }
00326         
00327         // reload the other SMLs now that creation has completed
00328         if( create_list.size() > 0 ){
00329                 for( seqI = 0; seqI < seq_filename.size(); seqI++ ){
00330                         if( sml_table[ seqI ] != NULL )
00331                                 continue;
00332                         sml_table[ seqI ] = new DNAFileSML( sml_filename[ seqI ] );
00333                         try{
00334                                 ((DNAFileSML*)sml_table[ seqI ])->LoadFile( sml_filename[ seqI ] );
00335                         }catch( genome::gnException& gne ){
00336                                 std::cerr << "Error loading sorted mer list\n";
00337                                 throw;
00338                         }
00339                 }
00340         }
00341 }
00342 
00343 template< typename MatchPtrType >
00344 uint GenericMatchList< MatchPtrType >::GetDefaultMerSize( const std::vector< genome::gnSequence* >& seq_table ){
00345         gnSeqI total_len = 0;
00346         for( uint seqI = 0; seqI < seq_table.size(); seqI++ )
00347                 total_len += seq_table[ seqI ]->length();
00348         return getDefaultSeedWeight( total_len / seq_table.size() );
00349 }
00350 
00351 
00363 template< typename MatchListType >
00364 void LoadMFASequences( MatchListType& mlist, const std::string& mfa_filename, std::ostream* log_stream ) {
00365         genome::gnSequence file_sequence;
00366         // Load the sequence and tell the user if it loaded successfully
00367         try{
00368                 file_sequence.LoadSource( mfa_filename );
00369         }catch( genome::gnException& gne ){
00370                 if( gne.GetCode() == genome::FileNotOpened() )
00371                         std::cerr << "Error loading " << mfa_filename << std::endl;
00372                 else
00373                         std::cerr << gne;
00374                 return;
00375         }catch( std::exception& e ){
00376                 std::cerr << "Unhandled exception loading " << mfa_filename << std::endl;
00377                 std::cerr << "At: " << __FILE__ << ":" << __LINE__ << std::endl;
00378                 std::cerr << e.what();
00379                 return;
00380         }catch( ... ){
00381                 std::cerr << "Unknown exception when loading " << mfa_filename << std::endl;
00382                 return;
00383         }
00384 
00385         mlist.seq_filename.clear();
00386         gnSeqI total_len = 0;
00387         for( uint contigI = 0; contigI < file_sequence.contigListSize(); contigI++ ){
00388                 genome::gnSequence* contig_seq = new genome::gnSequence( file_sequence.contig( contigI ) );
00389                 mlist.seq_filename.push_back( mfa_filename );
00390 //              mlist.seq_filename.push_back( file_sequence.contigName( contigI ) );
00391                 if( log_stream != NULL ){
00392                         (*log_stream) << "Sequence loaded successfully.\n";
00393                         (*log_stream) << mlist.seq_filename[ contigI ] << " " << contig_seq->length() << " base pairs.\n";
00394                 }
00395                 mlist.seq_table.push_back( contig_seq );
00396         }
00397 }
00398 
00399 template< typename MatchPtrType >
00400 void GenericMatchList< MatchPtrType >::CreateMemorySMLs( uint mer_size, std::ostream* log_stream, int seed_rank ) 
00401 {
00402         // if the mer_size parameter is 0 then calculate a default mer size for these sequences
00403         if( mer_size == 0 ){
00404                 mer_size = GetDefaultMerSize( seq_table );
00405                 if( log_stream != NULL ){
00406                         (*log_stream) << "Using " << mer_size << "-mers for initial seeds\n";
00407                 }
00408         }
00409 
00410         uint64 default_seed = getSeed( mer_size, seed_rank );
00411 
00412         // define a DNAMemorySML to store a sorted mer list
00413         for( uint contigI = 0; contigI < seq_table.size(); contigI++ )
00414         {
00415                 DNAMemorySML* contig_sml = new DNAMemorySML();
00416                 boolean success = true;
00417                 if( log_stream != NULL )
00418                         (*log_stream) << "Creating sorted mer list\n";
00419                 time_t start_time = time(NULL);
00420                 contig_sml->Create( *seq_table[contigI], default_seed );
00421                 time_t end_time = time(NULL);
00422                 if( log_stream != NULL )
00423                         (*log_stream) << "Create time was: " << end_time - start_time << " seconds.\n";
00424                 
00425                 sml_table.push_back( contig_sml );
00426         }
00427 }
00428 
00429 template< typename MatchPtrType >
00430 void GenericMatchList< MatchPtrType >::Clear() {
00431         for( uint seqI = 0; seqI < seq_table.size(); seqI++ ){
00432                 if( seq_table[ seqI ] != NULL )
00433                         delete seq_table[ seqI ];
00434         }
00435         for( uint seqI = 0; seqI < sml_table.size(); seqI++ ){
00436                 if( sml_table[ seqI ] != NULL )
00437                         delete sml_table[ seqI ];
00438         }
00439         typename std::vector<MatchPtrType>::iterator match_iter = this->begin();
00440         for(; match_iter != this->end(); match_iter++ ){
00441                 (*match_iter)->Free();
00442                 (*match_iter) = NULL;
00443         }
00444         seq_table.clear();
00445         sml_table.clear();
00446         this->clear();
00447         seq_filename.clear();
00448         sml_filename.clear();
00449 }
00450 
00454 template< class FromType, class ToType, class MatchListType >
00455 void RemapSubsetMatchAddresses( std::map<FromType, ToType>& old_to_new_map, MatchListType& match_list );
00456 
00457 
00458 template< class FromType, class ToType, class MatchListType >
00459 void RemapSubsetMatchAddresses( std::map<FromType, ToType>& old_to_new_map, MatchListType& match_list )
00460 {
00461         // now remap the subset and superset links
00462         typename MatchListType::iterator match_iter = match_list.begin();
00463         //typedef typename MatchListType::value_type MatchType;
00464         //typedef typename Match MatchType;
00465         typename std::map<FromType, ToType>::iterator map_iter;
00466         for(; match_iter != match_list.end(); ++match_iter ){
00467                 // remap all subsets
00468                 std::set< Match* >& subsets = (*match_iter)->Subsets();
00469                 std::set< Match* > new_subsets;
00470                 std::set< Match* >::iterator sub_iter = subsets.begin();
00471                 for(; sub_iter != subsets.end(); ++sub_iter ){
00472                         map_iter = old_to_new_map.find( (FromType)*sub_iter );
00473                         new_subsets.insert( map_iter->second );
00474                 }
00475                 subsets = new_subsets;
00476 
00477                 // remap all supersets
00478                 std::set< Match* >& supersets = (*match_iter)->Supersets();
00479                 std::set< Match* > new_supersets;
00480                 std::set< Match* >::iterator super_iter = supersets.begin();
00481                 for(; super_iter != supersets.end(); ++super_iter ){
00482                         map_iter = old_to_new_map.find( (FromType)*super_iter );
00483                         new_supersets.insert( map_iter->second );
00484                 }
00485                 supersets = new_supersets;
00486         }
00487 }
00488 
00489 inline
00490 void ReadList(MatchList& mlist, std::istream& match_file)
00491 {
00492         std::string tag;
00493         gnSeqI len;
00494         int64 start;
00495         unsigned int seq_count;
00496         
00497         match_file >> tag;      //format version tag
00498         if( tag != "FormatVersion" ){
00499                 Throw_gnEx(InvalidFileFormat());
00500         }
00501         match_file >> tag;      //format version
00502         if( tag != "3" ){
00503                 Throw_gnEx(InvalidFileFormat());
00504         }
00505         match_file >> tag;      //sequence count tag
00506         if( tag != "SequenceCount" ){
00507                 Throw_gnEx(InvalidFileFormat());
00508         }
00509         match_file >> seq_count;        //sequence count
00510         if(seq_count < 2){
00511                 Throw_gnEx(InvalidFileFormat());
00512         }
00513         
00514         // read the sequence file names and lengths
00515         for( unsigned int seqI = 0; seqI < seq_count; seqI++ ){         
00516                 match_file >> tag;      // name tag
00517                 std::getline( match_file, tag );
00518                 // skip the tab character
00519                 tag = tag.substr( 1 );
00520                 mlist.seq_filename.push_back(tag);
00521                 match_file >> tag;      // length tag
00522                 gnSeqI seq_len;
00523                 match_file >> seq_len;  // length
00524                 if( seqI < mlist.seq_table.size() )
00525                         if( mlist.seq_table[ seqI ]->length() != seq_len ){
00526                                 std::cerr << "Warning: Genome sizes in the match list differ.\n";
00527                                 std::cerr << "seq_table[ " << seqI << " ]->length() " << mlist.seq_table[ seqI ]->length() << " seq_len: " << seq_len << std::endl;
00528                         }
00529         }
00530 
00531         // read the number of matches
00532         unsigned int match_count;
00533         match_file >> tag;      // match count tag
00534         match_file >> match_count;      // match count
00535                 
00536         // read the matches
00537         std::map< MatchID_t, Match* > match_map;
00538         std::string cur_line;
00539         std::getline( match_file, cur_line );
00540         while( getline( match_file, cur_line ) ){
00541                 Match mhe( seq_count );
00542                 std::stringstream line_stream( cur_line );
00543                 
00544                 line_stream >> len;
00545                 mhe.SetLength(len);
00546 
00547                 for(uint32 seqI = 0; seqI < seq_count; seqI++){
00548                         line_stream >> start;
00549                         mhe.SetStart(seqI, start);
00550                 }
00551                 
00552                 MatchID_t match_id;
00553                 line_stream >> match_id;
00554                 
00555                 uint sub_count;
00556                 boolean bad_stream = false;
00557                 line_stream >> sub_count;
00558                 if(sub_count > 0)
00559                         throw "Unable to read file, invalid format, cannot read subset data\n";
00560 
00561                 if( bad_stream )
00562                         break;
00563 
00564                 uint sup_count;
00565                 line_stream >> sup_count;
00566                 if(sub_count > 0)
00567                         throw "Unable to read file, invalid format, cannot read superset data\n";
00568                 if( bad_stream )
00569                         break;
00570                 
00571                 Match* new_match = mhe.Copy();
00572                 mlist.push_back( new_match );
00573                 match_map.insert( std::map< MatchID_t, Match* >::value_type( match_id, new_match ));
00574         }
00575         if( match_count != mlist.size() ){
00576                 Throw_gnEx(InvalidFileFormat());
00577         }
00578 }
00579 
00580 inline
00581 void WriteList( const MatchList& mlist, std::ostream& match_file)
00582 {
00583         if( mlist.size() == 0 )
00584                 return;
00585         Match* first_mem = *(mlist.begin());
00586         unsigned int seq_count = first_mem->SeqCount();
00587 
00588         match_file << "FormatVersion" << '\t' << 3 << "\n";
00589         match_file << "SequenceCount" << '\t' << seq_count << "\n";
00590         for(unsigned int seqI = 0; seqI < seq_count; seqI++){
00591                 match_file << "Sequence" << seqI << "File" << '\t';
00592                 if( mlist.seq_filename.size() > seqI )
00593                         match_file << mlist.seq_filename[seqI];
00594                 else
00595                         match_file << "null";
00596                 match_file << "\n";
00597                 match_file << "Sequence" << seqI << "Length" << '\t';
00598                 if( mlist.seq_table.size() > seqI )
00599                         match_file << mlist.seq_table[seqI]->length();
00600                 else
00601                         match_file << "0";
00602                 match_file << "\n";
00603         }
00604 
00605         match_file << "MatchCount" << '\t' << mlist.size() << std::endl;
00606 
00607         //get all the mems out of the hash table and write them out
00608         std::vector<Match*>::const_iterator match_iter;
00609         match_iter = mlist.begin();
00610         std::set<Match*> cur_set;
00611         std::set<Match*>::iterator set_iter;
00612         for(; match_iter != mlist.end(); match_iter++){
00613                 // print the match
00614                 match_file << **match_iter << '\t';
00615 
00616                 // print the match address
00617                 match_file << (MatchID_t)(*match_iter) << '\t';
00618                 
00619                 // print subset id's
00620                 match_file << 0;
00621 
00622                 // print superset id's
00623                 match_file << '\t' << 0;
00624                 match_file << std::endl;
00625         }
00626 }
00627 
00628 template< typename MatchPtrType >
00629 void GenericMatchList< MatchPtrType >::MultiplicityFilter( unsigned mult ){
00630 
00631         size_t cur = 0;
00632         for( uint memI = 0; memI < this->size(); memI++ ){
00633                 if( (*this)[ memI ]->Multiplicity() == mult )
00634                         (*this)[cur++] = (*this)[memI];
00635                 else{
00636                         (*this)[ memI ]->Free();
00637                         (*this)[ memI ] = NULL;
00638                 }
00639         }
00640         this->resize(cur);
00641 }
00642 
00643 template< typename MatchPtrType >
00644 void GenericMatchList< MatchPtrType >::LengthFilter( gnSeqI length ){
00645 
00646         size_t cur = 0;
00647         for( size_t memI = 0; memI < this->size(); memI++ ){
00648                 if( (*this)[ memI ]->Length() >= length )
00649                         (*this)[cur++] = (*this)[memI];
00650                 else{
00651                         (*this)[ memI ]->Free();
00652                         (*this)[ memI ] = NULL;
00653                 }
00654         }
00655         this->resize(cur);
00656 }
00657 
00658 }
00659 
00660 #endif  //_MatchList_h_

Generated on Fri Mar 14 06:01:03 2008 for libMems by doxygen 1.3.6