libMems/FileSML.cpp

Go to the documentation of this file.
00001 /*******************************************************************************
00002  * $Id: FileSML.cpp,v 1.22 2004/04/26 21:13:58 darling Exp $
00003  * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
00004  * Please see the file called COPYING for licensing, copying, and modification
00005  * Please see the file called COPYING for licensing details.
00006  * **************
00007  ******************************************************************************/
00008 
00009 
00010 #include "libMems/FileSML.h"
00011 // for CreateTempFileName():
00012 #include "libMems/Aligner.h"
00013 #include "libGenome/gnFilter.h"
00014 #include "libGenome/gnRAWSource.h"
00015 #include <algorithm>
00016 #include <cmath>
00017 #include "boost/filesystem/operations.hpp"
00018 
00019 using namespace std;
00020 using namespace genome;
00021 namespace mems {
00022 
00023 FileSML& FileSML::operator=(const FileSML& sa)
00024 {
00025         SortedMerList::operator=( sa );
00026         filename = sa.filename;
00027         sarray_start_offset = sa.sarray_start_offset;
00028         seq_coords = sa.seq_coords;
00029         sarfile.open(filename.c_str(), ios::binary | ios::in );
00030         if(!sarfile.is_open()){
00031                 DebugMsg("FileSML::=: Unable to open suffix array file.\n");
00032                 sarfile.clear();
00033                 return *this;
00034         }
00035         return *this;
00036 }
00037 
00038 void FileSML::Clear() {
00039         SortedMerList::Clear();
00040         filename = "";
00041         sarfile.close();
00042         sarray_start_offset = 0;
00043         seq_coords.clear();
00044 }
00045 
00046 void FileSML::LoadFile(const string& fname){
00047         filename = fname;
00048         sarfile.open(fname.c_str(), ios::binary | ios::in );
00049         if(!sarfile.is_open()){
00050                 sarfile.clear();
00051                 Throw_gnExMsg( FileNotOpened(), "Unable to open file.\n");
00052         }
00053         // read the header into a temporary header struct just
00054         // in case it's bogus
00055         SMLHeader tmp_header;
00056         sarfile.read((char*)&tmp_header, sizeof(struct SMLHeader));
00057         if(sarfile.gcount() < (int)sizeof(struct SMLHeader)){
00058                 sarfile.clear();
00059                 Throw_gnExMsg( FileUnreadable(), "Unable to read file.");
00060         }
00061         if(tmp_header.version != FormatVersion()){
00062                 Throw_gnExMsg( FileUnreadable(), "Unsupported file format.");
00063         }
00064         header = tmp_header;
00065         
00066         SetMerMaskSize( header.seed_weight );
00067         seed_mask = mer_mask;
00068         SetMerMaskSize( header.seed_length );
00069 
00070         //header is ok.  read the sequence.
00071         gnSeqI seq_len = header.length;
00072         if(header.circular)
00073                 seq_len += header.seed_length - 1;
00074         binary_seq_len = ((uint64)seq_len * (uint64)header.alphabet_bits) / 32;
00075         if(((uint64)seq_len * (uint64)header.alphabet_bits) % 32 != 0)
00076                 binary_seq_len++;
00077         binary_seq_len+=2;      //fix for access violations.
00078 
00079         if(sequence != NULL)
00080                 delete[] sequence;
00081         sequence = new uint32[binary_seq_len];
00082 
00083         sarfile.read((char*)sequence, binary_seq_len*sizeof(uint32));
00084         if(sarfile.gcount() < (int64)(binary_seq_len*sizeof(uint32))){
00085                 sarfile.clear();
00086                 Throw_gnExMsg( FileUnreadable(), "Error reading sequence data.");
00087         }
00088 
00089         sarray_start_offset = sarfile.tellg();
00090         sarfile.seekg(sarray_start_offset + sizeof(gnSeqI) * header.length);
00091         if(!sarfile.good()){
00092                 sarfile.clear();
00093                 Throw_gnExMsg( FileUnreadable(), "Premature end of file.");
00094         }
00095         filename = fname;
00096 
00097         // create a memory-map to the data of interest
00098         sardata.open(fname);
00099         
00100         // check whether there is a .coords mask file to read
00101         string coordfile = filename + ".coords";
00102         ifstream coord_in( coordfile.c_str() );
00103         if( coord_in.is_open() ){
00104                 seq_coords.clear();
00105                 int64 cur_coord;
00106                 while( coord_in >> cur_coord ){
00107                         seq_coords.push_back( cur_coord );
00108                 }
00109         }
00110 }
00111 
00112 void FileSML::OpenForWriting( boolean truncate ){
00113         // Open smlfile for writing
00114         boolean was_open = sarfile.is_open();
00115         if(was_open)
00116                 sarfile.close();
00117         if( truncate )
00118                 sarfile.open(filename.c_str(), ios::binary | ios::in | ios::out | ios::trunc );
00119         else
00120                 sarfile.open(filename.c_str(), ios::binary | ios::in | ios::out );
00121         if(!sarfile.is_open() || !sarfile.good()){
00122                 sarfile.clear();
00123                 if(was_open)
00124                         sarfile.open(filename.c_str(), ios::binary | ios::in );
00125                 Throw_gnExMsg(FileNotOpened(), "Unable to open file for writing.");
00126         }
00127 }
00128 
00129 boolean FileSML::WriteHeader(){
00130         if(!sarfile.is_open()){
00131                 Throw_gnExMsg(IOStreamFailed(), "File is not valid.");
00132         }
00133         boolean success = true;
00134         const char* errormsg = "";
00135         // Open sarfile for writing and write new header.
00136         OpenForWriting( false );
00137         sarfile.write((char*)&header, sizeof(struct SMLHeader));
00138         if(!sarfile.good()){
00139                 errormsg = "Error writing header to disk.";
00140                 success = false;
00141         }
00142 
00143         // reopen the sorted mer list file read-only
00144         sarfile.close();
00145         sarfile.open(filename.c_str(), ios::binary | ios::in );
00146         if(!sarfile.is_open()){
00147                 errormsg = "Error opening sorted mer list file.";
00148                 success = false;
00149         }
00150         if(!success)
00151                 Throw_gnExMsg(IOStreamFailed(), errormsg);
00152         return success;
00153 }
00154 
00155 gnSeqI FileSML::UniqueMerCount(){
00156         gnSeqI tmp_count = header.unique_mers;
00157         SortedMerList::UniqueMerCount();
00158         if(tmp_count != header.unique_mers)
00159                 WriteHeader();
00160         return header.unique_mers;
00161 }
00162 
00163 //change the description in memory and on disk
00164 void FileSML::SetDescription(const string& d){
00165         strncpy(header.description, d.c_str(), DESCRIPTION_SIZE-1);
00166         WriteHeader();
00167 } 
00168 
00169 void FileSML::SetID(const sarID_t d){
00170         header.id = d;
00171         WriteHeader();
00172 }
00173 
00174 
00175 extern "C" {
00176 #include "libMems/dmSML/dmsort.h"
00177 }
00178 
00179 char** FileSML::tmp_paths = NULL;
00180 
00181 void FileSML::registerTempPath( const string& path ) {
00182         string tmp_path = path;
00183         // add trailing path separator if necessary
00184 #ifdef WIN32
00185         if( tmp_path[ tmp_path.size() - 1 ] != '\\' )
00186                 tmp_path += "\\";
00187 #else
00188         if( tmp_path[ tmp_path.size() - 1 ] != '/' )
00189                 tmp_path += "/";
00190 #endif
00191                 
00192         if( tmp_paths == NULL ){
00193                 tmp_paths = new char*[1];
00194                 tmp_paths[ 0 ] = NULL;
00195         }
00196         
00197         int path_count = 0;
00198         while( tmp_paths[ path_count ] != NULL )
00199                 path_count++;
00200 
00201         // create a new array with room for another element
00202         char** tmp_tmp_paths = new char*[ path_count+1 ];
00203         // copy old elements
00204         for( int pathI = 0; pathI < path_count; pathI++ )
00205                 tmp_tmp_paths[ pathI ] = tmp_paths[ pathI ];
00206         // add new element
00207         tmp_tmp_paths[ path_count ] = new char[ tmp_path.size() + 1 ];
00208         strncpy( tmp_tmp_paths[ path_count ], tmp_path.c_str(), tmp_path.size() + 1 );
00209         // set null terminator element
00210         tmp_tmp_paths[ path_count + 1 ] = NULL;
00211         
00212         // set new paths
00213         char** old_paths = tmp_paths;
00214         tmp_paths = tmp_tmp_paths;
00215         
00216         // free old array
00217         delete[] old_paths;
00218 }
00219 
00220 const char* FileSML::getTempPath( int pathI ){
00221         return tmp_paths[ pathI ];
00222 }
00223 
00224 int FileSML::getTempPathCount(){
00225         int path_count = 0;
00226         while( tmp_paths && tmp_paths[ path_count ] != NULL )
00227                 path_count++;
00228         return path_count;
00229 }
00230 
00231 
00232 void maskNNNNN( const gnSequence& in_seq, gnSequence& out_seq, vector< int64 >& seq_coords, int mask_n_length ) {
00233         
00234         gnSeqI seqI = 1;
00235         gnSeqI read_length = 1024*1024;
00236         string cur_seq;
00237         gnSeqI n_count = 0;
00238         gnSeqI n_stretch_start = 0;
00239         gnSeqI n_stretch_end = 1;
00240 
00241         while( seqI <= in_seq.length() ){
00242                 read_length = seqI + read_length < in_seq.length() ? read_length : in_seq.length() - seqI + 1;
00243                 in_seq.ToString( cur_seq, read_length, seqI );
00244                 
00245                 uint charI = 0;
00246                 for( ; charI < cur_seq.size(); charI++ ){
00247                         if( cur_seq[ charI ] == 'N' || cur_seq[ charI ] == 'n' ){
00248                                 if( n_count == 0 ){
00249                                         n_stretch_start = seqI + charI;
00250                                 }
00251                                 n_count++;
00252                         }else{
00253                                 if( n_count > mask_n_length ){
00254                                         if( n_stretch_start - n_stretch_end != 0 ){
00255                                                 // Add the sequence region to the output sequence
00256                                                 out_seq += in_seq.subseq( n_stretch_end, n_stretch_start - n_stretch_end );
00257                                                 // add the masked coordinates
00258                                                 seq_coords.push_back( n_stretch_end );
00259                                                 seq_coords.push_back( n_stretch_start - 1 );
00260                                         }
00261                                         // update n_stretch_end to the first non N character
00262                                         n_stretch_end = seqI + charI;
00263                                 }
00264                                 n_count = 0;
00265                         }
00266                 }
00267                 seqI += read_length;
00268         }
00269         out_seq += in_seq.subseq( n_stretch_end, seqI - n_stretch_end );
00270 
00271         // add the masked coordinates
00272         seq_coords.push_back( n_stretch_end );
00273         seq_coords.push_back( seqI - 1 );
00274 }
00275 
00276         // use dmSML to construct the SML
00277         // then read it in using LoadFile()
00278 void FileSML::dmCreate(const gnSequence& seq, const uint64 seed){
00279         // Filter NNNNNs
00280         gnSequence masked_seq;
00281         seq_coords.clear();
00282         maskNNNNN( seq, masked_seq, seq_coords, 0 );
00283         
00284         // write a raw sequence to a tmp file stored in the first scratch path
00285         string rawfile = CreateTempFileName("dm_rawseq");
00286         gnRAWSource::Write( masked_seq, rawfile.c_str() );
00287         
00288         // write a sequence coordinate file
00289         if( seq_coords.size() > 0 ){
00290                 string coordfile = filename + ".coords";
00291                 ofstream coord_out( coordfile.c_str() );
00292                 if( !coord_out.is_open() ){
00293                         cerr << "Could not open " << coordfile << endl;
00294                         throw "";
00295                 }
00296                 
00297                 for( int coordI = 0; coordI < seq_coords.size(); coordI+=2 ){
00298                         coord_out << seq_coords[ coordI ] << '\t' << seq_coords[ coordI + 1 ] << endl;
00299                 }
00300                 coord_out.close();
00301         }
00302         
00303         
00304         // run dmSML
00305         const char* const* scratch_paths = (const char* const*)tmp_paths;
00306         sarfile.close();
00307         int rval = dmSML( rawfile.c_str(), filename.c_str(), scratch_paths, seed );
00308         if( rval != 0 )
00309                 cerr << "Crap.  It's broke, return value " << rval << endl;
00310         
00311         boost::filesystem::remove( rawfile );
00312         // load the sorted mer list
00313         LoadFile( filename );
00314 }
00315 
00316 void FileSML::Create(const gnSequence& seq, const uint64 seed){
00317 
00318         vector<bmer> sml_array;
00319         bool is_spaced_seed = getSeedWeight(seed) != getSeedLength(seed);
00320         OpenForWriting( true );
00321 
00322         try{
00323                 SortedMerList::Create( seq, seed );
00324                 
00325                 if( is_spaced_seed )
00326                         FillDnaSeedSML(seq, sml_array);
00327                 else
00328                         FillSML(seq, sml_array);
00329 
00330         }catch(...){    
00331                 // if there was a memory allocation error then
00332                 // try using dmSML to do an external sort
00333                 sarfile.clear();
00334                 sarfile.close();
00335                 sarfile.clear();
00336                 if( sequence != NULL )
00337                         delete[] sequence;
00338                 binary_seq_len = 0;
00339 
00340                 dmCreate( seq, seed );
00341         }
00342 
00343 //      RadixSort(s_array);
00344         sort(sml_array.begin(), sml_array.end(), &bmer_lessthan);
00345         
00346         /* now write out the file header */
00347         sarfile.write((char*)&header, sizeof(struct SMLHeader));
00348 
00349         if(!sarfile.good()){
00350                 sarfile.clear();
00351                 Throw_gnExMsg( IOStreamFailed(), "Error writing sorted mer list header to disk.\n");
00352         }
00353 
00354         /* write out the actual sequence */
00355         sarfile.write((char*)sequence, binary_seq_len*sizeof(uint32));
00356         sarray_start_offset = sarfile.tellg();
00357 
00358         /* write out the sorted mer list */
00359         for(gnSeqI suffixI=0; suffixI < sml_array.size(); suffixI++)
00360                 sarfile.write((char*)&(sml_array[suffixI].position), sizeof(smlSeqI_t));
00361         
00362         sarfile.flush();
00363         if(!sarfile.good()){
00364                 sarfile.clear();
00365                 Throw_gnExMsg( IOStreamFailed(), "Error writing sorted mer list to disk.\n");
00366         }
00367         // reopen the sorted mer list file read-only
00368         sarfile.close();
00369         sarfile.open(filename.c_str(), ios::binary | ios::in );
00370         if(!sarfile.is_open())
00371                 Throw_gnExMsg( FileNotOpened(), "FileSML::Create: Error opening sorted mer list file.\n");
00372 
00373         sardata.open(filename);
00374 }
00375 
00376 bmer FileSML::operator[](gnSeqI index)
00377 {
00378         bmer tmp_mer;
00379         tmp_mer.position = base()[index];
00380         tmp_mer.mer = GetSeedMer(tmp_mer.position);
00381         return tmp_mer;
00382 }
00383 
00384 
00385 boolean FileSML::Read(vector<bmer>& readVector, gnSeqI size, const gnSeqI offset)
00386 {
00387         if(!sarfile.is_open()){
00388                 DebugMsg("FileSML::Read: Error sar file not open.\n");
00389                 return false;
00390         }
00391 
00392         gnSeqI total_len = SMLLength();
00393         if(offset >= total_len){
00394                 readVector.clear();
00395                 return false;
00396         }
00397         gnSeqI readlen = offset + size < total_len ? size : total_len - offset;
00398         
00399         readVector.resize( readlen );
00400 
00401         //copy data to the vector
00402         for(gnSeqI j=0; j < readlen; j++){
00403                 bmer tmp_mer;
00404                 tmp_mer.position = base()[offset+j];
00405                 if( tmp_mer.position > header.length ){
00406                         string errmsg = "Corrupted SML, position ";
00407                         errmsg += tmp_mer.position + " is out of range\n";
00408                         ErrorMsg( errmsg );
00409                         cerr << errmsg;
00410                 }else
00411                         tmp_mer.mer = GetSeedMer(tmp_mer.position);
00412                 readVector[ j ] = tmp_mer;
00413         }
00414         return true;
00415 }
00416 
00417 void FileSML::BigCreate(const gnSequence& seq, const uint32 split_levels, const uint32 mersize){
00418 //      unsigned long freemem = wxGetFreeMemory();      //get the amount of free memory.
00419 //      unsigned long neededmem = GetNeededMemory(seq.length());
00420 //      if(neededmem >= freemem && neededmem > MEMORY_MINIMUM){ // divide and conquer
00421         if(split_levels > 0){   // split_levels defines the number of times to divide and conquer
00422                 uint64 midpoint = seq.length() / 2;
00423                 midpoint = (midpoint * header.alphabet_bits) / 32;
00424                 midpoint = (midpoint / header.alphabet_bits) * 32;
00425                 gnSequence seqA = seq.subseq(1, midpoint);
00426                 gnSequence seqB = seq.subseq(1 + midpoint, seq.length() - midpoint);
00427                 seqA.setCircular(false);
00428                 seqB.setCircular(false);
00429                 cout << "Splitting " << seq.length() << " to " << seqA.length() << " and " << seqB.length() << "\n";
00430 
00431                 //create the first sar
00432                 string temp_sarfile = CreateTempFileName("bdsa_split");
00433                 FileSML* temp_sar = this->Clone();
00434                 temp_sar->filename = temp_sarfile.c_str();
00435                 temp_sar->BigCreate(seqA, split_levels - 1, mersize);
00436 
00437                 //create the second sar
00438                 string temp_sarfile2 = CreateTempFileName("bdsa_split");
00439                 FileSML* temp_sar2 = this->Clone();
00440                 temp_sar2->filename = temp_sarfile2.c_str();
00441                 temp_sar2->BigCreate(seqB, split_levels - 1, mersize);
00442 
00443                 //merge them to this file
00444                 cout << "Merging " << seqA.length() << " and " << seqB.length() << "\n";
00445                 Merge(*temp_sar, *temp_sar2);
00446                 //free up RAM
00447                 delete temp_sar;
00448                 delete temp_sar2;
00449                 //erase the temp files.
00450                 boost::filesystem::remove(temp_sarfile);
00451                 boost::filesystem::remove(temp_sarfile2);
00452         }else{
00453                 Create(seq, mersize);
00454         }
00455 }
00456 
00457 void FileSML::RadixSort(vector<bmer>& s_array){
00458         vector<bmer> *source_buckets;
00459         vector<bmer> *tmp_buckets;
00460         vector<bmer> *buckets;
00461         uint32 radix_size = 11;
00462         uint64 radix_mask = 0xFFFFFFFF;
00463         radix_mask <<= 32;
00464         radix_mask |= 0xFFFFFFFF;
00465         radix_mask >>= 64 - radix_size;
00466         
00467         uint32 bucket_count = (uint32) pow((double)2, (double)radix_size);
00468         uint32 cur_shift_bits = 0;
00469         buckets = new vector<bmer>[bucket_count];
00470         source_buckets = new vector<bmer>[bucket_count];
00471         uint64 cur_bucket;
00472         for(uint32 merI = 0; merI < s_array.size(); merI++){
00473                 cur_bucket = s_array[merI].mer & radix_mask;
00474                 source_buckets[cur_bucket].push_back(s_array[merI]);
00475         }
00476         s_array.clear();
00477         cur_shift_bits += radix_size;
00478         radix_mask <<= radix_size;
00479         while(cur_shift_bits < 64){
00480                 for(uint32 bucketI = 0; bucketI < bucket_count; bucketI++){
00481                         for(uint32 merI = 0; merI < source_buckets[bucketI].size(); merI++){
00482                                 cur_bucket = source_buckets[bucketI][merI].mer & radix_mask;
00483                                 cur_bucket >>= cur_shift_bits;
00484                                 buckets[cur_bucket].push_back(source_buckets[bucketI][merI]);
00485                         }
00486                         source_buckets[bucketI].clear();
00487                 }
00488                 cur_shift_bits += radix_size;
00489                 radix_mask <<= radix_size;
00490                 tmp_buckets = source_buckets;
00491                 source_buckets = buckets;
00492                 buckets = tmp_buckets;
00493         }
00494         s_array.clear();
00495         for(uint32 bucketI = 0; bucketI < bucket_count; bucketI++){
00496                 for(uint32 merI = 0; merI < source_buckets[bucketI].size(); merI++){
00497                         s_array.push_back(source_buckets[bucketI][merI]);
00498                 }
00499                 source_buckets[bucketI].clear();
00500         }
00501         delete[] source_buckets;
00502         delete[] buckets;
00503 }
00504 
00505 //Merges the supplied sorted mer lists into this one, overwriting the existing sml.
00506 //KNOWN BUG:  The first sorted mer list must have (length * alphabet_bits) / word_bits == 0
00507 //for Merge to work properly.
00508 void FileSML::Merge(SortedMerList& sa, SortedMerList& sa2){
00509 STACK_TRACE_START
00510         SMLHeader sa_head = sa.GetHeader();
00511         SMLHeader sa_head2 = sa2.GetHeader();
00512         
00513         //basic copying
00514         header = sa_head;
00515         //take the smaller mer_size
00516         if(sa_head.seed_length < sa_head2.seed_length){
00517                 header.seed_length = sa_head.seed_length;
00518                 mer_mask = sa.GetMerMask();
00519         }else{
00520                 header.seed_length = sa_head2.seed_length;
00521                 mer_mask = sa2.GetMerMask();
00522         }
00523         header.unique_mers = NO_UNIQUE_COUNT;
00524         header.length += sa_head2.length;
00525 
00526         //allocate some memory
00527         const uint32 SEQ_BUFFER_SIZE = 200000;
00528         Array<uint32> seq_buf ( SEQ_BUFFER_SIZE + header.seed_length );
00529 
00530         //do some sanity checks on the sars we're merging.
00531         if(sa_head.alphabet_bits != sa_head2.alphabet_bits ||
00532           sa_head.version != sa_head2.version ||
00533           memcmp(sa_head.translation_table, sa_head2.translation_table, UINT8_MAX)){
00534                 Throw_gnExMsg(SMLMergeError(), "Incompatible sorted mer lists.");
00535         }
00536         
00537         OpenForWriting( true );
00538 
00539         //write the header
00540         sarfile.write((char*)&header, sizeof(struct SMLHeader));
00541         if(!sarfile.good()){
00542                 sarfile.clear();
00543                 sarfile.close();
00544                 sarfile.open(filename.c_str(), ios::binary | ios::in );
00545                 Throw_gnExMsg(IOStreamFailed(), "Error writing sorted mer list header to disk.");
00546         }
00547 
00548         //copy sequence data into memory.
00549         uint32 binary_seq_len = (header.length * header.alphabet_bits) / 32;
00550         if((header.length * header.alphabet_bits) % 32 > 0)
00551                 binary_seq_len++;
00552 
00553         //The +1 is to avoid access violations when copying in the
00554         //binary sequence before shifting.
00555         if( sequence != NULL )
00556                 delete[] sequence;
00557         sequence = new uint32[binary_seq_len+1];
00558         sa.GetBSequence(sequence, sa_head.length, 0);
00559 
00560         uint32 bseq_len1 = (sa_head.length * sa_head.alphabet_bits) / 32;
00561         uint32 bseq_remainder = (sa_head.length * sa_head.alphabet_bits) % 32;
00562         if(bseq_remainder > 0){
00563                 sa2.GetBSequence(&(sequence[bseq_len1]), sa_head2.length, 0);
00564                 //mask off the end of the first sequence
00565                 uint32 end_mask = 0xFFFFFFFF;
00566                 end_mask <<= bseq_remainder;
00567                 sequence[bseq_len1] &= end_mask;
00568 
00569                 //shift the second sequence over.
00570                 for(uint32 i=bseq_len1; i < binary_seq_len; i++){
00571                         uint32 tmp = sequence[i+1];
00572                         tmp >>= 32 - bseq_remainder;
00573                         sequence[i] |= tmp;
00574                         sequence[i+1] <<= bseq_remainder;
00575                 }
00576         }else
00577                 sa2.GetBSequence(&(sequence[bseq_len1]), sa_head2.length, 0);
00578         
00579         //write the sequence
00580         sarfile.write((char*)sequence, binary_seq_len * sizeof(uint32));
00581         sarray_start_offset = sarfile.tellg();
00582 
00583         //get new mers in the middle
00584         vector<bmer> middle_mers;
00585         bmer mid_mer;
00586         for(uint32 midI = sa_head.length - header.seed_length + 1; midI < sa_head.length; midI++){
00587                 mid_mer.position = midI;
00588                 mid_mer.mer = GetMer(midI);
00589                 middle_mers.push_back(mid_mer);
00590         }
00591         sort(middle_mers.begin(), middle_mers.end(), &bmer_lessthan);
00592         //put a special mer at the end which will never go into the sorted mer list
00593         //since every possible mer is less than it.
00594         mid_mer.mer = 0xFFFFFFFF;
00595         mid_mer.mer <<= 32;
00596         mid_mer.mer |= 0xFFFFFFFF;
00597         mid_mer.position = GNSEQI_END;
00598         middle_mers.push_back(mid_mer);
00599         //merge and write the sorted mer lists
00600         vector<bmer> array1, array2;
00601         uint32 SAR_BUFFER_SIZE = SEQ_BUFFER_SIZE/2;  //actual size is this number * 13 bytes
00602         uint32 k=0, l=0, midI=0;
00603         uint32 m = 0, n = 0;
00604         gnSeqI bufferI=0;
00605         do{
00606                 //mergesort them
00607                 while(m < array1.size() && n < array2.size()){
00608                         if(array1[m].mer <= array2[n].mer){
00609                                 if(array1[m].mer <= middle_mers[midI].mer){
00610                                         seq_buf.data[bufferI] = array1[m].position;
00611                                         m++;
00612                                         bufferI++;
00613                                 }else{
00614                                         seq_buf.data[bufferI] = middle_mers[midI].position;
00615                                         midI++;
00616                                         bufferI++;
00617                                 }
00618                         }else if(array2[n].mer <= middle_mers[midI].mer){
00619                                 seq_buf.data[bufferI] = array2[n].position + sa_head.length;
00620                                 n++;
00621                                 bufferI++;
00622                         }else{
00623                                 seq_buf.data[bufferI] = middle_mers[midI].position;
00624                                 midI++;
00625                                 bufferI++;
00626                         }
00627                         if(bufferI == SEQ_BUFFER_SIZE){
00628                                 sarfile.write((char*)seq_buf.data, bufferI * sizeof(uint32));
00629                                 bufferI = 0;
00630                         }
00631                 }
00632                 if(m == array1.size()){
00633                         sa.Read(array1, SAR_BUFFER_SIZE, k);
00634                         k += array1.size();
00635                         m = 0;
00636                 }
00637                 if(n == array2.size()){
00638                         sa2.Read(array2, SAR_BUFFER_SIZE, l);
00639                         l += array2.size();
00640                         n = 0;
00641                 }
00642         }while(array1.size() != 0 && array2.size() != 0);
00643         if(bufferI > 0)
00644                 sarfile.write((char*)seq_buf.data, (bufferI)*sizeof(uint32));
00645         //consolidate the remaining mers to a known vector
00646         vector<bmer> remaining_mers;
00647         for(;m < array1.size(); m++)
00648                 remaining_mers.push_back(array1[m]);
00649         for(;n < array2.size(); n++){
00650                 remaining_mers.push_back(array2[n]);
00651                 remaining_mers[remaining_mers.size()-1].position += sa_head.length;
00652         }
00653         for(;midI < middle_mers.size() - 1; midI++)
00654                 remaining_mers.push_back(middle_mers[midI]);
00655         //merge them with the remaining middle_mers
00656         sort(remaining_mers.begin(), remaining_mers.end(), &bmer_lessthan);
00657         uint32 remI = 0;
00658         for(;remI < remaining_mers.size(); remI++)
00659                 seq_buf.data[remI] = remaining_mers[remI].position;
00660         if(remI > 0)
00661                 sarfile.write((char*)seq_buf.data, (remI)*sizeof(uint32));
00662 
00663         if(!sarfile.good()){
00664                 sarfile.clear();
00665                 sarfile.close();
00666                 sarfile.open(filename.c_str(), ios::binary | ios::in );
00667                 Throw_gnExMsg(IOStreamFailed(), "Error writing position array.");
00668         }
00669         // reopen the sorted mer list file read-only
00670         sarfile.close();
00671         sarfile.open(filename.c_str(), ios::binary | ios::in );
00672         if(!sarfile.is_open()){
00673                 sarfile.clear();
00674                 Throw_gnExMsg(FileNotOpened(), "Error opening sorted mer list file.");
00675         }
00676 STACK_TRACE_END
00677 }
00678 
00679 }

Generated on Fri Mar 14 06:01:02 2008 for libMems by doxygen 1.3.6