src/gnFASSource.cpp

Go to the documentation of this file.
00001 
00002 // File:            gnFASSource.h
00003 // Purpose:         Implements gnBaseSource for .FAS files
00004 // Description:     
00005 // Changes:        
00006 // Version:         libGenome 0.5.1 
00007 // Author:          Aaron Darling 
00008 // Modified by:     
00009 // Copyright:       (c) Aaron Darling 
00010 // Licenses:        See COPYING file for details
00012 #include "gn/gnFASSource.h"
00013 #include "gn/gnBaseSpec.h"
00014 #include "gn/gnFilter.h"
00015 #include "gn/gnStringTools.h"
00016 #include "gn/gnSourceSpec.h"
00017 #include "gn/gnSourceHeader.h"
00018 #include "gn/gnDebug.h"
00019 
00020 gnFASSource::gnFASSource()
00021 {
00022         m_openString = "";
00023         m_pFilter = gnFilter::proteinSeqFilter();
00024         if(m_pFilter == NULL){
00025                 DebugMsg("Error using static sequence filters.");
00026         }
00027 }
00028 gnFASSource::gnFASSource( const gnFASSource& s ) : gnFileSource(s)
00029 {
00030         vector< gnFileContig* >::const_iterator iter = s.m_contigList.begin();
00031         for( ; iter != s.m_contigList.end(); ++iter )
00032         {
00033                 m_contigList.push_back( (*iter)->Clone() );
00034         }
00035 }
00036 gnFASSource::~gnFASSource()
00037 {
00038         m_ifstream.close();
00039         vector< gnFileContig* >::iterator iter = m_contigList.begin();
00040         for( ; iter != m_contigList.end(); ++iter )
00041         {
00042                 gnFileContig* fg = *iter;
00043                 *iter = 0;
00044                 delete fg;
00045         }
00046 }
00047 boolean gnFASSource::HasContig( const string& nameStr ) const
00048 {
00049         for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00050         {
00051                 if( nameStr == m_contigList[i]->GetName() )
00052                         return true;
00053         }
00054         return false;
00055 }
00056 uint32 gnFASSource::GetContigID( const string& name ) const
00057 {
00058         for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00059         {
00060                 if( name == m_contigList[i]->GetName() )
00061                         return i;
00062         }
00063         return ALL_CONTIGS;
00064 }
00065 string gnFASSource::GetContigName( const uint32 i ) const
00066 {
00067         if( i < m_contigList.size() ){
00068                 return m_contigList[i]->GetName();
00069         }
00070         return "";
00071 }
00072 gnSeqI gnFASSource::GetContigSeqLength( const uint32 i ) const
00073 {
00074         if( i < m_contigList.size() ){
00075                 return m_contigList[i]->GetSeqLength();
00076         }else if( i == ALL_CONTIGS){
00077                 gnSeqI seqlen = 0;
00078                 for(uint32 j=0; j < m_contigList.size(); j++)
00079                         seqlen += m_contigList[j]->GetSeqLength();
00080                 return seqlen;
00081         }
00082         return GNSEQI_ERROR;
00083 }
00084 gnFileContig* gnFASSource::GetContig( const uint32 i ) const
00085 {
00086         if( i < m_contigList.size() ){
00087                 return m_contigList[i];
00088         }
00089         return 0;
00090 }
00091 
00092 boolean gnFASSource::SeqRead( const gnSeqI start, char* buf, gnSeqI& bufLen, const uint32 contigI ) 
00093 {
00094         m_ifstream.clear();
00095         uint32 contigIndex = contigI;
00096         uint64 startPos = 0;
00097         uint64 readableBytes = 0;
00098         if( !SeqSeek( start, contigIndex, startPos, readableBytes ) ){
00099                 bufLen = 0;
00100                 return false;
00101         }
00102         
00103         if( contigI == ALL_CONTIGS ){
00104                 uint32 curLen = 0;
00105                 uint64 bytesRead = 0;
00106                 while (curLen < bufLen){
00107 //SeqSeek to start, Figure out how much can be read before SeqSeeking again.
00108                         if(readableBytes <= 0)  //Look out for zero length contigs!  IMPLEMENT ME
00109                                 if( !SeqSeek( start + curLen, contigIndex, startPos, readableBytes ) ){
00110                                         bufLen = curLen;
00111                                         return true;
00112                                 }
00113                         //readLen is the amount to read on this pass
00114                         uint64 readLen = (bufLen - curLen) < readableBytes ? (bufLen - curLen) : readableBytes;
00115                         Array<gnSeqC> array_buf( readLen );
00116                         gnSeqC* tmpBuf = array_buf.data;        //read into tmpBuf, then filter tmpBuf into curBuf
00117 
00118                         // read chars and filter
00119                         m_ifstream.read(tmpBuf, readLen);
00120                         uint64 gc = m_ifstream.gcount();
00121                         bytesRead += gc;
00122                         readableBytes -= gc;
00123 //                      cout << "Read " << gc << " chars from " << m_openString << "\n";
00124 //                      cout << "Checking character validity on: " << tmpBuf << "\n";
00125                         for(uint32 i=0; i < gc; i++){
00126                                 if( m_pFilter->IsValid(tmpBuf[i]) ){
00127                                         buf[curLen] = tmpBuf[i];
00128                                         curLen++;
00129                                 }
00130                         }
00131                         if(m_ifstream.eof()){   //we hit the end of the file.  bail out.
00132                                 m_ifstream.clear();
00133                                 bufLen = curLen;
00134                                 return true;
00135                         }
00136                 }
00137                 bufLen = curLen;
00138         }
00139         else if( contigI < m_contigList.size() )
00140         {
00141                 uint32 curLen = 0;
00142                 //check to see if the buffer is bigger than the contig.  if so truncate it.
00143                 gnSeqI contigSize = m_contigList[contigI]->GetSeqLength();
00144                 bufLen = bufLen < contigSize ? bufLen : contigSize;
00145                 while (curLen < bufLen)
00146                 {
00147                         uint64 readLen = bufLen - curLen;       //the amount to read on this pass
00148                         Array<gnSeqC> array_buf( readLen );     // use Array template to ensure proper deallocation
00149                         gnSeqC* tmpBuf = array_buf.data;        //read into tmpBuf, then filter tmpBuf into curBuf
00150 
00151                         // read chars and filter
00152                         m_ifstream.read(tmpBuf, readLen);
00153                         uint64 gc = m_ifstream.gcount();
00154 //                      cout << "Read " << gc << " chars from " << m_openString << "\n";
00155 //                      cout << "Checking character validity on: " << tmpBuf << "\n";
00156                         for(uint32 i=0; i < gc; i++){
00157                                 if( m_pFilter->IsValid(tmpBuf[i]) ){
00158                                         buf[curLen] = tmpBuf[i];
00159                                         curLen++;
00160                                 }
00161                         }
00162                         if(m_ifstream.eof()){   //we hit the end of the file.  bail out.
00163                                 m_ifstream.clear();
00164                                 bufLen = curLen;
00165                                 return true;
00166                         }
00167                 }
00168                 bufLen = curLen;
00169         }
00170         return true;
00171 }
00172 
00173 
00174 // private:
00175 // figures out which contig the sequence starts at then calls SeqStartPos to get the offset within that contig
00176 // returns startPos, the file offset where the sequence starts
00177 // returns true if successful, false otherwise
00178 boolean gnFASSource::SeqSeek( const gnSeqI start, const uint32 contigI, uint64& startPos, uint64& readableBytes )
00179 {
00180         if( contigI == ALL_CONTIGS ){
00181                 // find first contig
00182                 gnSeqI curIndex = 0;
00183                 vector< gnFileContig* >::iterator iter = m_contigList.begin();
00184                 for( ; iter != m_contigList.end(); ++iter )
00185                 {
00186                         uint64 len = (*iter)->GetSeqLength();
00187                         if( (curIndex + len) > start )
00188                                 break;
00189                         curIndex += len;
00190                 }
00191                 if( iter == m_contigList.end() )
00192                         return false;
00193                 // seek to start
00194                 gnSeqI startIndex = start - curIndex;  //startIndex is starting pos. within the contig
00195                 return SeqStartPos( startIndex, *(*iter), startPos, readableBytes );
00196         }
00197         else if( contigI < m_contigList.size() )
00198         {
00199                 return SeqStartPos( start, *(m_contigList[contigI]), startPos, readableBytes );
00200         }
00201         return false;
00202 }
00203 
00204 //Returns startPos, the file offset where the sequence starts.
00205 boolean gnFASSource::SeqStartPos( const gnSeqI start, gnFileContig& contig, uint64& startPos, uint64& readableBytes ){
00206         readableBytes = 0;
00207         uint32 curLen = 0;
00208         //seek to the file offset where the contig starts
00209         startPos = contig.GetSectStartEnd(gnContigSequence).first;      //set startPos to start where the contig starts
00210 
00211         if(contig.HasRepeatSeqGap())
00212                 if(contig.GetRepeatSeqGapSize().first > 0)
00213                         if(contig.GetRepeatSeqGapSize().second > 0){
00214 //check this    
00215                                 startPos += start + (start/contig.GetRepeatSeqGapSize().first)*contig.GetRepeatSeqGapSize().second;
00216                                 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00217                                 m_ifstream.seekg( startPos, ios::beg );
00218                                 return true;
00219                         }
00220         m_ifstream.seekg( startPos, ios::beg );
00221         if( m_ifstream.eof() ){
00222                 ErrorMsg("ERROR in gnFASSource::Incorrect contig start position, End of file reached!\n");
00223                 return false;
00224         }
00225         while( true ){
00226                   // READ the rest of the contig skipping over invalid characters until we get to the starting base pair.
00227                   // startPos will contain the file offset with the starting base pair
00228                 uint32 tmpbufsize = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00229                 if(tmpbufsize == 0){
00230                         ErrorMsg("ERROR in gnFASSource: stored contig size is incorrect.\n");
00231                         return false;
00232                 }
00233                 tmpbufsize = tmpbufsize < BUFFER_SIZE ? tmpbufsize : BUFFER_SIZE;  //read in the smaller of the two.
00234                 Array<char> array_buf( tmpbufsize );    // use Array template to ensure proper deallocation
00235                 char *tmpbuf = array_buf.data;
00236                 m_ifstream.read( tmpbuf, tmpbufsize );
00237                 if( m_ifstream.eof() ){
00238                         ErrorMsg("ERROR in gnFASSource::Read End of file reached!\n");
00239                         return false;
00240                 }
00241                 for( uint32 i=0; i < tmpbufsize; ++i ){
00242                         if( m_pFilter->IsValid(tmpbuf[i]) ){
00243                                 if( curLen >= start ){ //stop when we reach the starting offset within this contig
00244                                         startPos += i;
00245                                         m_ifstream.seekg( startPos, ios::beg );  //seek to startPos
00246                                         readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00247                                         return true;
00248                                 }
00249                                 ++curLen;  //each time we read a valid b.p., increment the sequence length
00250                         }
00251                 }
00252                 startPos += tmpbufsize;
00253         }
00254         return true;
00255 }
00256 
00257 //write out the given source in this file format.
00258 boolean gnFASSource::Write(gnBaseSource *source, const string& filename){
00259         ofstream m_ofstream(filename.c_str(), ios::out | ios::binary);
00260         if(!m_ofstream.is_open())
00261                 return false;
00262         uint32 contigCount = source->GetContigListLength();
00263         for(uint32 contigI = 0; contigI < contigCount; contigI++){
00264                 string contigName = source->GetContigName(contigI);
00265                 m_ofstream << ">" << contigName << ";\n";
00266                 gnSeqI seqLength = source->GetContigSeqLength(contigI);
00267                 while(seqLength > 0){
00268                         gnSeqI writeLen = seqLength < BUFFER_SIZE ? seqLength : BUFFER_SIZE;            
00269                         Array<gnSeqC> array_buf( writeLen+1 );  // use Array template to ensure proper deallocation
00270                         gnSeqC *bases = array_buf.data;
00271                         boolean success = source->SeqRead(0, bases, writeLen, contigI);
00272                         if(!success)
00273                                 return false;
00274                         bases[writeLen] = '\0';
00275                         m_ofstream << bases << "\n";
00276                         seqLength -= writeLen;
00277                 }
00278         }
00279         m_ofstream.close();
00280         return true;
00281 }
00282 
00283 //write out the given sequence in this file format.
00284 void gnFASSource::Write(gnSequence& seq, const string& filename, boolean write_coords, boolean enforce_unique_names){
00285         ofstream m_ofstream(filename.c_str(), ios::out | ios::binary);
00286         if(!m_ofstream.is_open())
00287                 Throw_gnEx(FileNotOpened());
00288         Write(seq, m_ofstream, write_coords, enforce_unique_names);
00289         m_ofstream.close();
00290 }
00291 
00292 void gnFASSource::Write(gnSequence& seq, ostream& m_ostream, boolean write_coords, boolean enforce_unique_names){
00293         vector<string> contigNameList;
00294         Array<gnSeqC> array_buf( BUFFER_SIZE ); // use Array template to ensure proper deallocation
00295         gnSeqC *bases = array_buf.data;
00296         gnGenomeSpec* spec = seq.GetSpec();
00297         
00298         //set the newline type.
00299         string newline = "\r\n";
00300         gnSeqI readOffset = 1;
00301 
00302         for(uint32 fragI = 0; fragI < seq.contigListLength(); fragI++){
00303                 //write out contig name and header
00304                 string contigName = seq.contigName(fragI);
00305 
00306                 if(enforce_unique_names){
00307                         uint32 name_count = 0;
00308                         for(uint32 i=0; i < contigNameList.size(); i++)
00309                                 if(contigNameList[i] == contigName)
00310                                         name_count++;
00311                         contigNameList.push_back(contigName);
00312                         if(name_count > 0)
00313                                 contigName += "_" + uintToString(name_count);
00314                 }
00315 
00316                 gnFragmentSpec* subSpec = spec->GetSpec(fragI);
00317                 gnBaseHeader *gpbh = subSpec->GetHeader(0);
00318                 string header = "";
00319                 if(gpbh != NULL){
00320                         header = gpbh->GetHeader();
00321                         //delete everything after the first newline.
00322                         uint32 newlinepos = header.find_first_of('\n', 0);
00323                         if(newlinepos != string::npos){
00324                                 if(header[newlinepos-1] == '\r')
00325                                         newlinepos--;
00326                                 header = header.substr(0, newlinepos);
00327                         }
00328                 }       //IMPLEMENT ME!! Does header line need to be < 80 chars?
00329 
00330                 //write out the sequence
00331                 gnSeqI readLength = seq.contigLength(fragI);
00332                 m_ostream << ">" << contigName;
00333                 if(write_coords)
00334                         m_ostream << " " << "(" << readOffset << ", " << readOffset + readLength - 1 << ")";
00335                 m_ostream << "; " << header << newline;
00336 
00337                 gnSeqI linePos = 0;
00338                 while(readLength > 0){  //buffer the read/writes
00339                         gnSeqI read_chunk_size = (BUFFER_SIZE / FAS_LINE_WIDTH) * FAS_LINE_WIDTH;
00340                         gnSeqI writeLen = readLength < read_chunk_size ? readLength : read_chunk_size;
00341                         
00342                         if(!seq.ToArray(bases, writeLen, readOffset))
00343                                 return;
00344                         for(gnSeqI curbase = 0; curbase < writeLen; curbase += FAS_LINE_WIDTH){
00345                                 gnSeqI writeout_size = writeLen - curbase < FAS_LINE_WIDTH ? writeLen - curbase : FAS_LINE_WIDTH;
00346                                 m_ostream << string(bases + curbase, writeout_size) << newline;
00347                         }
00348                         readLength -= writeLen;
00349                         readOffset += writeLen;
00350                 }
00351                 if(linePos != 0)
00352                         m_ostream << newline;
00353         }
00354 
00355 }
00356 
00357 gnGenomeSpec *gnFASSource::GetSpec() const{
00358         gnGenomeSpec *spec = new gnGenomeSpec();
00359         for(uint32 i=0; i < m_contigList.size(); i++){
00360                 //create specs for the fragment and contig levels
00361                 gnFragmentSpec *fragmentSpec = new gnFragmentSpec();
00362                 gnSourceSpec *contigSpec = new gnSourceSpec((gnBaseSource*)this, i);
00363                 //set up the spec tree structure
00364                 spec->AddSpec(fragmentSpec, i);
00365                 fragmentSpec->AddSpec(contigSpec);
00366 
00367                 fragmentSpec->SetName(m_contigList[i]->GetName());
00368                 fragmentSpec->SetSourceName(m_openString);
00369                 contigSpec->SetName(m_contigList[i]->GetName());
00370                 contigSpec->SetSourceName(m_openString);
00371                 
00372                 pair<uint32, uint32> headsect = m_contigList[i]->GetSectStartEnd(gnContigHeader);
00373                 if(headsect.first != headsect.second){
00374                         gnSourceHeader *gpsh = new gnSourceHeader((gnBaseSource *)this, string(""), headsect.first, headsect.second - headsect.first);
00375                         fragmentSpec->AddHeader(gpsh, 0);
00376                 }
00377         }
00378         return spec;
00379 }
00380 
00381 gnFileContig* gnFASSource::GetFileContig( const uint32 contigI ) const{
00382         if(m_contigList.size() > contigI)
00383                 return m_contigList[contigI];
00384         return NULL;
00385 }
00386 
00387 boolean gnFASSource::ParseStream( istream& fin )
00388 {
00389         // INIT temp varables
00390         uint32 readState = 0;
00391         gnFileContig* currentContig = 0;
00392         string nameFStr;
00393         uint64 seqLength = 0, gapLength = 0;
00394         // INIT buffer
00395         uint64 streamPos = 0;
00396         uint64 bufReadLen = 0;
00397         Array<gnSeqC> array_buf( BUFFER_SIZE ); // use Array template to ensure proper deallocation
00398         char* buf = array_buf.data;
00399         boolean paren_hit = false;
00400         uint32 repeatSeqSize = 0;
00401         boolean corrupt_msg = false;
00402 
00403         //decide what type of newlines we have
00404         DetermineNewlineType();
00405 
00406         
00407         while( !fin.eof() )
00408         {
00409                   // read chars
00410                 fin.read( buf, BUFFER_SIZE);
00411                 streamPos += bufReadLen;
00412                 bufReadLen = fin.gcount();
00413                 for( uint32 i=0 ; i < bufReadLen ; i++ )
00414                 {
00415                         char ch = buf[i];
00416                         switch( readState )
00417                         {
00418                                 case 0: // CHECK file validity
00419                                         if( !((buf[0] == '>') || !m_pFilter->IsValid(buf[0])) )
00420                                         {
00421                                                 return false; // NOT a .fas file
00422                                         }
00423                                         readState = 1;
00424                                 case 1: // BRANCH
00425                                         if( ch == '>' )
00426                                         {
00427                                                 seqLength = 0; gapLength = 0; // reset count
00428                                                 currentContig = new gnFileContig();
00429                                                 currentContig->SetFileStart( streamPos + i );
00430                                                 currentContig->SetRepeatSeqGap(true);
00431                                                 currentContig->SetRepeatSeqSize( repeatSeqSize );
00432                                                 currentContig->SetRepeatGapSize( m_newlineSize );
00433                                                 readState = 2;
00434                                                 paren_hit = false;
00435                                                 nameFStr = "";
00436                                         }
00437                                         else
00438                                                 ++gapLength;
00439                                         break;
00440                                 case 2: // > CONTIG NAME
00441                                         if( isNewLine(ch) || ch == ';')
00442                                         {
00443                                                 currentContig->SetName( nameFStr );
00444                                                 currentContig->SetSectStart( gnContigHeader, streamPos + i + 1 );
00445                                                 if( ch == ';' )
00446                                                         readState = 3;
00447                                                 else{
00448                                                         readState = 4;
00449                                                         if(ch == '\r')
00450                                                                 currentContig->SetSectStart( gnContigHeader, streamPos + i + 2 );
00451                                                 }
00452                                         }
00453                                         else if( ch == '(' ){
00454                                                 if(isSpace(buf[i-1])){
00455                                                         //delete the last char in nameFStr
00456                                                         nameFStr = nameFStr.substr(0, nameFStr.length()-1);
00457                                                 }
00458                                                 paren_hit = true;
00459                                         }else if((!isSpace(ch) || nameFStr.length()) && !paren_hit )
00460                                                 nameFStr += ch;
00461                                         break;
00462                                 case 3: // >... ; REMARK
00463                                         if( isNewLine(ch) )
00464                                                 readState = 4;
00465                                         break;
00466                                 case 4: // >... ; /n NEWLINE BRANCH
00467                                         if( ch == '>' )
00468                                         {
00469                                                 readState = 3;
00470                                         }
00471                                         else if( m_pFilter->IsValid(ch) )
00472                                         {
00473                                                 currentContig->SetSectEnd( gnContigHeader, streamPos + i );
00474                                                 currentContig->SetSectStart( gnContigSequence, streamPos + i);
00475                                                 seqLength = 1; gapLength = 0;
00476                                                 readState = 5;
00477                                         }
00478                                         break;
00479                                 case 5: // SEQUENCE
00480                                         while(i < bufReadLen){
00481                                                 ch = buf[i];
00482                                                 if( m_pFilter->IsValid(ch) ){
00483                                                         if(gapLength > 0){
00484                                                                 if(seqLength != repeatSeqSize){
00485                                                                         if( !corrupt_msg ){
00486                                                                                 corrupt_msg = true;
00487                                                                                 ErrorMsg( "Sequence file appears corrupt, proceeding with caution\n" );
00488                                                                         }
00489                                                                         currentContig->SetRepeatSeqGap(false);
00490                                                                 }if(gapLength != m_newlineSize){
00491                                                                         //file is corrupt, can't do jumps.
00492                                                                         if( !corrupt_msg ){
00493                                                                                 corrupt_msg = true;
00494                                                                                 ErrorMsg( "Sequence file appears corrupt, proceeding with caution\n" );
00495                                                                         }
00496                                                                         currentContig->SetRepeatSeqGap(false);
00497                                                                 }
00498                                                                 currentContig->AddToSeqLength( seqLength );
00499                                                                 seqLength = 0;
00500                                                                 gapLength = 0;
00501                                                         }
00502                                                         seqLength++;
00503                                                 }else if( ch == '>'){
00504                                                         currentContig->AddToSeqLength( seqLength );
00505                                                         currentContig->SetSectEnd( gnContigSequence, streamPos + i - 1);
00506                                                         currentContig->SetFileEnd( streamPos + i - 1 );
00507                                                         m_contigList.push_back(currentContig);
00508                                                         readState = 1;
00509                                                         i--;
00510                                                         break;
00511                                                 }
00512                                                 else if( isNewLine(ch) ){
00513                                                         if( repeatSeqSize == 0 ){
00514                                                                 repeatSeqSize = seqLength;
00515                                                                 currentContig->SetRepeatSeqSize( repeatSeqSize );
00516                                                         }
00517                                                         gapLength++;
00518                                                 }
00519                                                 else{
00520                                                         // Error cannot do nice jumps
00521                                                         currentContig->SetRepeatSeqGap(false);
00522                                                         if( !corrupt_msg ){
00523                                                                 corrupt_msg = true;
00524                                                                 ErrorMsg( "Sequence file appears corrupt, proceeding with caution\n" );
00525                                                         }
00526                                                 }
00527                                                 i++;
00528                                         }
00529                                         break;
00530                                 default:
00531                                         ErrorMsg("ERROR");
00532                                         return false;
00533                                         break;
00534                         }
00535                 }// for all buf
00536         }// while !eof
00537         // CLEAN UP
00538         if( currentContig != 0 )
00539         {
00540                 if( readState == 2 )
00541                 {
00542                         currentContig->SetName( nameFStr );
00543                 }
00544                 if( (readState >= 2) && (readState < 5) )
00545                 {
00546                         currentContig->SetSectEnd( gnContigHeader, streamPos + bufReadLen );
00547                 }
00548                 else if( readState ==  5 )
00549                 {
00550                         currentContig->AddToSeqLength( seqLength );
00551                         currentContig->SetSectEnd( gnContigSequence, streamPos + bufReadLen );
00552                 }                       
00553                 currentContig->SetFileEnd( streamPos + bufReadLen );
00554                 m_contigList.push_back(currentContig);
00555         }
00556         m_ifstream.clear();
00557         return true;
00558 }

Generated on Mon Mar 28 06:00:21 2005 for libGenome by doxygen 1.3.6