src/gnSEQSource.cpp

Go to the documentation of this file.
00001 
00002 // File:            gnSEQSource.h
00003 // Purpose:         Implements gnBaseSource for .SEQ files
00004 // Description:     
00005 // Changes:        
00006 // Version:         libGenome 0.5.1 
00007 // Author:          Aaron Darling 
00008 // Modified by:     
00009 // Copyright:       (c) Aaron Darling 
00010 // Licenses:        See COPYING file for details
00012 #include "gn/gnFilter.h"
00013 #include "gn/gnFeature.h"
00014 #include "gn/gnSEQSource.h"
00015 #include "gn/gnGBKSource.h"
00016 #include "gn/gnSourceSpec.h"
00017 #include "gn/gnSourceHeader.h"
00018 #include "gn/gnSourceQualifier.h"
00019 #include "gn/gnLocation.h"
00020 #include "gn/gnStringTools.h"
00021 #include "gn/gnDebug.h"
00022 
00023 gnSEQSource::gnSEQSource()
00024 {
00025         m_openString = "";
00026         m_pFilter = gnFilter::proteinSeqFilter();
00027         if(m_pFilter == NULL){
00028                 DebugMsg("Error using static sequence filters.\n");
00029         }
00030 }
00031 gnSEQSource::gnSEQSource( const gnSEQSource& s ) : gnFileSource(s)
00032 {
00033         vector< gnFileContig* >::const_iterator iter = s.m_contigList.begin();
00034         for( ; iter != s.m_contigList.end(); ++iter )
00035         {
00036                 m_contigList.push_back( (*iter)->Clone() );
00037         }
00038 }
00039 gnSEQSource::~gnSEQSource()
00040 {
00041         m_ifstream.close();
00042         vector< gnFileContig* >::iterator iter = m_contigList.begin();
00043         for( ; iter != m_contigList.end(); ++iter )
00044         {
00045                 gnFileContig* fg = *iter;
00046                 *iter = 0;
00047                 delete fg;
00048         }
00049 }
00050 boolean gnSEQSource::HasContig( const string& name ) const
00051 {
00052         for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00053         {
00054                 if( name == m_contigList[i]->GetName() )
00055                         return true;
00056         }
00057         return false;
00058 }
00059 uint32 gnSEQSource::GetContigID( const string& name ) const
00060 {
00061         for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00062         {
00063                 if( name == m_contigList[i]->GetName() )
00064                         return i;
00065         }
00066         return ALL_CONTIGS;
00067 }
00068 string gnSEQSource::GetContigName( const uint32 i ) const
00069 {
00070         if( i < m_contigList.size() )
00071         {
00072                 return m_contigList[i]->GetName();
00073         }
00074         return "";
00075 }
00076 gnSeqI gnSEQSource::GetContigSeqLength( const uint32 i ) const
00077 {
00078         if( i == ALL_CONTIGS)
00079                 return m_spec->GetLength();
00080         if( i < m_contigList.size() )
00081         {
00082                 return m_contigList[i]->GetSeqLength();
00083         }
00084         return GNSEQI_ERROR;
00085 }
00086 
00087 boolean gnSEQSource::SeqRead( const gnSeqI start, char* buf, gnSeqI& bufLen, const uint32 contigI ){
00088         uint64 startPos = 0;
00089         uint64 readableBytes = 0;
00090         if( !SeqSeek( start, contigI, startPos, readableBytes ) )
00091         {
00092                 bufLen = 0;
00093                 return false;
00094         }
00095         
00096         if( contigI == ALL_CONTIGS )
00097         {
00098                 uint32 curLen = 0;
00099                 uint64 bytesRead = 0;
00100                 while (curLen < bufLen)
00101                 {
00102 //SeqSeek to start, Figure out how much can be read before SeqSeeking again.
00103                         if(readableBytes <= 0)  //Look out for zero length contigs!  IMPLEMENT ME
00104                                 if( !SeqSeek( start + curLen, contigI, startPos, readableBytes ) ){
00105                                         bufLen = curLen;
00106                                         return true;
00107                                 }
00108                         //readLen is the amount to read on this pass
00109                         uint64 readLen = (bufLen - curLen) < readableBytes ? (bufLen - curLen) : readableBytes; 
00110                         gnSeqC* tmpBuf = new gnSeqC[readLen];   //read into tmpBuf, then filter tmpBuf into curBuf
00111 
00112                         // read chars and filter
00113                         m_ifstream.read(tmpBuf, readLen);
00114                         uint64 gc = m_ifstream.gcount();
00115                         bytesRead += gc;
00116                         readableBytes -= gc;
00117                         for(uint32 i=0; i < gc; i++){
00118                                 if( m_pFilter->IsValid(tmpBuf[i]) ){
00119                                         buf[curLen] = tmpBuf[i];
00120                                         curLen++;
00121                                 }
00122                         }
00123                         delete[] tmpBuf;
00124                         if(m_ifstream.eof()){   //we hit the end of the file.  bail out.
00125                                 m_ifstream.clear();
00126                                 bufLen = curLen;
00127                                 return true;
00128                         }
00129                 }
00130                 bufLen = curLen;
00131         }
00132         else if( contigI < m_contigList.size() )
00133         {
00134                 uint32 curLen = 0;
00135                 //check to see if the buffer is bigger than the contig.  if so truncate it.
00136                 gnSeqI contigSize = m_contigList[contigI]->GetSeqLength();
00137                 bufLen = bufLen < contigSize ? bufLen : contigSize;
00138                 while (curLen < bufLen)
00139                 {
00140                         uint64 readLen = bufLen - curLen;       //the amount to read on this pass
00141                         gnSeqC* tmpBuf = new gnSeqC[readLen];   //read into tmpBuf, then filter tmpBuf into curBuf
00142 
00143                         // read chars and filter
00144                         m_ifstream.read(tmpBuf, readLen);
00145                         uint64 gc = m_ifstream.gcount();
00146 //                      cout << "Read " << gc << " chars from " << m_openString << "\n";
00147 //                      cout << "Checking character validity on: " << tmpBuf << "\n";
00148                         for(uint32 i=0; i < gc; i++){
00149                                 if( m_pFilter->IsValid(tmpBuf[i]) ){
00150                                         buf[curLen] = tmpBuf[i];
00151                                         curLen++;
00152                                 }
00153                         }
00154                         if(m_ifstream.eof()){   //we hit the end of the file.  bail out.
00155                                 m_ifstream.clear();
00156                                 bufLen = curLen;
00157                                 return true;
00158                         }
00159                         delete[] tmpBuf;
00160                 }
00161                 bufLen = curLen;
00162         }
00163         return true;
00164 
00165 }
00166 // private:
00167 // figures out which contig the sequence starts at then calls SeqStartPos to get the offset within that contig
00168 // returns startPos, the file offset where the sequence starts
00169 // returns true if successful, false otherwise
00170 boolean gnSEQSource::SeqSeek( const gnSeqI start, const uint32& contigI, uint64& startPos, uint64& readableBytes )
00171 {
00172         if( contigI == ALL_CONTIGS )
00173         {
00174                 // find first contig
00175                 gnSeqI curIndex = 0;
00176                 vector< gnFileContig* >::iterator iter = m_contigList.begin();
00177                 for( ; iter != m_contigList.end(); ++iter )
00178                 {
00179                         uint64 len = (*iter)->GetSeqLength();
00180                         if( (curIndex + len) > start )
00181                                 break;
00182                         curIndex += len;
00183                 }
00184                 if( iter == m_contigList.end() )
00185                         return false;
00186                 // seek to start
00187                 gnSeqI startIndex = start - curIndex;  //startIndex is starting pos. within the contig
00188                 return SeqStartPos( startIndex, *(*iter), startPos, readableBytes );
00189         }
00190         else if( contigI < m_contigList.size() )
00191         {
00192                 return SeqStartPos( start, *(m_contigList[contigI]), startPos, readableBytes );
00193         }
00194         return false;
00195 }
00196 //Returns startPos, the file offset where the sequence starts.
00197 boolean gnSEQSource::SeqStartPos( const gnSeqI start, gnFileContig& contig, uint64& startPos, uint64& readableBytes )
00198 {
00199         readableBytes = 0;
00200         uint32 curLen = 0;
00201         //seek to the file offset where the contig starts
00202         startPos = contig.GetSectStartEnd(gnContigSequence).first;      //set startPos to start where the contig starts
00203         m_ifstream.seekg( startPos, ios::beg );
00204         if( m_ifstream.eof() ){
00205                 DebugMsg("ERROR in gnSEQSource::Incorrect contig start position, End of file reached!\n");
00206                 return false;
00207         }
00208         while( true )
00209         {
00210                   // READ the rest of the contig skipping over invalid characters until we get to the starting base pair.
00211                   // startPos will contain the file offset with the starting base pair
00212                 uint32 tmpbufsize = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00213                 if(tmpbufsize == 0){
00214                         DebugMsg("ERROR in gnSEQSource: stored contig size is incorrect.");
00215                         return false;
00216                 }
00217                 uint64 startOffset = start;
00218                 if(contig.HasRepeatSeqGap()){
00219                         if(contig.GetRepeatSeqGapSize().first > 0){
00220                                 if(contig.GetRepeatSeqGapSize().second > 0){
00221                                         startOffset += (start*contig.GetRepeatSeqGapSize().second)/contig.GetRepeatSeqGapSize().first;
00222                                         startPos+=startOffset;
00223                                         m_ifstream.seekg(startPos , ios::beg);
00224                                         readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00225                                         return true;
00226                                 }
00227                         }else{
00228                                 startPos+=start;
00229                                 m_ifstream.seekg(startPos , ios::beg);
00230                                 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00231                                 return true;
00232                         }
00233                 }
00234                 tmpbufsize = tmpbufsize < BUFFER_SIZE ? tmpbufsize : BUFFER_SIZE;  //read in the smaller of the two.
00235                 char *tmpbuf = new char[tmpbufsize];
00236                 m_ifstream.read( tmpbuf, tmpbufsize );
00237                 if( m_ifstream.eof() ){
00238                         ErrorMsg("ERROR in gnSEQSource::Read End of file reached!\n");
00239                         delete[] tmpbuf;
00240                         return false;
00241                 }
00242                 for( uint32 i=0; i < tmpbufsize; ++i ){
00243                         if( m_pFilter->IsValid(tmpbuf[i]) ){
00244                                 if( curLen >= start ){ //stop when we reach the starting offset within this contig
00245                                         startPos += i;
00246                                         m_ifstream.seekg( startPos, ios::beg );  //seek to startPos
00247                                         readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00248                                         delete[] tmpbuf;
00249                                         return true;
00250                                 }
00251                                 ++curLen;  //each time we read a valid b.p., increment the sequence length
00252                         }
00253                 }
00254                 startPos += tmpbufsize;
00255                 delete[] tmpbuf;
00256         }
00257         return true;
00258 }
00259 
00260 
00261 //IMPLEMENT ME!  move these static methods somewhere else!  especially basecount!
00262 void gnSEQSource::BaseCount(const string& bases, gnSeqI& a_count, gnSeqI& c_count, gnSeqI& g_count, gnSeqI& t_count, gnSeqI& other_count){
00263         a_count = 0;
00264         c_count = 0;
00265         g_count = 0;
00266         t_count = 0;
00267         other_count = 0;
00268         for(uint32 i=0; i < bases.length(); i++){
00269                 if((bases[i] == 'a')||(bases[i] == 'A'))
00270                         a_count++;
00271                 else if((bases[i] == 'c')||(bases[i] == 'C'))
00272                         c_count++;
00273                 else if((bases[i] == 'g')||(bases[i] == 'G'))
00274                         g_count++;
00275                 else if((bases[i] == 't')||(bases[i] == 'T'))
00276                         t_count++;
00277                 else
00278                         other_count++;
00279         }
00280 }
00281 
00282 void gnSEQSource::FormatString(string& data, uint32 offset, uint32 width){
00283         //first remove newlines and corresponding whitespace
00284         string::size_type newline_loc = data.find_first_of('\n', 0);
00285         while(newline_loc != string::npos){
00286                 if(data[newline_loc-1] == '\r')
00287                         newline_loc--;
00288                 string::size_type text_loc = newline_loc;
00289                 while((data[text_loc] == ' ') ||(data[text_loc] == '    ')||(data[text_loc] == '\n')||(data[text_loc] == '\r')){
00290                         text_loc++;
00291                         if(text_loc+1 == data.length())
00292                                 break;
00293                 }
00294                 data = (data.substr(0, newline_loc) + " " + data.substr(text_loc));
00295                 newline_loc = data.find_first_of('\n', 0);
00296         }
00297         //now reformat with newlines and whitespace, observing word boundaries...
00298         string output_string = "";
00299         for(uint32 charI = 0; charI < data.length();){
00300                 //get the substring to append and increment charI
00301                 string::size_type base_loc = charI;
00302                 string append_string;
00303                 while(base_loc - charI <= width){
00304                         string::size_type space_loc = data.find_first_of(' ', base_loc+1);
00305                         if(space_loc - charI < width)
00306                                 base_loc = space_loc;
00307                         else if(base_loc == charI){
00308                                 //word is too big for one line.  split it.
00309                                 append_string = data.substr(charI, width);
00310                                 charI+=width;
00311                         }else{
00312                                 append_string = data.substr(charI, base_loc - charI);
00313                                 charI = base_loc;
00314                         }
00315                 }
00316                 output_string += string(offset, ' ') + append_string;
00317                 if(charI + width < data.length())
00318                         output_string += "\r\n";
00319         }
00320         data = output_string;
00321 }
00322 
00323 boolean gnSEQSource::Write(gnGenomeSpec *spec, const string& filename){
00324         ErrorMsg("Writing DNAStar SEQ files is not supported at this time.  Try again next week.\n");
00325         return false;
00326 }
00327 
00328 gnFileContig* gnSEQSource::GetFileContig( const uint32 contigI ) const{
00329         if(m_contigList.size() > contigI)
00330                 return m_contigList[contigI];
00331         return NULL;
00332 }
00333 
00334 //File parsing access routine
00335 boolean gnSEQSource::ParseStream( istream& fin )
00336 {
00337         // INIT temp varables
00338         uint32 readState = 0;
00339         uint32 lineStart = 0;
00340         int64 gapstart = -1;
00341         // INIT buffer
00342         uint32 sectionStart = 0;
00343         uint64 streamPos = 0;
00344         uint64 bufReadLen = 0;
00345         uint64 remainingBuffer = 0;
00346         char* buf = new char[BUFFER_SIZE];
00347         gnFragmentSpec* curFrag = 0;
00348         gnSourceSpec* curSpec = 0;
00349         gnSourceHeader *curHeader;
00350         gnBaseFeature* curFeature;
00351         gnFileContig* curContig = 0;
00352         gnLocation::gnLocationType curBaseLocationType;
00353         gnSeqI curLocationStart;
00354         int32 curStartLength = 0;
00355         int32 curEndLength = 0;
00356         string curLocContig = "";
00357         string curQualifierName;
00358         uint64 curQualifierStart;
00359         string curContigName = "";
00360         gnSeqI seqLength = 0;
00361         gnSeqI lineSeqSize = 0;
00362         
00363         m_spec = new gnGenomeSpec();
00364         while( !fin.eof() )
00365         {
00366                 if(sectionStart > 0){
00367                         if(readState == 15)
00368                                 sectionStart = bufReadLen;
00369                         else if(readState == 16)
00370                                 sectionStart = lineStart;
00371                         remainingBuffer = bufReadLen - sectionStart;
00372                         memmove(buf, buf+sectionStart, remainingBuffer);
00373                 }
00374                   // read chars
00375                 fin.read( buf + remainingBuffer, BUFFER_SIZE - remainingBuffer);
00376                 streamPos -= remainingBuffer;
00377                 lineStart -= sectionStart;
00378                 if(gapstart > 0)
00379                         gapstart -= sectionStart;
00380                 sectionStart = 0;
00381                 bufReadLen = fin.gcount();
00382                 bufReadLen += remainingBuffer;
00383                 
00384                 for( uint32 i=remainingBuffer ; i < bufReadLen ; i++ )
00385                 {
00386                         char ch = buf[i];
00387                         switch( readState )
00388                         {
00389                                 case 0:         //Assume we are in header at the start of a new line.  
00390                                                         //Look for keywords starting in column 1
00391                                         if((ch == '\n')&&(buf[lineStart] != ' ')&&(buf[lineStart] != '  ')){  //not equal to space or tab
00392                                                 if(curSpec == NULL){
00393                                                         curSpec = new gnSourceSpec(this, m_spec->GetSpecListLength());
00394                                                         curFrag = new gnFragmentSpec();
00395                                                         curFrag->AddSpec(curSpec);
00396                                                         curSpec->SetSourceName(m_openString);
00397                                                         m_spec->AddSpec(curFrag);
00398                                                 }
00399                                                 if(lineStart != sectionStart){  //Add the previous header to our list
00400                                                         uint32 j = SEQ_HEADER_NAME_LENGTH-1;
00401                                                         for(; j > 0; j--)       
00402                                                                 if((buf[sectionStart+j] != ' ')&&(buf[sectionStart+j] != '      '))
00403                                                                         break;
00404                                                         string header_name = string(buf+sectionStart, j+1);
00405                                                         curHeader = new gnSourceHeader(this, header_name, sectionStart + streamPos, lineStart - sectionStart);
00406                                                         //if this is header info _before_ a locus statement then its a general file header.
00407                                                         if(strncmp(&buf[lineStart], "LOCUS", 5) == 0)
00408                                                                 m_spec->AddHeader(curHeader);
00409                                                         else    //otherwise its a fragment header.
00410                                                                 curFrag->AddHeader(curHeader);
00411                                                         sectionStart = lineStart;
00412                                                 }
00413                                                 
00414                                                 if(strncmp(&buf[lineStart], "FEATURES", 8) == 0){
00415                                                         sectionStart = i + 1;
00416                                                         readState = 1;  //read in features
00417                                                 }else if(strncmp(&buf[lineStart], "ORIGIN", 6) == 0){
00418                                                         curHeader = new gnSourceHeader(this, string("ORIGIN"), sectionStart + streamPos, i - sectionStart + 1);
00419                                                         curFrag->AddHeader(curHeader);
00420                                                         curContig = new gnFileContig();
00421                                                         curContig->SetName(curContigName);
00422                                                         curContigName = "";
00423                                                         readState = 13;  //read in base pairs
00424                                                 }else if(strncmp(&buf[lineStart], "LOCUS", 5) == 0){
00425                                                         if(strncmp(&buf[lineStart+SEQ_LOCUS_CIRCULAR_COLUMN-1], "circular", 8) == 0)
00426                                                                 curFrag->SetCircular(true);
00427                                                         uint32 j = SEQ_LOCUS_NAME_LENGTH + 1;
00428                                                         for(; j > 0; j--)       
00429                                                                 if((buf[lineStart+SEQ_LOCUS_NAME_COLUMN+j-1] != ' ')&&(buf[sectionStart+SEQ_LOCUS_NAME_COLUMN+j-1] != ' '))
00430                                                                         break;
00431                                                         curContigName = string(buf+lineStart+SEQ_LOCUS_NAME_COLUMN-1, j+1);
00432                                                         curFrag->SetName(curContigName);
00433                                                 }else if(strncmp(&buf[lineStart], "^^", 2) == 0){
00434                                                         //start the sequence.
00435                                                         if(curContig == NULL){
00436                                                                 curContig = new gnFileContig();
00437                                                                 curContig->SetName(curContigName);
00438                                                                 curContigName = "";
00439                                                         }
00440                                                         i--;
00441                                                         readState = 14;
00442                                                         break;
00443                                                 }
00444                                         }
00445                                         if(ch == '\n')
00446                                                 lineStart = i + 1;
00447                                         break;
00448                                 case 1: //look for feature tag in column six.  ignore whitespace before feature.
00449                                         if((ch == ' ')||(ch == '        ')){
00450                                                 break;
00451                                         }else if(ch == '\n'){
00452                                                 lineStart = i + 1;
00453                                                 sectionStart = i + 1;
00454                                                 break;
00455                                         }else if(sectionStart == i){ //there was no whitespace, we hit a TAG instead
00456                                                 i--;
00457                                                 readState = 0; //Deal with a Header TAG
00458                                                 sectionStart = i + 1;
00459                                                 break;
00460                                         }else if((i - lineStart == SEQ_SUBTAG_COLUMN)||((buf[lineStart]=='      ')&&(i==lineStart+1))){
00461                                                 sectionStart = i;
00462                                                 readState = 2;
00463                                         } //
00464                                 case 2:  //Get the feature name.  stop on whitespace
00465                                         if((ch == ' ')||(ch == '        ')){
00466                                                 string featureName(buf+sectionStart, i - sectionStart);
00467                                                 curFeature = new gnFeature(featureName);
00468                                                 curFrag->AddFeature(curFeature);
00469                                                 sectionStart = i + 1;
00470                                                 readState = 3;
00471                                         }
00472                                         break;
00473                                 case 3:   //Ignore whitespace before feature location
00474                                         if((ch == ' ')||(ch == '        ')){
00475                                                 break;
00476                                         }else if((ch == '\r')||(ch == '\n')){
00477                                                 lineStart = i+1;
00478                                                 break;
00479                                         }
00480                                         sectionStart = i;
00481                                         readState = 4;
00482                                 case 4:         //Read a location start.  stop on (<.:^ and whitespace
00483                                         if((ch == ' ')||(ch == '        ')||(ch == '(')||(ch == '.')||(ch=='^')||(ch==':')){
00484                                                 string starter(buf+sectionStart, i - sectionStart);
00485                                                 if(ch == '('){
00486                                                         if(starter == "complement")
00487                                                                 curFeature->SetLocationType(gnLocation::LT_Complement);
00488                                                         else if(starter == "order")
00489                                                                 curFeature->SetLocationType(gnLocation::LT_Order);
00490                                                         else if(starter == "group")
00491                                                                 curFeature->SetLocationType(gnLocation::LT_Group);
00492                                                         else if(starter == "one-of")
00493                                                                 curFeature->SetLocationType(gnLocation::LT_OneOf);
00494                                                         sectionStart = i + 1;   //ignore join since it is default.
00495                                                         break;
00496                                                 }else if(ch == ':'){
00497                                                         curLocContig = starter;
00498                                                         sectionStart = i + 1;
00499                                                         break;
00500                                                 }
00501                                                 curLocationStart = atoi(starter.c_str());
00502                                                 readState = 6;  //read in end base by default.
00503                                                 if(ch == '.'){
00504                                                         //go to special state to look for another one.
00505                                                         readState = 5;
00506                                                         sectionStart = i + 1;
00507                                                         break;
00508                                                 }else if(ch == '^'){
00509                                                         curBaseLocationType = gnLocation::LT_BetweenBases;
00510                                                 }else if((ch == ' ')||(ch == '  ')){
00511                                                         //no end location go to qualifier
00512                                                         gnLocation curLocation(curLocationStart, curLocationStart);
00513                                                         curFeature->AddLocation(curLocation, curFeature->GetLocationListLength());
00514                                                         readState = 7;
00515                                                 }
00516                                                 sectionStart = i + 1;
00517 
00518                                         }else if(ch == '<'){
00519                                                 curStartLength = -1;
00520                                                 sectionStart = i + 1;
00521                                         }else if(ch == '>'){
00522                                                 curStartLength = 1;
00523                                                 sectionStart = i + 1;
00524                                         }
00525                                         break;
00526                                 case 5: //look for another period or location start.
00527                                         if(ch == '.'){
00528                                                 curBaseLocationType = gnLocation::LT_Standard;
00529                                                 readState = 6;
00530                                                 sectionStart = i + 1;
00531                                                 break;
00532                                         }
00533                                         curBaseLocationType = gnLocation::LT_OneOf;
00534                                 case 6: //see if there's a second location value.  stop on >, and whitespace
00535                                         if(ch == '>'){
00536                                                 curEndLength = 1;
00537                                                 sectionStart = i + 1;
00538                                         }else if(ch == '<'){
00539                                                 curEndLength = -1;
00540                                                 sectionStart = i + 1;
00541                                         }else if((ch == ' ')||(ch == '  ')||(ch == ',')){
00542                                                 //read end location
00543                                                 string ender(buf+sectionStart, i - sectionStart);
00544                                                 gnSeqI curLocationEnd = atoi(ender.c_str());
00545                                                 gnLocation curLocation(curLocationStart, curStartLength, curLocationEnd, curEndLength, curBaseLocationType);
00546                                                 curEndLength = 0;
00547                                                 curStartLength = 0;
00548                                                 curFeature->AddLocation(curLocation, curFeature->GetLocationListLength());
00549                                                 readState = ch == ',' ? 3 : 7;  //read another loc if we need to.
00550                                                 sectionStart = i+1;
00551                                         }
00552                                         break;
00553                                 case 7:  //skip to start of qualifier
00554                                         if((ch != ' ')&&(ch != '        ')&&(lineStart == i)){
00555                                                 sectionStart = i;       // Hit a tag.  go deal with it.
00556                                                 readState = 0;
00557                                                 i--;
00558                                         }else if((ch != ' ')&&(ch != '  ')&&((lineStart == i - SEQ_SUBTAG_COLUMN)||((buf[lineStart]=='  ')&&(i==lineStart+1)))){
00559                                                 sectionStart = i;       // Hit a feature.  go deal with it.
00560                                                 readState = 2;
00561                                                 i--;
00562                                         }else if(ch == ','){  //oops!  another location to read!
00563                                                 sectionStart = i+1;
00564                                                 readState = 3;
00565                                         }else if(ch == '/'){  //finally, a qualifier.
00566                                                 sectionStart = i+1;
00567                                                 readState = 8;
00568                                         }else if(ch == '\n')
00569                                                 lineStart = i + 1;
00570                                         break;
00571                                 case 8:         //get a qualifier, stop on =
00572                                         if(ch == '='){
00573                                                 curQualifierName = string(buf+sectionStart, i - sectionStart);
00574                                                 readState = 9;
00575                                                 sectionStart = i+1;
00576                                         }
00577                                         break;
00578                                 case 9:         //are we getting a string? look for " or [
00579                                         if(ch == '"'){
00580                                                 readState = 10;
00581                                                 sectionStart = i;
00582                                                 curQualifierStart = i + streamPos;
00583                                         }else if(ch == '['){
00584                                                 readState = 11;
00585                                                 sectionStart = i;
00586                                         }else if((ch == '\r')||(ch == '\n')){
00587                                                 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, sectionStart + streamPos, i - sectionStart));
00588                                                 sectionStart = i+1;
00589                                                 readState = 7; //look for another qualifier
00590                                         }
00591                                         break;
00592                                 case 10:                //read until the end of the quotation. look out for escaped quotes
00593                                         if(ch == '"')
00594                                                 readState = 11;
00595                                         if(ch == '\n'){
00596                                                 lineStart = i + 1;
00597                                         }
00598                                         break;
00599                                 case 11:
00600                                         if(ch != '"'){
00601                                                 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, curQualifierStart, i - sectionStart));
00602                                                 sectionStart = i+1;
00603                                                 readState = 7;  //look for another qualifier.
00604                                                 if(ch == '\n')
00605                                                         lineStart = i + 1;
00606                                         }else
00607                                                 readState = 10;  //quote was escaped.  look for another.
00608                                         break;
00609                                 case 12:
00610                                         if(ch == ']'){
00611                                                 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, sectionStart + streamPos, i - sectionStart));
00612                                                 sectionStart = i+1;
00613                                                 readState = 7;  //look for another qualifier.
00614                                         }
00615                                         break;
00616                                 case 13:        //start the sequence read.
00617                                         if(ch == '^')   //stupid blattlab file format.
00618                                                 readState = 14;
00619                                         else{
00620                                                 curContig->SetSectStart(gnContigSequence, i + 1 + streamPos);
00621                                                 readState = 16;
00622                                         }
00623                                         break;
00624                                 case 14:        //wait for newline before sequence starts.
00625                                         if(ch == '\n'){
00626                                                 curContig->SetRepeatSeqGap(true);
00627                                                 lineStart = i + 1;
00628                                                 sectionStart = i + 1;
00629                                                 curContig->SetSectStart(gnContigSequence, i + 1 + streamPos);
00630                                                 readState = 15;
00631                                         }
00632                                         break;
00633                                 case 15:
00634                                         if(m_pFilter->IsValid(ch))
00635                                                 seqLength++;
00636                                         else
00637                                                 curContig->SetRepeatSeqGap(false);
00638                                         break;
00639                                 case 16:
00640                                         if((ch == '/')&&(i==lineStart)){
00641                                                 readState = 17;
00642                                         }else if(m_pFilter->IsValid(ch)){
00643                                                 seqLength++;
00644                                                 lineSeqSize++;
00645                                                 if(gapstart >= 0){
00646                                                         curContig->SetRepeatGapSize(i - gapstart);
00647                                                         gapstart = -1;
00648                                                 }
00649                                         }else if(ch == '\n'){   //IMPLEMENT ME! Needs consistent gap size checking
00650                                                 if(sectionStart == lineStart){
00651                                                         curContig->SetRepeatSeqGap(true);
00652                                                         curContig->SetRepeatSeqSize(seqLength);
00653                                                         gapstart = i;
00654                                                         for(; gapstart >= lineStart; gapstart--)
00655                                                                 if(m_pFilter->IsValid(buf[gapstart]))
00656                                                                         break;
00657                                                         gapstart++;
00658                                                 }else if(lineSeqSize != curContig->GetRepeatSeqGapSize().first)
00659                                                         curContig->SetRepeatSeqGap(false);
00660                                                 lineSeqSize = 0;
00661                                                 lineStart = i + 1;
00662                                         }
00663                                         break;
00664                                 case 17:
00665                                         if((ch == '\n')&&(buf[lineStart+1] == '/')){
00666                                                 curContig->SetSectEnd(gnContigSequence, lineStart - 2 + streamPos);
00667                                                 curContig->SetSeqLength(seqLength);
00668                                                 m_contigList.push_back(curContig);
00669                                                 curContig = 0;
00670                                                 curSpec->SetLength(seqLength);
00671                                                 curSpec = 0;
00672                                                 seqLength = 0;
00673                                                 lineStart = i + 1;
00674                                                 sectionStart = i + 1;
00675                                                 readState = 0;
00676                                         }
00677                                         break;
00678                         }
00679                 }
00680                 streamPos += bufReadLen;
00681         }
00682         if(curContig != 0){
00683                 curContig->SetSectEnd(gnContigSequence, streamPos - 1);
00684                 curContig->SetSeqLength(seqLength);
00685                 m_contigList.push_back(curContig);
00686                 curSpec->SetLength(seqLength);
00687         }
00688         if(curSpec != NULL)
00689                 if((curFrag->GetFeatureListLength() == 0) && (curFrag->GetHeaderListLength() == 0)
00690                         &&(curSpec->GetLength() == 0)){
00691                         m_spec->RemoveSpec(m_spec->GetSpecListLength() - 1);
00692                         delete curFrag;
00693                 }
00694         m_ifstream.clear();
00695         delete[] buf;
00696         return true;
00697 }

Generated on Mon Mar 28 06:00:21 2005 for libGenome by doxygen 1.3.6