src/gnGBKSource.cpp

Go to the documentation of this file.
00001 
00002 // File:            gnGBKSource.h
00003 // Purpose:         Implements gnBaseSource for GenBank sequences
00004 // Description:     
00005 // Changes:        
00006 // Version:         libGenome 0.5.1 
00007 // Author:          Aaron Darling 
00008 // Modified by:     
00009 // Copyright:       (c) Aaron Darling 
00010 // Licenses:        See COPYING file for details
00012 #include "gn/gnFilter.h"
00013 #include "gn/gnFeature.h"
00014 #include "gn/gnGBKSource.h"
00015 #include "gn/gnSourceSpec.h"
00016 #include "gn/gnSourceHeader.h"
00017 #include "gn/gnSourceQualifier.h"
00018 #include "gn/gnLocation.h"
00019 #include "gn/gnStringTools.h"
00020 #include "gn/gnDebug.h"
00021 #include "gn/gnStringQualifier.h"
00022 #include <string>
00023 
00024 gnGBKSource::gnGBKSource()
00025 {
00026         m_openString = "";
00027         m_pFilter = gnFilter::proteinSeqFilter();
00028         if(m_pFilter == NULL){
00029                 DebugMsg("Error using static sequence filters.");
00030         }
00031 }
00032 gnGBKSource::gnGBKSource( const gnGBKSource& s ) : gnFileSource(s)
00033 {
00034         vector< gnFileContig* >::const_iterator iter = s.m_contigList.begin();
00035         for( ; iter != s.m_contigList.end(); ++iter )
00036         {
00037                 m_contigList.push_back( (*iter)->Clone() );
00038         }
00039 }
00040 gnGBKSource::~gnGBKSource()
00041 {
00042         m_ifstream.close();
00043         vector< gnFileContig* >::iterator iter = m_contigList.begin();
00044         for( ; iter != m_contigList.end(); ++iter )
00045         {
00046                 gnFileContig* fg = *iter;
00047                 *iter = 0;
00048                 delete fg;
00049         }
00050 }
00051 boolean gnGBKSource::HasContig( const string& name ) const
00052 {
00053         for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00054         {
00055                 if( name == m_contigList[i]->GetName() )
00056                         return true;
00057         }
00058         return false;
00059 }
00060 uint32 gnGBKSource::GetContigID( const string& name ) const
00061 {
00062         for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00063         {
00064                 if( name == m_contigList[i]->GetName() )
00065                         return i;
00066         }
00067         return ALL_CONTIGS;
00068 }
00069 string gnGBKSource::GetContigName( const uint32 i ) const
00070 {
00071         if( i < m_contigList.size() )
00072         {
00073                 return m_contigList[i]->GetName();
00074         }
00075         return "";
00076 }
00077 gnSeqI gnGBKSource::GetContigSeqLength( const uint32 i ) const
00078 {
00079         if( i == ALL_CONTIGS)
00080                 return m_spec->GetLength();
00081         if( i < m_contigList.size() )
00082         {
00083                 return m_contigList[i]->GetSeqLength();
00084         }
00085         return GNSEQI_ERROR;
00086 }
00087 
00088 boolean gnGBKSource::SeqRead( const gnSeqI start, char* buf, gnSeqI& bufLen, const uint32 contigI ){
00089         uint64 startPos = 0;
00090         uint64 readableBytes = 0;
00091         if( !SeqSeek( start, contigI, startPos, readableBytes ) )
00092         {
00093                 bufLen = 0;
00094                 return false;
00095         }
00096         
00097         if( contigI == ALL_CONTIGS )
00098         {
00099                 uint32 curLen = 0;
00100                 uint64 bytesRead = 0;
00101                 while (curLen < bufLen)
00102                 {
00103 //SeqSeek to start, Figure out how much can be read before SeqSeeking again.
00104                         if(readableBytes <= 0)  //Look out for zero length contigs!  IMPLEMENT ME
00105                                 if( !SeqSeek( start + curLen, contigI, startPos, readableBytes ) ){
00106                                         bufLen = curLen;
00107                                         return true;
00108                                 }
00109                         //readLen is the amount to read on this pass
00110                         uint64 readLen = (bufLen - curLen) < readableBytes ? (bufLen - curLen) : readableBytes; 
00111                         Array<gnSeqC> array_buf( readLen );
00112                         gnSeqC* tmpBuf = array_buf.data;
00113 
00114                         // read chars and filter
00115                         m_ifstream.read(tmpBuf, readLen);
00116                         uint64 gc = m_ifstream.gcount();
00117                         bytesRead += gc;
00118                         readableBytes -= gc;
00119                         for(uint32 i=0; i < gc; i++){
00120                                 if( m_pFilter->IsValid(tmpBuf[i]) ){
00121                                         buf[curLen] = tmpBuf[i];
00122                                         curLen++;
00123                                 }
00124                         }
00125                         if(m_ifstream.eof()){   //we hit the end of the file.  bail out.
00126                                 m_ifstream.clear();
00127                                 bufLen = curLen;
00128                                 return true;
00129                         }
00130                 }
00131                 bufLen = curLen;
00132         }
00133         else if( contigI < m_contigList.size() )
00134         {
00135                 uint32 curLen = 0;
00136                 //check to see if the buffer is bigger than the contig.  if so truncate it.
00137                 gnSeqI contigSize = m_contigList[contigI]->GetSeqLength();
00138                 bufLen = bufLen < contigSize ? bufLen : contigSize;
00139                 while (curLen < bufLen)
00140                 {
00141                         uint64 readLen = bufLen - curLen;       //the amount to read on this pass
00142                         Array<gnSeqC> array_buf( readLen );
00143                         gnSeqC* tmpBuf = array_buf.data;
00144 
00145                         // read chars and filter
00146                         m_ifstream.read(tmpBuf, readLen);
00147                         uint64 gc = m_ifstream.gcount();
00148 //                      cout << "Read " << gc << " chars from " << m_openString << "\n";
00149 //                      cout << "Checking character validity on: " << tmpBuf << "\n";
00150                         for(uint32 i=0; i < gc; i++){
00151                                 if( m_pFilter->IsValid(tmpBuf[i]) ){
00152                                         buf[curLen] = tmpBuf[i];
00153                                         curLen++;
00154                                 }
00155                         }
00156                         if(m_ifstream.eof()){   //we hit the end of the file.  bail out.
00157                                 m_ifstream.clear();
00158                                 bufLen = curLen;
00159                                 return true;
00160                         }
00161                 }
00162                 bufLen = curLen;
00163         }
00164         return true;
00165 
00166 }
00167 // private:
00168 // figures out which contig the sequence starts at then calls SeqStartPos to get the offset within that contig
00169 // returns startPos, the file offset where the sequence starts
00170 // returns true if successful, false otherwise
00171 boolean gnGBKSource::SeqSeek( const gnSeqI start, const uint32& contigI, uint64& startPos, uint64& readableBytes )
00172 {
00173         if( contigI == ALL_CONTIGS )
00174         {
00175                 // find first contig
00176                 gnSeqI curIndex = 0;
00177                 vector< gnFileContig* >::iterator iter = m_contigList.begin();
00178                 for( ; iter != m_contigList.end(); ++iter )
00179                 {
00180                         uint64 len = (*iter)->GetSeqLength();
00181                         if( (curIndex + len) > start )
00182                                 break;
00183                         curIndex += len;
00184                 }
00185                 if( iter == m_contigList.end() )
00186                         return false;
00187                 // seek to start
00188                 gnSeqI startIndex = start - curIndex;  //startIndex is starting pos. within the contig
00189                 return SeqStartPos( startIndex, *(*iter), startPos, readableBytes );
00190         }
00191         else if( contigI < m_contigList.size() )
00192         {
00193                 return SeqStartPos( start, *(m_contigList[contigI]), startPos, readableBytes );
00194         }
00195         return false;
00196 }
00197 //Returns startPos, the file offset where the sequence starts.
00198 boolean gnGBKSource::SeqStartPos( const gnSeqI start, gnFileContig& contig, uint64& startPos, uint64& readableBytes )
00199 {
00200         readableBytes = 0;
00201         uint32 curLen = 0;
00202         //seek to the file offset where the contig starts
00203         startPos = contig.GetSectStartEnd(gnContigSequence).first;      //set startPos to start where the contig starts
00204         m_ifstream.seekg( startPos, ios::beg );
00205         if( m_ifstream.eof() ){
00206                 ErrorMsg("ERROR in gnGBKSource::Incorrect contig start position, End of file reached!\n");
00207                 return false;
00208         }
00209         while( true )
00210         {
00211                   // READ the rest of the contig skipping over invalid characters until we get to the starting base pair.
00212                   // startPos will contain the file offset with the starting base pair
00213                 uint32 tmpbufsize = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00214                 if(tmpbufsize == 0){
00215                         ErrorMsg("ERROR in gnGBKSource: stored contig size is incorrect.");
00216                         return false;
00217                 }
00218                 uint64 startOffset = start;
00219                 if(contig.HasRepeatSeqGap()){   //check for sequence integrity
00220                         startOffset += (9 + m_newlineSize) * (start / 60 + 1) + start / 10 + 1;
00221                         if( m_newlineSize == 2 )        // test compensation for strange bug
00222                                 startOffset--;
00223                         startPos+=startOffset;
00224                         m_ifstream.seekg(startPos , ios::beg);
00225                         readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00226                         return true;
00227                 }
00228 
00229                 //sequence is corrupt.  read in base by base
00230                 tmpbufsize = tmpbufsize < BUFFER_SIZE ? tmpbufsize : BUFFER_SIZE;  //read in the smaller of the two.
00231                 Array<char> array_buf( tmpbufsize );
00232                 char* tmpbuf = array_buf.data;
00233 
00234                 m_ifstream.read( tmpbuf, tmpbufsize );
00235                 if( m_ifstream.eof() ){
00236                         ErrorMsg("ERROR in gnGBKSource::Read End of file reached!\n");
00237                         return false;
00238                 }
00239                 for( uint32 i=0; i < tmpbufsize; ++i ){
00240                         if( m_pFilter->IsValid(tmpbuf[i]) ){
00241                                 if( curLen >= start ){ //stop when we reach the starting offset within this contig
00242                                         startPos += i;
00243                                         m_ifstream.seekg( startPos, ios::beg );  //seek to startPos
00244                                         readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00245                                         return true;
00246                                 }
00247                                 ++curLen;  //each time we read a valid b.p., increment the sequence length
00248                         }
00249                 }
00250                 startPos += tmpbufsize;
00251         }
00252         return true;
00253 }
00254 
00255 void gnGBKSource::FormatString(string& data, uint32 offset, uint32 width){
00256         //first remove newlines and corresponding whitespace
00257         string::size_type newline_loc = data.find_first_of('\n', 0);
00258         while(newline_loc != string::npos){
00259                 if(data[newline_loc-1] == '\r')
00260                         newline_loc--;
00261                 string::size_type text_loc = newline_loc;
00262                 while((data[text_loc] == ' ') ||(data[text_loc] == '    ')||(data[text_loc] == '\n')||(data[text_loc] == '\r')){
00263                         text_loc++;
00264                         if(text_loc+1 == data.length())
00265                                 break;
00266                 }
00267                 data = (data.substr(0, newline_loc) + " " + data.substr(text_loc));
00268                 newline_loc = data.find_first_of('\n', 0);
00269         }
00270         //now reformat with newlines and whitespace, observing word boundaries...
00271         string output_string = "";
00272         for(uint32 charI = 0; charI < data.length();){
00273                 //get the substring to append and increment charI
00274                 string::size_type base_loc = charI;
00275                 string append_string;
00276                 while(base_loc - charI <= width){
00277                         string::size_type space_loc = data.find_first_of(' ', base_loc+1);
00278                         if(space_loc - charI < width)
00279                                 base_loc = space_loc;
00280                         else if(base_loc == charI){
00281                                 //word is too big for one line.  split it.
00282                                 append_string = data.substr(charI, width);
00283                                 charI+=width;
00284                         }else{
00285                                 append_string = data.substr(charI, base_loc - charI);
00286                                 charI = base_loc;
00287                         }
00288                 }
00289                 output_string += string(offset, ' ') + append_string;
00290                 if(charI + width < data.length())
00291                         output_string += "\r\n";
00292         }
00293         data = output_string;
00294 }
00295 
00296 template< class SubSpec >
00297 void WriteHeader(gnMultiSpec< SubSpec >* spec, const string& hdr, ofstream& m_ofstream) {
00298         gnBaseHeader* gpbh = NULL;
00299         uint32 header_index = 0;
00300         try{
00301                 do{
00302                         gpbh = spec->GetHeader(hdr, header_index);
00303                         m_ofstream << gpbh->GetHeader();
00304                         header_index++;
00305                 }while(gpbh != NULL);
00306         }catch(gnException& gne){}
00307 }
00308 
00309 boolean gnGBKSource::Write(gnSequence& seq, const string& filename){
00310         ofstream m_ofstream(filename.c_str(), ios::out | ios::binary);
00311         if(!m_ofstream.is_open())
00312                 return false;
00313 
00314         string newline = "\r\n";
00315         gnGenomeSpec* spec = seq.GetSpec();
00316 
00317         // output general file header first if one exists.
00318         if(spec->GetHeaderListLength() == 1){
00319                 gnBaseHeader *gpbh = spec->GetHeader(0);
00320                 string name = gpbh->GetHeaderName();
00321                 //IMPLEMENT ME!  Is platform specific newline substitution necessary?
00322                 if(string::npos != name.find(".SEQ")){
00323                         string header = gpbh->GetHeader();
00324                         m_ofstream << header;
00325                 }
00326         }
00327         // TODO:  Figure out where the buffer overflow is and reduce
00328         // this back to BUFFER_SIZE -- overflow appears not to corrupt
00329         // sequence data
00330         Array<gnSeqC> array_buf( 2 * BUFFER_SIZE );
00331         gnSeqC *bases = array_buf.data;
00332 
00333         for(uint32 specI = 0; specI < spec->GetSpecListLength(); specI++){
00334                 gnFragmentSpec* subSpec = spec->GetSpec(specI);
00335                 
00336                 //write out contig headers.  start with LOCUS...
00337                 m_ofstream << "LOCUS       ";
00338                 //write Locus Name
00339                 string contigName = subSpec->GetName();
00340                 if(contigName.length() > SEQ_LOCUS_NAME_LENGTH)
00341                         contigName = contigName.substr(0, SEQ_LOCUS_NAME_LENGTH);
00342                 uint32 filler_size = SEQ_LOCUS_NAME_LENGTH - contigName.length();
00343                 m_ofstream << contigName << string(filler_size, ' ');
00344                 //write Locus Length
00345                 string length_string = uintToString(subSpec->GetLength());
00346                 filler_size = SEQ_LOCUS_SIZE_LENGTH - length_string.size();
00347                 m_ofstream << string(filler_size, ' ') << length_string << " bp ";
00348                 //write dnatype
00349                 string dnatype = string(SEQ_LOCUS_DNATYPE_LENGTH, ' ');
00350                 uint32 head_look_i = 0;
00351                 gnBaseHeader* gpbh = NULL;
00352                 try{
00353                         gpbh = subSpec->GetHeader("LOCUS", head_look_i);
00354                 }catch(gnException& gne){}
00355                 if( gpbh != NULL )
00356                         dnatype = gpbh->GetHeader().substr(SEQ_LOCUS_DNATYPE_OFFSET, SEQ_LOCUS_DNATYPE_LENGTH);
00357                 m_ofstream << dnatype << string(2, ' ');
00358                 //write circularity
00359                 string circular = subSpec->IsCircular() ? string("circular  ") : string(10, ' ');
00360                 m_ofstream << circular;
00361                 //write division code
00362                 string division = string(SEQ_LOCUS_DIVCODE_LENGTH, ' ');
00363                 if(gpbh != NULL)
00364                         division = gpbh->GetHeader().substr(SEQ_LOCUS_DIVCODE_OFFSET, SEQ_LOCUS_DIVCODE_LENGTH);
00365                 m_ofstream << division;
00366                 //write date -- IMPLEMENT ME!  format the real date to spec! dd-mmm-yyyy
00367                 string date = string(SEQ_LOCUS_DATE_LENGTH, ' ');
00368                 if(gpbh != NULL)
00369                         date = gpbh->GetHeader().substr(SEQ_LOCUS_DATE_OFFSET, SEQ_LOCUS_DATE_LENGTH);
00370                 m_ofstream << string(7, ' ') << date << "\r\n";
00371                 
00372                 //write out the rest of the headers if they were supplied!
00373                 WriteHeader(subSpec, "DEFINITION", m_ofstream);
00374                 WriteHeader(subSpec, "ACCESSION", m_ofstream);
00375                 WriteHeader(subSpec, "VERSION", m_ofstream);
00376                 WriteHeader(subSpec, "KEYWORDS", m_ofstream);
00377                 WriteHeader(subSpec, "SEGMENT", m_ofstream);
00378                 WriteHeader(subSpec, "SOURCE", m_ofstream);
00379                 WriteHeader(subSpec, "REFERENCE", m_ofstream);
00380                 WriteHeader(subSpec, "COMMENT", m_ofstream);
00381 
00382                 //write out feature table!
00383                 m_ofstream << "FEATURES             Location/Qualifiers" << "\r\n";
00384                 for(uint32 featureI = 0; featureI < subSpec->GetFeatureListLength(); featureI++){
00385                         //write a feature tag
00386                         gnBaseFeature *gpmf = subSpec->GetFeature(featureI);
00387                         string featureName = gpmf->GetName();
00388                         m_ofstream << string(SEQ_SUBTAG_COLUMN, ' ') << featureName;
00389                         m_ofstream << string(SEQ_FEATURE_LOC_OFFSET - featureName.length() - SEQ_SUBTAG_COLUMN, ' ');
00390                         //ready to output location b.s.
00391                         uint32 location_count = gpmf->GetLocationListLength();
00392                         uint32 line_pos = SEQ_FEATURE_LOC_OFFSET;
00393                         uint32 parenthesis_count = 0;
00394                         if(location_count > 1){
00395                                 m_ofstream << "join(";
00396                                 line_pos += 5;
00397                                 parenthesis_count++;
00398                         }
00399                         gnLocation::gnLocationType loc_type = gpmf->GetLocationType();
00400                         switch(loc_type){
00401                                 case gnLocation::LT_Standard:
00402                                         break;
00403                                 case gnLocation::LT_Complement:
00404                                         m_ofstream << "complement(";
00405                                         line_pos += 11;
00406                                         parenthesis_count++;
00407                                         break;
00408                                 case gnLocation::LT_Order:
00409                                         m_ofstream << "order(";
00410                                         line_pos += 6;
00411                                         parenthesis_count++;
00412                                         break;
00413                                 case gnLocation::LT_Group:
00414                                         m_ofstream << "group(";
00415                                         parenthesis_count++;
00416                                         line_pos += 6;
00417                                         break;
00418                                 case gnLocation::LT_OneOf:
00419                                         m_ofstream << "one-of(";
00420                                         parenthesis_count++;
00421                                         line_pos += 7;
00422                                         break;                          
00423                                 default:
00424                                         break;
00425                         }
00426                         //create the location string, then see if it will fit on the line
00427                         string location;
00428                         for(uint32 locationI = 0; locationI < location_count; locationI++){
00429                                 gnLocation gpl = gpmf->GetLocation(locationI);
00430                                 if(gpl.IsStartBoundLonger())
00431                                         location += ">";
00432                                 if(gpl.IsStartBoundShorter())
00433                                         location += "<";
00434                                 location += uintToString(gpl.GetStart());
00435                                 gnSeqI end_loc = gpl.GetEnd();
00436                                 if(end_loc != 0){
00437                                         switch(gpl.GetType()){
00438                                                 case gnLocation::LT_BetweenBases:
00439                                                         location += "^";
00440                                                         break;
00441                                                 case gnLocation::LT_OneOf:
00442                                                         location += ".";
00443                                                         break;
00444                                                 default:
00445                                                         location += "..";
00446                                                         break;
00447                                         }
00448                                         if(gpl.IsEndBoundShorter())
00449                                                 location += "<";
00450                                         if(gpl.IsEndBoundLonger())
00451                                                 location += ">";
00452                                         location+= uintToString(end_loc);
00453                                 }
00454                                 if(locationI +1 < location_count)
00455                                         location += ",";
00456                                 else{   //append necessary parenthesis
00457                                         for(;parenthesis_count > 0; parenthesis_count--)
00458                                                 location += ")";
00459                                 }
00460                                 //put it on this line if it fits.  otherwise make a new line.
00461                                 if(line_pos + location.length() < SEQ_COLUMN_WIDTH){
00462                                         m_ofstream << location;
00463                                         line_pos += location.length();
00464                                 }else{
00465                                         m_ofstream << "\r\n" << string(SEQ_FEATURE_LOC_OFFSET, ' ') << location;
00466                                         line_pos = SEQ_FEATURE_LOC_OFFSET + location.length();
00467                                 }
00468                                 location = "";
00469                         }
00470                         m_ofstream << "\r\n";
00471                         //now output qualifiers!  yaay!
00472                         //god damn this is a big ugly piece of code.
00473                         
00474                         uint32 qualifier_count = gpmf->GetQualifierListLength();
00475                         for(uint32 qualifierI = 0; qualifierI < qualifier_count; qualifierI++){
00476                                 m_ofstream << string(SEQ_FEATURE_LOC_OFFSET, ' ');
00477                                 gnBaseQualifier* qualifier = gpmf->GetQualifier(qualifierI);
00478                                 m_ofstream << "/" << qualifier->GetName() << "=";
00479                                 //IMPLEMENT ME! do a better word wrap on this bitch.
00480                                 string qually = string(qualifier->GetValue());
00481 //                              FormatString(qually, SEQ_FEATURE_LOC_OFFSET, 80 - SEQ_FEATURE_LOC_OFFSET);
00482 //                              qually = qually.substr(SEQ_FEATURE_LOC_OFFSET);
00483                                 m_ofstream << qually << "\r\n";
00484                         }
00485                         if(gpmf != NULL)
00486                                 delete gpmf;
00487                 }
00488                 
00489                 //get information about the sequence we're writing out.
00490                 gnSeqI readOffset = seq.contigStart(specI);
00491                 gnSeqI readLength = seq.contigLength(specI);
00492 
00493                 //finally - output base count and origin
00494                 m_ofstream << "BASE COUNT ";
00495                 gnSeqI a_count=0, c_count=0, g_count=0, t_count=0, other_count=0;
00496                 gnSeqI countLen = readLength + readOffset;
00497                 for(gnSeqI countI = readOffset; countI < countLen;){
00498                         gnSeqI writeLen = countLen - countI < BUFFER_SIZE ? countLen - countI : BUFFER_SIZE;
00499                         if(!seq.ToArray(bases, writeLen, countI))
00500                                 return false;
00501                         gnSeqI a, c, g, t, other;
00502                         BaseCount(string(bases, writeLen), a, c, g, t, other);
00503                         a_count += a;
00504                         c_count += c;
00505                         g_count += g;
00506                         t_count += t;
00507                         other_count += other;
00508                         countI += writeLen;
00509                 }
00510                 m_ofstream << uintToString(a_count) << " a ";
00511                 m_ofstream << uintToString(c_count) << " c ";
00512                 m_ofstream << uintToString(g_count) << " g ";
00513                 m_ofstream << uintToString(t_count) << " t ";
00514                 m_ofstream << uintToString(other_count) << " others" << "\r\n";
00515 
00516                 string origin = "ORIGIN\r\n";
00517                 head_look_i = 0;
00518                 try{
00519                         gpbh = subSpec->GetHeader("ORIGIN", head_look_i);
00520                         origin = gpbh->GetHeader();
00521                         m_ofstream << origin;
00522                 }catch(gnException& gne){
00523                         m_ofstream << "ORIGIN" << endl;
00524                 }
00525                 //write out the sequence
00526                 gnSeqI contig_bases = 0;
00527                 while(readLength > 0){  //buffer the read/writes
00528                         gnSeqI writeLen = readLength < BUFFER_SIZE + 20 ? readLength : BUFFER_SIZE + 20;
00529                         boolean success = seq.ToArray(bases, writeLen, readOffset);
00530                         if(!success)
00531                                 return false;
00532                         //print each 60 on their own lines...
00533                         for(gnSeqI curbaseI = 0; curbaseI < writeLen; curbaseI += 60){
00534                                 string baseIndexStr = uintToString(contig_bases + curbaseI +1);
00535                                 m_ofstream << string(SEQ_BASES_INDEX_END - baseIndexStr.length(), ' ');
00536                                 m_ofstream << baseIndexStr;
00537                                 for(gnSeqI base_offset = 0; base_offset <= 50; base_offset+=10){
00538                                         if(writeLen <= curbaseI + base_offset)
00539                                                 break;
00540                                         int64 print_length = writeLen - (curbaseI + base_offset);
00541                                         print_length = print_length > 10 ? 10 : print_length;
00542                                         m_ofstream << ' ' << string(bases + curbaseI + base_offset, print_length);
00543                                 }
00544                                 m_ofstream << "\r\n";
00545                         }
00546                         readLength -= writeLen;
00547                         readOffset += writeLen;
00548                         contig_bases += writeLen;
00549                 }
00550                 m_ofstream << "//\r\n";
00551         }
00552         
00553         m_ofstream.close();
00554         return true;
00555 }
00556 
00557 gnFileContig* gnGBKSource::GetFileContig( const uint32 contigI ) const{
00558         if(m_contigList.size() > contigI)
00559                 return m_contigList[contigI];
00560         return NULL;
00561 }
00562 
00563 //File parsing access routine
00564 boolean gnGBKSource::ParseStream( istream& fin )
00565 {
00566         // INIT temp varables
00567         uint32 readState = 0;
00568         uint32 lineStart = 0;
00569         // INIT buffer
00570         uint32 sectionStart = 0;
00571         uint64 streamPos = 0;
00572         uint64 bufReadLen = 0;
00573         uint64 remainingBuffer = 0;
00574         Array<char> array_buf( BUFFER_SIZE );
00575         char* buf = array_buf.data;
00576         gnFragmentSpec* curFrag = 0;
00577         gnSourceSpec* curSpec = 0;
00578         gnSourceHeader *curHeader;
00579         gnFeature* curFeature;
00580         gnFileContig* curContig = 0;
00581         gnLocation::gnLocationType curBaseLocationType;
00582         gnSeqI curLocationStart;
00583         int32 curStartLength = 0;
00584         int32 curEndLength = 0;
00585         string curLocContig = "";
00586         string curQualifierName;
00587         uint64 curQualifierStart;
00588         string curContigName = "";
00589         gnSeqI seqLength = 0;
00590         gnSeqI seqChunk, seqChunkCount, gapChunk;
00591         boolean corruptWarning = false;
00592         
00593         //decide what type of newlines we have
00594         DetermineNewlineType();
00595 
00596         m_spec = new gnGenomeSpec();
00597         while( !fin.eof() )
00598         {
00599                 if(sectionStart > 0){
00600                         if(readState == 14)
00601                                 sectionStart = lineStart;
00602                         remainingBuffer = bufReadLen - sectionStart;
00603                         memmove(buf, buf+sectionStart, remainingBuffer);
00604                 }
00605                   // read chars
00606                 fin.read( buf + remainingBuffer, BUFFER_SIZE - remainingBuffer);
00607                 streamPos -= remainingBuffer;
00608                 lineStart -= sectionStart;
00609                 sectionStart = 0;
00610                 bufReadLen = fin.gcount();
00611                 bufReadLen += remainingBuffer;
00612                 
00613                 for( uint32 i=remainingBuffer ; i < bufReadLen ; i++ )
00614                 {
00615                         char ch = buf[i];
00616                         switch( readState )
00617                         {
00618                                 case 0:         //Assume we are in header at the start of a new line.  
00619                                                         //Look for keywords starting in column 1
00620                                         if((ch == '\n')&&(buf[lineStart] != ' ')&&(buf[lineStart] != '  ')){  //not equal to space or tab
00621                                                 if(curSpec == NULL){
00622                                                         curSpec = new gnSourceSpec(this, m_spec->GetSpecListLength());
00623                                                         curFrag = new gnFragmentSpec();
00624                                                         curFrag->AddSpec(curSpec);
00625                                                         curSpec->SetSourceName(m_openString);
00626                                                         m_spec->AddSpec(curFrag);
00627                                                 }
00628                                                 if(lineStart != sectionStart){  //Add the previous header to our list
00629                                                         uint32 j = SEQ_HEADER_NAME_LENGTH-1;
00630                                                         for(; j > 0; j--)       
00631                                                                 if((buf[sectionStart+j] != ' ')&&(buf[sectionStart+j] != '      '))
00632                                                                         break;
00633                                                         string header_name = string(buf+sectionStart, j+1);
00634                                                         curHeader = new gnSourceHeader(this, header_name, sectionStart + streamPos, lineStart - sectionStart);
00635                                                         //if this is header info _before_ a locus statement then its a general file header.
00636                                                         if(strncmp(&buf[lineStart], "LOCUS", 5) == 0)
00637                                                                 m_spec->AddHeader(curHeader);
00638                                                         else    //otherwise its a fragment header.
00639                                                                 curFrag->AddHeader(curHeader);
00640                                                         sectionStart = lineStart;
00641                                                 }
00642                                                 
00643                                                 if(strncmp(&buf[lineStart], "FEATURES", 8) == 0){
00644                                                         sectionStart = i + 1;
00645                                                         readState = 1;  //read in features
00646                                                 }else if(strncmp(&buf[lineStart], "ORIGIN", 6) == 0){
00647                                                         curHeader = new gnSourceHeader(this, string("ORIGIN"), sectionStart + streamPos, i - sectionStart + 1);
00648                                                         curFrag->AddHeader(curHeader);
00649                                                         curContig = new gnFileContig();
00650                                                         curContig->SetName(curContigName);
00651                                                         curContigName = "";
00652                                                         readState = 13;  //read in base pairs
00653                                                 }else if(strncmp(&buf[lineStart], "LOCUS", 5) == 0){
00654                                                         if(strncmp(&buf[lineStart+SEQ_LOCUS_CIRCULAR_COLUMN-1], "circular", 8) == 0)
00655                                                                 curFrag->SetCircular(true);
00656                                                         uint32 j = SEQ_LOCUS_NAME_LENGTH+1;
00657                                                         for(; j > 0; j--)       
00658                                                                 if((buf[lineStart+SEQ_LOCUS_NAME_COLUMN+j-1] != ' ')&&(buf[sectionStart+SEQ_LOCUS_NAME_COLUMN+j-1] != ' '))
00659                                                                         break;
00660                                                         curContigName = string(buf+lineStart+SEQ_LOCUS_NAME_COLUMN-1, j+1);
00661                                                         curFrag->SetName(curContigName);
00662                                                 }
00663                                         }
00664                                         if(ch == '\n'){
00665                                                 lineStart = i + 1;
00666                                         }
00667                                         break;
00668                                 case 1: //look for feature tag in column six.  ignore whitespace before feature.
00669                                         if((ch == ' ')||(ch == '        ')){
00670                                                 break;
00671                                         }else if(ch == '\n'){
00672                                                 lineStart = i + 1;
00673                                                 sectionStart = i + 1;
00674                                                 break;
00675                                         }else if(sectionStart == i){ //there was no whitespace, we hit a TAG instead
00676                                                 i--;
00677                                                 readState = 0; //Deal with a Header TAG
00678                                                 sectionStart = i + 1;
00679                                                 break;
00680                                         }else if((i - lineStart == SEQ_SUBTAG_COLUMN)||((buf[lineStart]=='      ')&&(i==lineStart+1))){
00681                                                 sectionStart = i;
00682                                                 readState = 2;
00683                                         } //
00684                                 case 2:  //Get the feature name.  stop on whitespace
00685                                         if((ch == ' ')||(ch == '        ')){
00686                                                 string featureName(buf+sectionStart, i - sectionStart);
00687                                                 curFeature = new gnFeature(featureName);
00688                                                 curFrag->AddFeature(curFeature);
00689                                                 sectionStart = i + 1;
00690                                                 readState = 3;
00691                                         }
00692                                         break;
00693                                 case 3:   //Ignore whitespace before feature location
00694                                         if((ch == ' ')||(ch == '        ')){
00695                                                 break;
00696                                         }else if((ch == '\r')||(ch == '\n')){
00697                                                 lineStart = i+1;
00698                                                 break;
00699                                         }
00700                                         sectionStart = i;
00701                                         readState = 4;
00702                                 // KNOWN BUG HERE!
00703                                 // if JOIN is outside a group of complemented coordinates the feature type
00704                                 // is incorrectly set to LT_COMPLEMENT.  Instead each location's type should be set to LT_COMPLEMENT 
00705                                 case 4:         //Read a location start.  stop on (<.:^ and whitespace
00706                                         if((ch == ' ')||(ch == '        ')||(ch == '(')||(ch == '.')||(ch=='^')||(ch==':')){
00707                                                 string starter(buf+sectionStart, i - sectionStart);
00708                                                 if(ch == '('){
00709                                                         if(starter == "complement")
00710                                                                 curFeature->SetLocationType(gnLocation::LT_Complement);
00711                                                         else if(starter == "order")
00712                                                                 curFeature->SetLocationType(gnLocation::LT_Order);
00713                                                         else if(starter == "group")
00714                                                                 curFeature->SetLocationType(gnLocation::LT_Group);
00715                                                         else if(starter == "one-of")
00716                                                                 curFeature->SetLocationType(gnLocation::LT_OneOf);
00717                                                         sectionStart = i + 1;   //ignore join since it is default.
00718                                                         break;
00719                                                 }else if(ch == ':'){
00720                                                         curLocContig = starter;
00721                                                         sectionStart = i + 1;
00722                                                         break;
00723                                                 }
00724                                                 curLocationStart = atoi(starter.c_str());
00725                                                 readState = 6;  //read in end base by default.
00726                                                 if(ch == '.'){
00727                                                         //go to special state to look for another one.
00728                                                         readState = 5;
00729                                                         sectionStart = i + 1;
00730                                                         break;
00731                                                 }else if(ch == '^'){
00732                                                         curBaseLocationType = gnLocation::LT_BetweenBases;
00733                                                 }else if((ch == ' ')||(ch == '  ')){
00734                                                         //no end location go to qualifier
00735                                                         gnLocation curLocation(curLocationStart, curLocationStart);
00736                                                         curFeature->AddLocation(curLocation, curFeature->GetLocationListLength());
00737                                                         readState = 7;
00738                                                 }
00739                                                 sectionStart = i + 1;
00740 
00741                                         }else if(ch == '<'){
00742                                                 curStartLength = -1;
00743                                                 sectionStart = i + 1;
00744                                         }else if(ch == '>'){
00745                                                 curStartLength = 1;
00746                                                 sectionStart = i + 1;
00747                                         }
00748                                         break;
00749                                 case 5: //look for another period or location start.
00750                                         if(ch == '.'){
00751                                                 curBaseLocationType = gnLocation::LT_Standard;
00752                                                 readState = 6;
00753                                                 sectionStart = i + 1;
00754                                                 break;
00755                                         }
00756                                         curBaseLocationType = gnLocation::LT_OneOf;
00757                                 case 6: //see if there's a second location value.  stop on >, and whitespace
00758                                         if(ch == '>'){
00759                                                 curEndLength = 1;
00760                                                 sectionStart = i + 1;
00761                                         }else if(ch == '<'){
00762                                                 curEndLength = -1;
00763                                                 sectionStart = i + 1;
00764                                         }else if((ch == ' ')||(ch == '  ')||(ch == ',')){
00765                                                 //read end location
00766                                                 string ender(buf+sectionStart, i - sectionStart);
00767                                                 gnSeqI curLocationEnd = atoi(ender.c_str());
00768                                                 gnLocation curLocation(curLocationStart, curStartLength, curLocationEnd, curEndLength, curBaseLocationType);
00769                                                 curEndLength = 0;
00770                                                 curStartLength = 0;
00771                                                 curFeature->AddLocation(curLocation, curFeature->GetLocationListLength());
00772                                                 readState = ch == ',' ? 3 : 7;  //read another loc if we need to.
00773                                                 sectionStart = i+1;
00774                                         }
00775                                         break;
00776                                 case 7:  //skip to start of qualifier
00777                                         if((ch != ' ')&&(ch != '        ')&&(lineStart == i)){
00778                                                 sectionStart = i;       // Hit a tag.  go deal with it.
00779                                                 readState = 0;
00780                                                 i--;
00781                                         }else if((ch != ' ')&&(ch != '  ')&&((lineStart == i - SEQ_SUBTAG_COLUMN)||((buf[lineStart]=='  ')&&(i==lineStart+1)))){
00782                                                 sectionStart = i;       // Hit a feature.  go deal with it.
00783                                                 readState = 2;
00784                                                 i--;
00785                                         }else if(ch == ','){  //oops!  another location to read!
00786                                                 sectionStart = i+1;
00787                                                 readState = 3;
00788                                         }else if(ch == '/'){  //finally, a qualifier.
00789                                                 sectionStart = i+1;
00790                                                 readState = 8;
00791                                         }else if(ch == '\n')
00792                                                 lineStart = i + 1;
00793                                         break;
00794                                 case 8:         //get a qualifier, stop on =
00795                                         if(ch == '='){
00796                                                 curQualifierName = string(buf+sectionStart, i - sectionStart);
00797                                                 readState = 9;
00798                                                 sectionStart = i+1;
00799                                         }else if( ch == '\r' || ch == '\n' ){
00800                                                 // this is a value-less qualifier
00801                                                 curQualifierName = string(buf+sectionStart, i - sectionStart);
00802                                                 curFeature->AddQualifier( new gnStringQualifier( curQualifierName, "" ));
00803                                                 readState = 7;
00804                                                 sectionStart = i+1;
00805                                         }
00806                                         break;
00807                                 case 9:         //are we getting a string? look for " or [
00808                                         if(ch == '"'){
00809                                                 readState = 10;
00810                                                 sectionStart = i;
00811                                                 curQualifierStart = i + streamPos;
00812                                         }else if(ch == '['){
00813                                                 readState = 11;
00814                                                 sectionStart = i;
00815                                         }else if((ch == '\r')||(ch == '\n')){
00816                                                 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, sectionStart + streamPos, i - sectionStart));
00817                                                 sectionStart = i+1;
00818                                                 readState = 7; //look for another qualifier
00819                                         }
00820                                         break;
00821                                 case 10:                //read until the end of the quotation. look out for escaped quotes
00822                                         if(ch == '"')
00823                                                 readState = 11;
00824                                         if(ch == '\n'){
00825                                                 lineStart = i + 1;
00826                                         }
00827                                         break;
00828                                 case 11:
00829                                         if(ch != '"'){
00830                                                 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, curQualifierStart, i - sectionStart));
00831                                                 sectionStart = i+1;
00832                                                 readState = 7;  //look for another qualifier.
00833                                                 if(ch == '\n')
00834                                                         lineStart = i + 1;
00835                                         }else
00836                                                 readState = 10;  //quote was escaped.  look for another.
00837                                         break;
00838                                 case 12:
00839                                         if(ch == ']'){
00840                                                 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, sectionStart + streamPos, i - sectionStart));
00841                                                 sectionStart = i+1;
00842                                                 readState = 7;  //look for another qualifier.
00843                                         }
00844                                         break;
00845                                 case 13:        //start the sequence read.
00846                                         curContig->SetSectStart(gnContigSequence, i - 1 + streamPos);
00847                                         curContig->SetRepeatSeqGap(true);
00848                                         seqChunk = 0;
00849                                         seqChunkCount = 0;
00850                                         gapChunk = m_newlineSize + 1;
00851                                         readState = 14;
00852                                         break;
00853                                 case 14:
00854                                         while(i < bufReadLen){
00855                                                 ch = buf[i];
00856                                                 if((ch == '/')&&(i==lineStart)){
00857                                                         readState = 15; //end of this sequence
00858                                                         break;
00859                                                 }else if(m_pFilter->IsValid(ch)){
00860                                                         if(gapChunk > 0){
00861                                                                 if((gapChunk > 1 && seqChunkCount > 0) ||
00862                                                                   (gapChunk != 10 + m_newlineSize && seqChunkCount == 0)){
00863                                                                         if( !corruptWarning ){
00864                                                                                 ErrorMsg("File is corrupt.  Proceed with caution.");
00865                                                                                 corruptWarning = true;
00866                                                                         }
00867                                                                         curContig->SetRepeatSeqGap(false);
00868                                                                 }
00869                                                                 gapChunk = 0;
00870                                                         }
00871                                                         seqChunk++;
00872                                                         seqLength++;
00873                                                 }else{
00874                                                         gapChunk++;
00875                                                         if(seqChunk == 10){
00876                                                                 seqChunk = 0;
00877                                                                 seqChunkCount++;
00878                                                                 if(seqChunkCount == 6){
00879                                                                         //got a complete line.  start over
00880                                                                         seqChunkCount = 0;
00881                                                                 }
00882                                                         }
00883                                                         if(ch == '\n')
00884                                                                 lineStart = i + 1;
00885                                                 }
00886                                                 i++;
00887                                         }
00888                                         break;
00889                                 case 15:
00890                                         if((ch == '\n')&&(buf[lineStart+1] == '/')){
00891                                                 curContig->SetSectEnd(gnContigSequence, lineStart - m_newlineSize + streamPos);
00892                                                 curContig->SetSeqLength(seqLength);
00893                                                 m_contigList.push_back(curContig);
00894                                                 curContig = 0;
00895                                                 curSpec->SetLength(seqLength);
00896                                                 curSpec = 0;
00897                                                 seqLength = 0;
00898                                                 lineStart = i + 1;
00899                                                 sectionStart = i + 1;
00900                                                 readState = 0;
00901                                         }
00902                                         break;
00903                         }
00904                 }
00905                 streamPos += bufReadLen;
00906         }
00907         if(curContig != 0){
00908                 curContig->SetSectEnd(gnContigSequence, streamPos - 1);
00909                 curContig->SetSeqLength(seqLength);
00910                 m_contigList.push_back(curContig);
00911                 curSpec->SetLength(seqLength);
00912         }
00913         if(curSpec != 0)
00914                 if((curFrag->GetFeatureListLength() == 0) && (curFrag->GetHeaderListLength() == 0)
00915                         &&(curSpec->GetLength() == 0)){
00916                         m_spec->RemoveSpec(m_spec->GetSpecListLength() - 1);
00917                         delete curFrag;
00918                 }
00919         m_ifstream.clear();
00920         return true;
00921 }

Generated on Mon Mar 28 06:00:21 2005 for libGenome by doxygen 1.3.6