00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00012 #include "gn/gnFilter.h"
00013 #include "gn/gnFeature.h"
00014 #include "gn/gnGBKSource.h"
00015 #include "gn/gnSourceSpec.h"
00016 #include "gn/gnSourceHeader.h"
00017 #include "gn/gnSourceQualifier.h"
00018 #include "gn/gnLocation.h"
00019 #include "gn/gnStringTools.h"
00020 #include "gn/gnDebug.h"
00021 #include "gn/gnStringQualifier.h"
00022 #include <string>
00023
00024 gnGBKSource::gnGBKSource()
00025 {
00026 m_openString = "";
00027 m_pFilter = gnFilter::proteinSeqFilter();
00028 if(m_pFilter == NULL){
00029 DebugMsg("Error using static sequence filters.");
00030 }
00031 }
00032 gnGBKSource::gnGBKSource( const gnGBKSource& s ) : gnFileSource(s)
00033 {
00034 vector< gnFileContig* >::const_iterator iter = s.m_contigList.begin();
00035 for( ; iter != s.m_contigList.end(); ++iter )
00036 {
00037 m_contigList.push_back( (*iter)->Clone() );
00038 }
00039 }
00040 gnGBKSource::~gnGBKSource()
00041 {
00042 m_ifstream.close();
00043 vector< gnFileContig* >::iterator iter = m_contigList.begin();
00044 for( ; iter != m_contigList.end(); ++iter )
00045 {
00046 gnFileContig* fg = *iter;
00047 *iter = 0;
00048 delete fg;
00049 }
00050 }
00051 boolean gnGBKSource::HasContig( const string& name ) const
00052 {
00053 for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00054 {
00055 if( name == m_contigList[i]->GetName() )
00056 return true;
00057 }
00058 return false;
00059 }
00060 uint32 gnGBKSource::GetContigID( const string& name ) const
00061 {
00062 for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00063 {
00064 if( name == m_contigList[i]->GetName() )
00065 return i;
00066 }
00067 return ALL_CONTIGS;
00068 }
00069 string gnGBKSource::GetContigName( const uint32 i ) const
00070 {
00071 if( i < m_contigList.size() )
00072 {
00073 return m_contigList[i]->GetName();
00074 }
00075 return "";
00076 }
00077 gnSeqI gnGBKSource::GetContigSeqLength( const uint32 i ) const
00078 {
00079 if( i == ALL_CONTIGS)
00080 return m_spec->GetLength();
00081 if( i < m_contigList.size() )
00082 {
00083 return m_contigList[i]->GetSeqLength();
00084 }
00085 return GNSEQI_ERROR;
00086 }
00087
00088 boolean gnGBKSource::SeqRead( const gnSeqI start, char* buf, gnSeqI& bufLen, const uint32 contigI ){
00089 uint64 startPos = 0;
00090 uint64 readableBytes = 0;
00091 if( !SeqSeek( start, contigI, startPos, readableBytes ) )
00092 {
00093 bufLen = 0;
00094 return false;
00095 }
00096
00097 if( contigI == ALL_CONTIGS )
00098 {
00099 uint32 curLen = 0;
00100 uint64 bytesRead = 0;
00101 while (curLen < bufLen)
00102 {
00103
00104 if(readableBytes <= 0)
00105 if( !SeqSeek( start + curLen, contigI, startPos, readableBytes ) ){
00106 bufLen = curLen;
00107 return true;
00108 }
00109
00110 uint64 readLen = (bufLen - curLen) < readableBytes ? (bufLen - curLen) : readableBytes;
00111 Array<gnSeqC> array_buf( readLen );
00112 gnSeqC* tmpBuf = array_buf.data;
00113
00114
00115 m_ifstream.read(tmpBuf, readLen);
00116 uint64 gc = m_ifstream.gcount();
00117 bytesRead += gc;
00118 readableBytes -= gc;
00119 for(uint32 i=0; i < gc; i++){
00120 if( m_pFilter->IsValid(tmpBuf[i]) ){
00121 buf[curLen] = tmpBuf[i];
00122 curLen++;
00123 }
00124 }
00125 if(m_ifstream.eof()){
00126 m_ifstream.clear();
00127 bufLen = curLen;
00128 return true;
00129 }
00130 }
00131 bufLen = curLen;
00132 }
00133 else if( contigI < m_contigList.size() )
00134 {
00135 uint32 curLen = 0;
00136
00137 gnSeqI contigSize = m_contigList[contigI]->GetSeqLength();
00138 bufLen = bufLen < contigSize ? bufLen : contigSize;
00139 while (curLen < bufLen)
00140 {
00141 uint64 readLen = bufLen - curLen;
00142 Array<gnSeqC> array_buf( readLen );
00143 gnSeqC* tmpBuf = array_buf.data;
00144
00145
00146 m_ifstream.read(tmpBuf, readLen);
00147 uint64 gc = m_ifstream.gcount();
00148
00149
00150 for(uint32 i=0; i < gc; i++){
00151 if( m_pFilter->IsValid(tmpBuf[i]) ){
00152 buf[curLen] = tmpBuf[i];
00153 curLen++;
00154 }
00155 }
00156 if(m_ifstream.eof()){
00157 m_ifstream.clear();
00158 bufLen = curLen;
00159 return true;
00160 }
00161 }
00162 bufLen = curLen;
00163 }
00164 return true;
00165
00166 }
00167
00168
00169
00170
00171 boolean gnGBKSource::SeqSeek( const gnSeqI start, const uint32& contigI, uint64& startPos, uint64& readableBytes )
00172 {
00173 if( contigI == ALL_CONTIGS )
00174 {
00175
00176 gnSeqI curIndex = 0;
00177 vector< gnFileContig* >::iterator iter = m_contigList.begin();
00178 for( ; iter != m_contigList.end(); ++iter )
00179 {
00180 uint64 len = (*iter)->GetSeqLength();
00181 if( (curIndex + len) > start )
00182 break;
00183 curIndex += len;
00184 }
00185 if( iter == m_contigList.end() )
00186 return false;
00187
00188 gnSeqI startIndex = start - curIndex;
00189 return SeqStartPos( startIndex, *(*iter), startPos, readableBytes );
00190 }
00191 else if( contigI < m_contigList.size() )
00192 {
00193 return SeqStartPos( start, *(m_contigList[contigI]), startPos, readableBytes );
00194 }
00195 return false;
00196 }
00197
00198 boolean gnGBKSource::SeqStartPos( const gnSeqI start, gnFileContig& contig, uint64& startPos, uint64& readableBytes )
00199 {
00200 readableBytes = 0;
00201 uint32 curLen = 0;
00202
00203 startPos = contig.GetSectStartEnd(gnContigSequence).first;
00204 m_ifstream.seekg( startPos, ios::beg );
00205 if( m_ifstream.eof() ){
00206 ErrorMsg("ERROR in gnGBKSource::Incorrect contig start position, End of file reached!\n");
00207 return false;
00208 }
00209 while( true )
00210 {
00211
00212
00213 uint32 tmpbufsize = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00214 if(tmpbufsize == 0){
00215 ErrorMsg("ERROR in gnGBKSource: stored contig size is incorrect.");
00216 return false;
00217 }
00218 uint64 startOffset = start;
00219 if(contig.HasRepeatSeqGap()){
00220 startOffset += (9 + m_newlineSize) * (start / 60 + 1) + start / 10 + 1;
00221 if( m_newlineSize == 2 )
00222 startOffset--;
00223 startPos+=startOffset;
00224 m_ifstream.seekg(startPos , ios::beg);
00225 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00226 return true;
00227 }
00228
00229
00230 tmpbufsize = tmpbufsize < BUFFER_SIZE ? tmpbufsize : BUFFER_SIZE;
00231 Array<char> array_buf( tmpbufsize );
00232 char* tmpbuf = array_buf.data;
00233
00234 m_ifstream.read( tmpbuf, tmpbufsize );
00235 if( m_ifstream.eof() ){
00236 ErrorMsg("ERROR in gnGBKSource::Read End of file reached!\n");
00237 return false;
00238 }
00239 for( uint32 i=0; i < tmpbufsize; ++i ){
00240 if( m_pFilter->IsValid(tmpbuf[i]) ){
00241 if( curLen >= start ){
00242 startPos += i;
00243 m_ifstream.seekg( startPos, ios::beg );
00244 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00245 return true;
00246 }
00247 ++curLen;
00248 }
00249 }
00250 startPos += tmpbufsize;
00251 }
00252 return true;
00253 }
00254
00255 void gnGBKSource::FormatString(string& data, uint32 offset, uint32 width){
00256
00257 string::size_type newline_loc = data.find_first_of('\n', 0);
00258 while(newline_loc != string::npos){
00259 if(data[newline_loc-1] == '\r')
00260 newline_loc--;
00261 string::size_type text_loc = newline_loc;
00262 while((data[text_loc] == ' ') ||(data[text_loc] == ' ')||(data[text_loc] == '\n')||(data[text_loc] == '\r')){
00263 text_loc++;
00264 if(text_loc+1 == data.length())
00265 break;
00266 }
00267 data = (data.substr(0, newline_loc) + " " + data.substr(text_loc));
00268 newline_loc = data.find_first_of('\n', 0);
00269 }
00270
00271 string output_string = "";
00272 for(uint32 charI = 0; charI < data.length();){
00273
00274 string::size_type base_loc = charI;
00275 string append_string;
00276 while(base_loc - charI <= width){
00277 string::size_type space_loc = data.find_first_of(' ', base_loc+1);
00278 if(space_loc - charI < width)
00279 base_loc = space_loc;
00280 else if(base_loc == charI){
00281
00282 append_string = data.substr(charI, width);
00283 charI+=width;
00284 }else{
00285 append_string = data.substr(charI, base_loc - charI);
00286 charI = base_loc;
00287 }
00288 }
00289 output_string += string(offset, ' ') + append_string;
00290 if(charI + width < data.length())
00291 output_string += "\r\n";
00292 }
00293 data = output_string;
00294 }
00295
00296 template< class SubSpec >
00297 void WriteHeader(gnMultiSpec< SubSpec >* spec, const string& hdr, ofstream& m_ofstream) {
00298 gnBaseHeader* gpbh = NULL;
00299 uint32 header_index = 0;
00300 try{
00301 do{
00302 gpbh = spec->GetHeader(hdr, header_index);
00303 m_ofstream << gpbh->GetHeader();
00304 header_index++;
00305 }while(gpbh != NULL);
00306 }catch(gnException& gne){}
00307 }
00308
00309 boolean gnGBKSource::Write(gnSequence& seq, const string& filename){
00310 ofstream m_ofstream(filename.c_str(), ios::out | ios::binary);
00311 if(!m_ofstream.is_open())
00312 return false;
00313
00314 string newline = "\r\n";
00315 gnGenomeSpec* spec = seq.GetSpec();
00316
00317
00318 if(spec->GetHeaderListLength() == 1){
00319 gnBaseHeader *gpbh = spec->GetHeader(0);
00320 string name = gpbh->GetHeaderName();
00321
00322 if(string::npos != name.find(".SEQ")){
00323 string header = gpbh->GetHeader();
00324 m_ofstream << header;
00325 }
00326 }
00327
00328
00329
00330 Array<gnSeqC> array_buf( 2 * BUFFER_SIZE );
00331 gnSeqC *bases = array_buf.data;
00332
00333 for(uint32 specI = 0; specI < spec->GetSpecListLength(); specI++){
00334 gnFragmentSpec* subSpec = spec->GetSpec(specI);
00335
00336
00337 m_ofstream << "LOCUS ";
00338
00339 string contigName = subSpec->GetName();
00340 if(contigName.length() > SEQ_LOCUS_NAME_LENGTH)
00341 contigName = contigName.substr(0, SEQ_LOCUS_NAME_LENGTH);
00342 uint32 filler_size = SEQ_LOCUS_NAME_LENGTH - contigName.length();
00343 m_ofstream << contigName << string(filler_size, ' ');
00344
00345 string length_string = uintToString(subSpec->GetLength());
00346 filler_size = SEQ_LOCUS_SIZE_LENGTH - length_string.size();
00347 m_ofstream << string(filler_size, ' ') << length_string << " bp ";
00348
00349 string dnatype = string(SEQ_LOCUS_DNATYPE_LENGTH, ' ');
00350 uint32 head_look_i = 0;
00351 gnBaseHeader* gpbh = NULL;
00352 try{
00353 gpbh = subSpec->GetHeader("LOCUS", head_look_i);
00354 }catch(gnException& gne){}
00355 if( gpbh != NULL )
00356 dnatype = gpbh->GetHeader().substr(SEQ_LOCUS_DNATYPE_OFFSET, SEQ_LOCUS_DNATYPE_LENGTH);
00357 m_ofstream << dnatype << string(2, ' ');
00358
00359 string circular = subSpec->IsCircular() ? string("circular ") : string(10, ' ');
00360 m_ofstream << circular;
00361
00362 string division = string(SEQ_LOCUS_DIVCODE_LENGTH, ' ');
00363 if(gpbh != NULL)
00364 division = gpbh->GetHeader().substr(SEQ_LOCUS_DIVCODE_OFFSET, SEQ_LOCUS_DIVCODE_LENGTH);
00365 m_ofstream << division;
00366
00367 string date = string(SEQ_LOCUS_DATE_LENGTH, ' ');
00368 if(gpbh != NULL)
00369 date = gpbh->GetHeader().substr(SEQ_LOCUS_DATE_OFFSET, SEQ_LOCUS_DATE_LENGTH);
00370 m_ofstream << string(7, ' ') << date << "\r\n";
00371
00372
00373 WriteHeader(subSpec, "DEFINITION", m_ofstream);
00374 WriteHeader(subSpec, "ACCESSION", m_ofstream);
00375 WriteHeader(subSpec, "VERSION", m_ofstream);
00376 WriteHeader(subSpec, "KEYWORDS", m_ofstream);
00377 WriteHeader(subSpec, "SEGMENT", m_ofstream);
00378 WriteHeader(subSpec, "SOURCE", m_ofstream);
00379 WriteHeader(subSpec, "REFERENCE", m_ofstream);
00380 WriteHeader(subSpec, "COMMENT", m_ofstream);
00381
00382
00383 m_ofstream << "FEATURES Location/Qualifiers" << "\r\n";
00384 for(uint32 featureI = 0; featureI < subSpec->GetFeatureListLength(); featureI++){
00385
00386 gnBaseFeature *gpmf = subSpec->GetFeature(featureI);
00387 string featureName = gpmf->GetName();
00388 m_ofstream << string(SEQ_SUBTAG_COLUMN, ' ') << featureName;
00389 m_ofstream << string(SEQ_FEATURE_LOC_OFFSET - featureName.length() - SEQ_SUBTAG_COLUMN, ' ');
00390
00391 uint32 location_count = gpmf->GetLocationListLength();
00392 uint32 line_pos = SEQ_FEATURE_LOC_OFFSET;
00393 uint32 parenthesis_count = 0;
00394 if(location_count > 1){
00395 m_ofstream << "join(";
00396 line_pos += 5;
00397 parenthesis_count++;
00398 }
00399 gnLocation::gnLocationType loc_type = gpmf->GetLocationType();
00400 switch(loc_type){
00401 case gnLocation::LT_Standard:
00402 break;
00403 case gnLocation::LT_Complement:
00404 m_ofstream << "complement(";
00405 line_pos += 11;
00406 parenthesis_count++;
00407 break;
00408 case gnLocation::LT_Order:
00409 m_ofstream << "order(";
00410 line_pos += 6;
00411 parenthesis_count++;
00412 break;
00413 case gnLocation::LT_Group:
00414 m_ofstream << "group(";
00415 parenthesis_count++;
00416 line_pos += 6;
00417 break;
00418 case gnLocation::LT_OneOf:
00419 m_ofstream << "one-of(";
00420 parenthesis_count++;
00421 line_pos += 7;
00422 break;
00423 default:
00424 break;
00425 }
00426
00427 string location;
00428 for(uint32 locationI = 0; locationI < location_count; locationI++){
00429 gnLocation gpl = gpmf->GetLocation(locationI);
00430 if(gpl.IsStartBoundLonger())
00431 location += ">";
00432 if(gpl.IsStartBoundShorter())
00433 location += "<";
00434 location += uintToString(gpl.GetStart());
00435 gnSeqI end_loc = gpl.GetEnd();
00436 if(end_loc != 0){
00437 switch(gpl.GetType()){
00438 case gnLocation::LT_BetweenBases:
00439 location += "^";
00440 break;
00441 case gnLocation::LT_OneOf:
00442 location += ".";
00443 break;
00444 default:
00445 location += "..";
00446 break;
00447 }
00448 if(gpl.IsEndBoundShorter())
00449 location += "<";
00450 if(gpl.IsEndBoundLonger())
00451 location += ">";
00452 location+= uintToString(end_loc);
00453 }
00454 if(locationI +1 < location_count)
00455 location += ",";
00456 else{
00457 for(;parenthesis_count > 0; parenthesis_count--)
00458 location += ")";
00459 }
00460
00461 if(line_pos + location.length() < SEQ_COLUMN_WIDTH){
00462 m_ofstream << location;
00463 line_pos += location.length();
00464 }else{
00465 m_ofstream << "\r\n" << string(SEQ_FEATURE_LOC_OFFSET, ' ') << location;
00466 line_pos = SEQ_FEATURE_LOC_OFFSET + location.length();
00467 }
00468 location = "";
00469 }
00470 m_ofstream << "\r\n";
00471
00472
00473
00474 uint32 qualifier_count = gpmf->GetQualifierListLength();
00475 for(uint32 qualifierI = 0; qualifierI < qualifier_count; qualifierI++){
00476 m_ofstream << string(SEQ_FEATURE_LOC_OFFSET, ' ');
00477 gnBaseQualifier* qualifier = gpmf->GetQualifier(qualifierI);
00478 m_ofstream << "/" << qualifier->GetName() << "=";
00479
00480 string qually = string(qualifier->GetValue());
00481
00482
00483 m_ofstream << qually << "\r\n";
00484 }
00485 if(gpmf != NULL)
00486 delete gpmf;
00487 }
00488
00489
00490 gnSeqI readOffset = seq.contigStart(specI);
00491 gnSeqI readLength = seq.contigLength(specI);
00492
00493
00494 m_ofstream << "BASE COUNT ";
00495 gnSeqI a_count=0, c_count=0, g_count=0, t_count=0, other_count=0;
00496 gnSeqI countLen = readLength + readOffset;
00497 for(gnSeqI countI = readOffset; countI < countLen;){
00498 gnSeqI writeLen = countLen - countI < BUFFER_SIZE ? countLen - countI : BUFFER_SIZE;
00499 if(!seq.ToArray(bases, writeLen, countI))
00500 return false;
00501 gnSeqI a, c, g, t, other;
00502 BaseCount(string(bases, writeLen), a, c, g, t, other);
00503 a_count += a;
00504 c_count += c;
00505 g_count += g;
00506 t_count += t;
00507 other_count += other;
00508 countI += writeLen;
00509 }
00510 m_ofstream << uintToString(a_count) << " a ";
00511 m_ofstream << uintToString(c_count) << " c ";
00512 m_ofstream << uintToString(g_count) << " g ";
00513 m_ofstream << uintToString(t_count) << " t ";
00514 m_ofstream << uintToString(other_count) << " others" << "\r\n";
00515
00516 string origin = "ORIGIN\r\n";
00517 head_look_i = 0;
00518 try{
00519 gpbh = subSpec->GetHeader("ORIGIN", head_look_i);
00520 origin = gpbh->GetHeader();
00521 m_ofstream << origin;
00522 }catch(gnException& gne){
00523 m_ofstream << "ORIGIN" << endl;
00524 }
00525
00526 gnSeqI contig_bases = 0;
00527 while(readLength > 0){
00528 gnSeqI writeLen = readLength < BUFFER_SIZE + 20 ? readLength : BUFFER_SIZE + 20;
00529 boolean success = seq.ToArray(bases, writeLen, readOffset);
00530 if(!success)
00531 return false;
00532
00533 for(gnSeqI curbaseI = 0; curbaseI < writeLen; curbaseI += 60){
00534 string baseIndexStr = uintToString(contig_bases + curbaseI +1);
00535 m_ofstream << string(SEQ_BASES_INDEX_END - baseIndexStr.length(), ' ');
00536 m_ofstream << baseIndexStr;
00537 for(gnSeqI base_offset = 0; base_offset <= 50; base_offset+=10){
00538 if(writeLen <= curbaseI + base_offset)
00539 break;
00540 int64 print_length = writeLen - (curbaseI + base_offset);
00541 print_length = print_length > 10 ? 10 : print_length;
00542 m_ofstream << ' ' << string(bases + curbaseI + base_offset, print_length);
00543 }
00544 m_ofstream << "\r\n";
00545 }
00546 readLength -= writeLen;
00547 readOffset += writeLen;
00548 contig_bases += writeLen;
00549 }
00550 m_ofstream << "//\r\n";
00551 }
00552
00553 m_ofstream.close();
00554 return true;
00555 }
00556
00557 gnFileContig* gnGBKSource::GetFileContig( const uint32 contigI ) const{
00558 if(m_contigList.size() > contigI)
00559 return m_contigList[contigI];
00560 return NULL;
00561 }
00562
00563
00564 boolean gnGBKSource::ParseStream( istream& fin )
00565 {
00566
00567 uint32 readState = 0;
00568 uint32 lineStart = 0;
00569
00570 uint32 sectionStart = 0;
00571 uint64 streamPos = 0;
00572 uint64 bufReadLen = 0;
00573 uint64 remainingBuffer = 0;
00574 Array<char> array_buf( BUFFER_SIZE );
00575 char* buf = array_buf.data;
00576 gnFragmentSpec* curFrag = 0;
00577 gnSourceSpec* curSpec = 0;
00578 gnSourceHeader *curHeader;
00579 gnFeature* curFeature;
00580 gnFileContig* curContig = 0;
00581 gnLocation::gnLocationType curBaseLocationType;
00582 gnSeqI curLocationStart;
00583 int32 curStartLength = 0;
00584 int32 curEndLength = 0;
00585 string curLocContig = "";
00586 string curQualifierName;
00587 uint64 curQualifierStart;
00588 string curContigName = "";
00589 gnSeqI seqLength = 0;
00590 gnSeqI seqChunk, seqChunkCount, gapChunk;
00591 boolean corruptWarning = false;
00592
00593
00594 DetermineNewlineType();
00595
00596 m_spec = new gnGenomeSpec();
00597 while( !fin.eof() )
00598 {
00599 if(sectionStart > 0){
00600 if(readState == 14)
00601 sectionStart = lineStart;
00602 remainingBuffer = bufReadLen - sectionStart;
00603 memmove(buf, buf+sectionStart, remainingBuffer);
00604 }
00605
00606 fin.read( buf + remainingBuffer, BUFFER_SIZE - remainingBuffer);
00607 streamPos -= remainingBuffer;
00608 lineStart -= sectionStart;
00609 sectionStart = 0;
00610 bufReadLen = fin.gcount();
00611 bufReadLen += remainingBuffer;
00612
00613 for( uint32 i=remainingBuffer ; i < bufReadLen ; i++ )
00614 {
00615 char ch = buf[i];
00616 switch( readState )
00617 {
00618 case 0:
00619
00620 if((ch == '\n')&&(buf[lineStart] != ' ')&&(buf[lineStart] != ' ')){
00621 if(curSpec == NULL){
00622 curSpec = new gnSourceSpec(this, m_spec->GetSpecListLength());
00623 curFrag = new gnFragmentSpec();
00624 curFrag->AddSpec(curSpec);
00625 curSpec->SetSourceName(m_openString);
00626 m_spec->AddSpec(curFrag);
00627 }
00628 if(lineStart != sectionStart){
00629 uint32 j = SEQ_HEADER_NAME_LENGTH-1;
00630 for(; j > 0; j--)
00631 if((buf[sectionStart+j] != ' ')&&(buf[sectionStart+j] != ' '))
00632 break;
00633 string header_name = string(buf+sectionStart, j+1);
00634 curHeader = new gnSourceHeader(this, header_name, sectionStart + streamPos, lineStart - sectionStart);
00635
00636 if(strncmp(&buf[lineStart], "LOCUS", 5) == 0)
00637 m_spec->AddHeader(curHeader);
00638 else
00639 curFrag->AddHeader(curHeader);
00640 sectionStart = lineStart;
00641 }
00642
00643 if(strncmp(&buf[lineStart], "FEATURES", 8) == 0){
00644 sectionStart = i + 1;
00645 readState = 1;
00646 }else if(strncmp(&buf[lineStart], "ORIGIN", 6) == 0){
00647 curHeader = new gnSourceHeader(this, string("ORIGIN"), sectionStart + streamPos, i - sectionStart + 1);
00648 curFrag->AddHeader(curHeader);
00649 curContig = new gnFileContig();
00650 curContig->SetName(curContigName);
00651 curContigName = "";
00652 readState = 13;
00653 }else if(strncmp(&buf[lineStart], "LOCUS", 5) == 0){
00654 if(strncmp(&buf[lineStart+SEQ_LOCUS_CIRCULAR_COLUMN-1], "circular", 8) == 0)
00655 curFrag->SetCircular(true);
00656 uint32 j = SEQ_LOCUS_NAME_LENGTH+1;
00657 for(; j > 0; j--)
00658 if((buf[lineStart+SEQ_LOCUS_NAME_COLUMN+j-1] != ' ')&&(buf[sectionStart+SEQ_LOCUS_NAME_COLUMN+j-1] != ' '))
00659 break;
00660 curContigName = string(buf+lineStart+SEQ_LOCUS_NAME_COLUMN-1, j+1);
00661 curFrag->SetName(curContigName);
00662 }
00663 }
00664 if(ch == '\n'){
00665 lineStart = i + 1;
00666 }
00667 break;
00668 case 1:
00669 if((ch == ' ')||(ch == ' ')){
00670 break;
00671 }else if(ch == '\n'){
00672 lineStart = i + 1;
00673 sectionStart = i + 1;
00674 break;
00675 }else if(sectionStart == i){
00676 i--;
00677 readState = 0;
00678 sectionStart = i + 1;
00679 break;
00680 }else if((i - lineStart == SEQ_SUBTAG_COLUMN)||((buf[lineStart]==' ')&&(i==lineStart+1))){
00681 sectionStart = i;
00682 readState = 2;
00683 }
00684 case 2:
00685 if((ch == ' ')||(ch == ' ')){
00686 string featureName(buf+sectionStart, i - sectionStart);
00687 curFeature = new gnFeature(featureName);
00688 curFrag->AddFeature(curFeature);
00689 sectionStart = i + 1;
00690 readState = 3;
00691 }
00692 break;
00693 case 3:
00694 if((ch == ' ')||(ch == ' ')){
00695 break;
00696 }else if((ch == '\r')||(ch == '\n')){
00697 lineStart = i+1;
00698 break;
00699 }
00700 sectionStart = i;
00701 readState = 4;
00702
00703
00704
00705 case 4:
00706 if((ch == ' ')||(ch == ' ')||(ch == '(')||(ch == '.')||(ch=='^')||(ch==':')){
00707 string starter(buf+sectionStart, i - sectionStart);
00708 if(ch == '('){
00709 if(starter == "complement")
00710 curFeature->SetLocationType(gnLocation::LT_Complement);
00711 else if(starter == "order")
00712 curFeature->SetLocationType(gnLocation::LT_Order);
00713 else if(starter == "group")
00714 curFeature->SetLocationType(gnLocation::LT_Group);
00715 else if(starter == "one-of")
00716 curFeature->SetLocationType(gnLocation::LT_OneOf);
00717 sectionStart = i + 1;
00718 break;
00719 }else if(ch == ':'){
00720 curLocContig = starter;
00721 sectionStart = i + 1;
00722 break;
00723 }
00724 curLocationStart = atoi(starter.c_str());
00725 readState = 6;
00726 if(ch == '.'){
00727
00728 readState = 5;
00729 sectionStart = i + 1;
00730 break;
00731 }else if(ch == '^'){
00732 curBaseLocationType = gnLocation::LT_BetweenBases;
00733 }else if((ch == ' ')||(ch == ' ')){
00734
00735 gnLocation curLocation(curLocationStart, curLocationStart);
00736 curFeature->AddLocation(curLocation, curFeature->GetLocationListLength());
00737 readState = 7;
00738 }
00739 sectionStart = i + 1;
00740
00741 }else if(ch == '<'){
00742 curStartLength = -1;
00743 sectionStart = i + 1;
00744 }else if(ch == '>'){
00745 curStartLength = 1;
00746 sectionStart = i + 1;
00747 }
00748 break;
00749 case 5:
00750 if(ch == '.'){
00751 curBaseLocationType = gnLocation::LT_Standard;
00752 readState = 6;
00753 sectionStart = i + 1;
00754 break;
00755 }
00756 curBaseLocationType = gnLocation::LT_OneOf;
00757 case 6:
00758 if(ch == '>'){
00759 curEndLength = 1;
00760 sectionStart = i + 1;
00761 }else if(ch == '<'){
00762 curEndLength = -1;
00763 sectionStart = i + 1;
00764 }else if((ch == ' ')||(ch == ' ')||(ch == ',')){
00765
00766 string ender(buf+sectionStart, i - sectionStart);
00767 gnSeqI curLocationEnd = atoi(ender.c_str());
00768 gnLocation curLocation(curLocationStart, curStartLength, curLocationEnd, curEndLength, curBaseLocationType);
00769 curEndLength = 0;
00770 curStartLength = 0;
00771 curFeature->AddLocation(curLocation, curFeature->GetLocationListLength());
00772 readState = ch == ',' ? 3 : 7;
00773 sectionStart = i+1;
00774 }
00775 break;
00776 case 7:
00777 if((ch != ' ')&&(ch != ' ')&&(lineStart == i)){
00778 sectionStart = i;
00779 readState = 0;
00780 i--;
00781 }else if((ch != ' ')&&(ch != ' ')&&((lineStart == i - SEQ_SUBTAG_COLUMN)||((buf[lineStart]==' ')&&(i==lineStart+1)))){
00782 sectionStart = i;
00783 readState = 2;
00784 i--;
00785 }else if(ch == ','){
00786 sectionStart = i+1;
00787 readState = 3;
00788 }else if(ch == '/'){
00789 sectionStart = i+1;
00790 readState = 8;
00791 }else if(ch == '\n')
00792 lineStart = i + 1;
00793 break;
00794 case 8:
00795 if(ch == '='){
00796 curQualifierName = string(buf+sectionStart, i - sectionStart);
00797 readState = 9;
00798 sectionStart = i+1;
00799 }else if( ch == '\r' || ch == '\n' ){
00800
00801 curQualifierName = string(buf+sectionStart, i - sectionStart);
00802 curFeature->AddQualifier( new gnStringQualifier( curQualifierName, "" ));
00803 readState = 7;
00804 sectionStart = i+1;
00805 }
00806 break;
00807 case 9:
00808 if(ch == '"'){
00809 readState = 10;
00810 sectionStart = i;
00811 curQualifierStart = i + streamPos;
00812 }else if(ch == '['){
00813 readState = 11;
00814 sectionStart = i;
00815 }else if((ch == '\r')||(ch == '\n')){
00816 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, sectionStart + streamPos, i - sectionStart));
00817 sectionStart = i+1;
00818 readState = 7;
00819 }
00820 break;
00821 case 10:
00822 if(ch == '"')
00823 readState = 11;
00824 if(ch == '\n'){
00825 lineStart = i + 1;
00826 }
00827 break;
00828 case 11:
00829 if(ch != '"'){
00830 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, curQualifierStart, i - sectionStart));
00831 sectionStart = i+1;
00832 readState = 7;
00833 if(ch == '\n')
00834 lineStart = i + 1;
00835 }else
00836 readState = 10;
00837 break;
00838 case 12:
00839 if(ch == ']'){
00840 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, sectionStart + streamPos, i - sectionStart));
00841 sectionStart = i+1;
00842 readState = 7;
00843 }
00844 break;
00845 case 13:
00846 curContig->SetSectStart(gnContigSequence, i - 1 + streamPos);
00847 curContig->SetRepeatSeqGap(true);
00848 seqChunk = 0;
00849 seqChunkCount = 0;
00850 gapChunk = m_newlineSize + 1;
00851 readState = 14;
00852 break;
00853 case 14:
00854 while(i < bufReadLen){
00855 ch = buf[i];
00856 if((ch == '/')&&(i==lineStart)){
00857 readState = 15;
00858 break;
00859 }else if(m_pFilter->IsValid(ch)){
00860 if(gapChunk > 0){
00861 if((gapChunk > 1 && seqChunkCount > 0) ||
00862 (gapChunk != 10 + m_newlineSize && seqChunkCount == 0)){
00863 if( !corruptWarning ){
00864 ErrorMsg("File is corrupt. Proceed with caution.");
00865 corruptWarning = true;
00866 }
00867 curContig->SetRepeatSeqGap(false);
00868 }
00869 gapChunk = 0;
00870 }
00871 seqChunk++;
00872 seqLength++;
00873 }else{
00874 gapChunk++;
00875 if(seqChunk == 10){
00876 seqChunk = 0;
00877 seqChunkCount++;
00878 if(seqChunkCount == 6){
00879
00880 seqChunkCount = 0;
00881 }
00882 }
00883 if(ch == '\n')
00884 lineStart = i + 1;
00885 }
00886 i++;
00887 }
00888 break;
00889 case 15:
00890 if((ch == '\n')&&(buf[lineStart+1] == '/')){
00891 curContig->SetSectEnd(gnContigSequence, lineStart - m_newlineSize + streamPos);
00892 curContig->SetSeqLength(seqLength);
00893 m_contigList.push_back(curContig);
00894 curContig = 0;
00895 curSpec->SetLength(seqLength);
00896 curSpec = 0;
00897 seqLength = 0;
00898 lineStart = i + 1;
00899 sectionStart = i + 1;
00900 readState = 0;
00901 }
00902 break;
00903 }
00904 }
00905 streamPos += bufReadLen;
00906 }
00907 if(curContig != 0){
00908 curContig->SetSectEnd(gnContigSequence, streamPos - 1);
00909 curContig->SetSeqLength(seqLength);
00910 m_contigList.push_back(curContig);
00911 curSpec->SetLength(seqLength);
00912 }
00913 if(curSpec != 0)
00914 if((curFrag->GetFeatureListLength() == 0) && (curFrag->GetHeaderListLength() == 0)
00915 &&(curSpec->GetLength() == 0)){
00916 m_spec->RemoveSpec(m_spec->GetSpecListLength() - 1);
00917 delete curFrag;
00918 }
00919 m_ifstream.clear();
00920 return true;
00921 }