00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00012 #include "gn/gnFASSource.h"
00013 #include "gn/gnBaseSpec.h"
00014 #include "gn/gnFilter.h"
00015 #include "gn/gnStringTools.h"
00016 #include "gn/gnSourceSpec.h"
00017 #include "gn/gnSourceHeader.h"
00018 #include "gn/gnDebug.h"
00019
00020 gnFASSource::gnFASSource()
00021 {
00022 m_openString = "";
00023 m_pFilter = gnFilter::proteinSeqFilter();
00024 if(m_pFilter == NULL){
00025 DebugMsg("Error using static sequence filters.");
00026 }
00027 }
00028 gnFASSource::gnFASSource( const gnFASSource& s ) : gnFileSource(s)
00029 {
00030 vector< gnFileContig* >::const_iterator iter = s.m_contigList.begin();
00031 for( ; iter != s.m_contigList.end(); ++iter )
00032 {
00033 m_contigList.push_back( (*iter)->Clone() );
00034 }
00035 }
00036 gnFASSource::~gnFASSource()
00037 {
00038 m_ifstream.close();
00039 vector< gnFileContig* >::iterator iter = m_contigList.begin();
00040 for( ; iter != m_contigList.end(); ++iter )
00041 {
00042 gnFileContig* fg = *iter;
00043 *iter = 0;
00044 delete fg;
00045 }
00046 }
00047 boolean gnFASSource::HasContig( const string& nameStr ) const
00048 {
00049 for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00050 {
00051 if( nameStr == m_contigList[i]->GetName() )
00052 return true;
00053 }
00054 return false;
00055 }
00056 uint32 gnFASSource::GetContigID( const string& name ) const
00057 {
00058 for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00059 {
00060 if( name == m_contigList[i]->GetName() )
00061 return i;
00062 }
00063 return ALL_CONTIGS;
00064 }
00065 string gnFASSource::GetContigName( const uint32 i ) const
00066 {
00067 if( i < m_contigList.size() ){
00068 return m_contigList[i]->GetName();
00069 }
00070 return "";
00071 }
00072 gnSeqI gnFASSource::GetContigSeqLength( const uint32 i ) const
00073 {
00074 if( i < m_contigList.size() ){
00075 return m_contigList[i]->GetSeqLength();
00076 }else if( i == ALL_CONTIGS){
00077 gnSeqI seqlen = 0;
00078 for(uint32 j=0; j < m_contigList.size(); j++)
00079 seqlen += m_contigList[j]->GetSeqLength();
00080 return seqlen;
00081 }
00082 return GNSEQI_ERROR;
00083 }
00084 gnFileContig* gnFASSource::GetContig( const uint32 i ) const
00085 {
00086 if( i < m_contigList.size() ){
00087 return m_contigList[i];
00088 }
00089 return 0;
00090 }
00091
00092 boolean gnFASSource::SeqRead( const gnSeqI start, char* buf, gnSeqI& bufLen, const uint32 contigI )
00093 {
00094 m_ifstream.clear();
00095 uint32 contigIndex = contigI;
00096 uint64 startPos = 0;
00097 uint64 readableBytes = 0;
00098 if( !SeqSeek( start, contigIndex, startPos, readableBytes ) ){
00099 bufLen = 0;
00100 return false;
00101 }
00102
00103 if( contigI == ALL_CONTIGS ){
00104 uint32 curLen = 0;
00105 uint64 bytesRead = 0;
00106 while (curLen < bufLen){
00107
00108 if(readableBytes <= 0)
00109 if( !SeqSeek( start + curLen, contigIndex, startPos, readableBytes ) ){
00110 bufLen = curLen;
00111 return true;
00112 }
00113
00114 uint64 readLen = (bufLen - curLen) < readableBytes ? (bufLen - curLen) : readableBytes;
00115 Array<gnSeqC> array_buf( readLen );
00116 gnSeqC* tmpBuf = array_buf.data;
00117
00118
00119 m_ifstream.read(tmpBuf, readLen);
00120 uint64 gc = m_ifstream.gcount();
00121 bytesRead += gc;
00122 readableBytes -= gc;
00123
00124
00125 for(uint32 i=0; i < gc; i++){
00126 if( m_pFilter->IsValid(tmpBuf[i]) ){
00127 buf[curLen] = tmpBuf[i];
00128 curLen++;
00129 }
00130 }
00131 if(m_ifstream.eof()){
00132 m_ifstream.clear();
00133 bufLen = curLen;
00134 return true;
00135 }
00136 }
00137 bufLen = curLen;
00138 }
00139 else if( contigI < m_contigList.size() )
00140 {
00141 uint32 curLen = 0;
00142
00143 gnSeqI contigSize = m_contigList[contigI]->GetSeqLength();
00144 bufLen = bufLen < contigSize ? bufLen : contigSize;
00145 while (curLen < bufLen)
00146 {
00147 uint64 readLen = bufLen - curLen;
00148 Array<gnSeqC> array_buf( readLen );
00149 gnSeqC* tmpBuf = array_buf.data;
00150
00151
00152 m_ifstream.read(tmpBuf, readLen);
00153 uint64 gc = m_ifstream.gcount();
00154
00155
00156 for(uint32 i=0; i < gc; i++){
00157 if( m_pFilter->IsValid(tmpBuf[i]) ){
00158 buf[curLen] = tmpBuf[i];
00159 curLen++;
00160 }
00161 }
00162 if(m_ifstream.eof()){
00163 m_ifstream.clear();
00164 bufLen = curLen;
00165 return true;
00166 }
00167 }
00168 bufLen = curLen;
00169 }
00170 return true;
00171 }
00172
00173
00174
00175
00176
00177
00178 boolean gnFASSource::SeqSeek( const gnSeqI start, const uint32 contigI, uint64& startPos, uint64& readableBytes )
00179 {
00180 if( contigI == ALL_CONTIGS ){
00181
00182 gnSeqI curIndex = 0;
00183 vector< gnFileContig* >::iterator iter = m_contigList.begin();
00184 for( ; iter != m_contigList.end(); ++iter )
00185 {
00186 uint64 len = (*iter)->GetSeqLength();
00187 if( (curIndex + len) > start )
00188 break;
00189 curIndex += len;
00190 }
00191 if( iter == m_contigList.end() )
00192 return false;
00193
00194 gnSeqI startIndex = start - curIndex;
00195 return SeqStartPos( startIndex, *(*iter), startPos, readableBytes );
00196 }
00197 else if( contigI < m_contigList.size() )
00198 {
00199 return SeqStartPos( start, *(m_contigList[contigI]), startPos, readableBytes );
00200 }
00201 return false;
00202 }
00203
00204
00205 boolean gnFASSource::SeqStartPos( const gnSeqI start, gnFileContig& contig, uint64& startPos, uint64& readableBytes ){
00206 readableBytes = 0;
00207 uint32 curLen = 0;
00208
00209 startPos = contig.GetSectStartEnd(gnContigSequence).first;
00210
00211 if(contig.HasRepeatSeqGap())
00212 if(contig.GetRepeatSeqGapSize().first > 0)
00213 if(contig.GetRepeatSeqGapSize().second > 0){
00214
00215 startPos += start + (start/contig.GetRepeatSeqGapSize().first)*contig.GetRepeatSeqGapSize().second;
00216 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00217 m_ifstream.seekg( startPos, ios::beg );
00218 return true;
00219 }
00220 m_ifstream.seekg( startPos, ios::beg );
00221 if( m_ifstream.eof() ){
00222 ErrorMsg("ERROR in gnFASSource::Incorrect contig start position, End of file reached!\n");
00223 return false;
00224 }
00225 while( true ){
00226
00227
00228 uint32 tmpbufsize = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00229 if(tmpbufsize == 0){
00230 ErrorMsg("ERROR in gnFASSource: stored contig size is incorrect.\n");
00231 return false;
00232 }
00233 tmpbufsize = tmpbufsize < BUFFER_SIZE ? tmpbufsize : BUFFER_SIZE;
00234 Array<char> array_buf( tmpbufsize );
00235 char *tmpbuf = array_buf.data;
00236 m_ifstream.read( tmpbuf, tmpbufsize );
00237 if( m_ifstream.eof() ){
00238 ErrorMsg("ERROR in gnFASSource::Read End of file reached!\n");
00239 return false;
00240 }
00241 for( uint32 i=0; i < tmpbufsize; ++i ){
00242 if( m_pFilter->IsValid(tmpbuf[i]) ){
00243 if( curLen >= start ){
00244 startPos += i;
00245 m_ifstream.seekg( startPos, ios::beg );
00246 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00247 return true;
00248 }
00249 ++curLen;
00250 }
00251 }
00252 startPos += tmpbufsize;
00253 }
00254 return true;
00255 }
00256
00257
00258 boolean gnFASSource::Write(gnBaseSource *source, const string& filename){
00259 ofstream m_ofstream(filename.c_str(), ios::out | ios::binary);
00260 if(!m_ofstream.is_open())
00261 return false;
00262 uint32 contigCount = source->GetContigListLength();
00263 for(uint32 contigI = 0; contigI < contigCount; contigI++){
00264 string contigName = source->GetContigName(contigI);
00265 m_ofstream << ">" << contigName << ";\n";
00266 gnSeqI seqLength = source->GetContigSeqLength(contigI);
00267 while(seqLength > 0){
00268 gnSeqI writeLen = seqLength < BUFFER_SIZE ? seqLength : BUFFER_SIZE;
00269 Array<gnSeqC> array_buf( writeLen+1 );
00270 gnSeqC *bases = array_buf.data;
00271 boolean success = source->SeqRead(0, bases, writeLen, contigI);
00272 if(!success)
00273 return false;
00274 bases[writeLen] = '\0';
00275 m_ofstream << bases << "\n";
00276 seqLength -= writeLen;
00277 }
00278 }
00279 m_ofstream.close();
00280 return true;
00281 }
00282
00283
00284 void gnFASSource::Write(gnSequence& seq, const string& filename, boolean write_coords, boolean enforce_unique_names){
00285 ofstream m_ofstream(filename.c_str(), ios::out | ios::binary);
00286 if(!m_ofstream.is_open())
00287 Throw_gnEx(FileNotOpened());
00288 Write(seq, m_ofstream, write_coords, enforce_unique_names);
00289 m_ofstream.close();
00290 }
00291
00292 void gnFASSource::Write(gnSequence& seq, ostream& m_ostream, boolean write_coords, boolean enforce_unique_names){
00293 vector<string> contigNameList;
00294 Array<gnSeqC> array_buf( BUFFER_SIZE );
00295 gnSeqC *bases = array_buf.data;
00296 gnGenomeSpec* spec = seq.GetSpec();
00297
00298
00299 string newline = "\r\n";
00300 gnSeqI readOffset = 1;
00301
00302 for(uint32 fragI = 0; fragI < seq.contigListLength(); fragI++){
00303
00304 string contigName = seq.contigName(fragI);
00305
00306 if(enforce_unique_names){
00307 uint32 name_count = 0;
00308 for(uint32 i=0; i < contigNameList.size(); i++)
00309 if(contigNameList[i] == contigName)
00310 name_count++;
00311 contigNameList.push_back(contigName);
00312 if(name_count > 0)
00313 contigName += "_" + uintToString(name_count);
00314 }
00315
00316 gnFragmentSpec* subSpec = spec->GetSpec(fragI);
00317 gnBaseHeader *gpbh = subSpec->GetHeader(0);
00318 string header = "";
00319 if(gpbh != NULL){
00320 header = gpbh->GetHeader();
00321
00322 uint32 newlinepos = header.find_first_of('\n', 0);
00323 if(newlinepos != string::npos){
00324 if(header[newlinepos-1] == '\r')
00325 newlinepos--;
00326 header = header.substr(0, newlinepos);
00327 }
00328 }
00329
00330
00331 gnSeqI readLength = seq.contigLength(fragI);
00332 m_ostream << ">" << contigName;
00333 if(write_coords)
00334 m_ostream << " " << "(" << readOffset << ", " << readOffset + readLength - 1 << ")";
00335 m_ostream << "; " << header << newline;
00336
00337 gnSeqI linePos = 0;
00338 while(readLength > 0){
00339 gnSeqI read_chunk_size = (BUFFER_SIZE / FAS_LINE_WIDTH) * FAS_LINE_WIDTH;
00340 gnSeqI writeLen = readLength < read_chunk_size ? readLength : read_chunk_size;
00341
00342 if(!seq.ToArray(bases, writeLen, readOffset))
00343 return;
00344 for(gnSeqI curbase = 0; curbase < writeLen; curbase += FAS_LINE_WIDTH){
00345 gnSeqI writeout_size = writeLen - curbase < FAS_LINE_WIDTH ? writeLen - curbase : FAS_LINE_WIDTH;
00346 m_ostream << string(bases + curbase, writeout_size) << newline;
00347 }
00348 readLength -= writeLen;
00349 readOffset += writeLen;
00350 }
00351 if(linePos != 0)
00352 m_ostream << newline;
00353 }
00354
00355 }
00356
00357 gnGenomeSpec *gnFASSource::GetSpec() const{
00358 gnGenomeSpec *spec = new gnGenomeSpec();
00359 for(uint32 i=0; i < m_contigList.size(); i++){
00360
00361 gnFragmentSpec *fragmentSpec = new gnFragmentSpec();
00362 gnSourceSpec *contigSpec = new gnSourceSpec((gnBaseSource*)this, i);
00363
00364 spec->AddSpec(fragmentSpec, i);
00365 fragmentSpec->AddSpec(contigSpec);
00366
00367 fragmentSpec->SetName(m_contigList[i]->GetName());
00368 fragmentSpec->SetSourceName(m_openString);
00369 contigSpec->SetName(m_contigList[i]->GetName());
00370 contigSpec->SetSourceName(m_openString);
00371
00372 pair<uint32, uint32> headsect = m_contigList[i]->GetSectStartEnd(gnContigHeader);
00373 if(headsect.first != headsect.second){
00374 gnSourceHeader *gpsh = new gnSourceHeader((gnBaseSource *)this, string(""), headsect.first, headsect.second - headsect.first);
00375 fragmentSpec->AddHeader(gpsh, 0);
00376 }
00377 }
00378 return spec;
00379 }
00380
00381 gnFileContig* gnFASSource::GetFileContig( const uint32 contigI ) const{
00382 if(m_contigList.size() > contigI)
00383 return m_contigList[contigI];
00384 return NULL;
00385 }
00386
00387 boolean gnFASSource::ParseStream( istream& fin )
00388 {
00389
00390 uint32 readState = 0;
00391 gnFileContig* currentContig = 0;
00392 string nameFStr;
00393 uint64 seqLength = 0, gapLength = 0;
00394
00395 uint64 streamPos = 0;
00396 uint64 bufReadLen = 0;
00397 Array<gnSeqC> array_buf( BUFFER_SIZE );
00398 char* buf = array_buf.data;
00399 boolean paren_hit = false;
00400 uint32 repeatSeqSize = 0;
00401 boolean corrupt_msg = false;
00402
00403
00404 DetermineNewlineType();
00405
00406
00407 while( !fin.eof() )
00408 {
00409
00410 fin.read( buf, BUFFER_SIZE);
00411 streamPos += bufReadLen;
00412 bufReadLen = fin.gcount();
00413 for( uint32 i=0 ; i < bufReadLen ; i++ )
00414 {
00415 char ch = buf[i];
00416 switch( readState )
00417 {
00418 case 0:
00419 if( !((buf[0] == '>') || !m_pFilter->IsValid(buf[0])) )
00420 {
00421 return false;
00422 }
00423 readState = 1;
00424 case 1:
00425 if( ch == '>' )
00426 {
00427 seqLength = 0; gapLength = 0;
00428 currentContig = new gnFileContig();
00429 currentContig->SetFileStart( streamPos + i );
00430 currentContig->SetRepeatSeqGap(true);
00431 currentContig->SetRepeatSeqSize( repeatSeqSize );
00432 currentContig->SetRepeatGapSize( m_newlineSize );
00433 readState = 2;
00434 paren_hit = false;
00435 nameFStr = "";
00436 }
00437 else
00438 ++gapLength;
00439 break;
00440 case 2:
00441 if( isNewLine(ch) || ch == ';')
00442 {
00443 currentContig->SetName( nameFStr );
00444 currentContig->SetSectStart( gnContigHeader, streamPos + i + 1 );
00445 if( ch == ';' )
00446 readState = 3;
00447 else{
00448 readState = 4;
00449 if(ch == '\r')
00450 currentContig->SetSectStart( gnContigHeader, streamPos + i + 2 );
00451 }
00452 }
00453 else if( ch == '(' ){
00454 if(isSpace(buf[i-1])){
00455
00456 nameFStr = nameFStr.substr(0, nameFStr.length()-1);
00457 }
00458 paren_hit = true;
00459 }else if((!isSpace(ch) || nameFStr.length()) && !paren_hit )
00460 nameFStr += ch;
00461 break;
00462 case 3:
00463 if( isNewLine(ch) )
00464 readState = 4;
00465 break;
00466 case 4:
00467 if( ch == '>' )
00468 {
00469 readState = 3;
00470 }
00471 else if( m_pFilter->IsValid(ch) )
00472 {
00473 currentContig->SetSectEnd( gnContigHeader, streamPos + i );
00474 currentContig->SetSectStart( gnContigSequence, streamPos + i);
00475 seqLength = 1; gapLength = 0;
00476 readState = 5;
00477 }
00478 break;
00479 case 5:
00480 while(i < bufReadLen){
00481 ch = buf[i];
00482 if( m_pFilter->IsValid(ch) ){
00483 if(gapLength > 0){
00484 if(seqLength != repeatSeqSize){
00485 if( !corrupt_msg ){
00486 corrupt_msg = true;
00487 ErrorMsg( "Sequence file appears corrupt, proceeding with caution\n" );
00488 }
00489 currentContig->SetRepeatSeqGap(false);
00490 }if(gapLength != m_newlineSize){
00491
00492 if( !corrupt_msg ){
00493 corrupt_msg = true;
00494 ErrorMsg( "Sequence file appears corrupt, proceeding with caution\n" );
00495 }
00496 currentContig->SetRepeatSeqGap(false);
00497 }
00498 currentContig->AddToSeqLength( seqLength );
00499 seqLength = 0;
00500 gapLength = 0;
00501 }
00502 seqLength++;
00503 }else if( ch == '>'){
00504 currentContig->AddToSeqLength( seqLength );
00505 currentContig->SetSectEnd( gnContigSequence, streamPos + i - 1);
00506 currentContig->SetFileEnd( streamPos + i - 1 );
00507 m_contigList.push_back(currentContig);
00508 readState = 1;
00509 i--;
00510 break;
00511 }
00512 else if( isNewLine(ch) ){
00513 if( repeatSeqSize == 0 ){
00514 repeatSeqSize = seqLength;
00515 currentContig->SetRepeatSeqSize( repeatSeqSize );
00516 }
00517 gapLength++;
00518 }
00519 else{
00520
00521 currentContig->SetRepeatSeqGap(false);
00522 if( !corrupt_msg ){
00523 corrupt_msg = true;
00524 ErrorMsg( "Sequence file appears corrupt, proceeding with caution\n" );
00525 }
00526 }
00527 i++;
00528 }
00529 break;
00530 default:
00531 ErrorMsg("ERROR");
00532 return false;
00533 break;
00534 }
00535 }
00536 }
00537
00538 if( currentContig != 0 )
00539 {
00540 if( readState == 2 )
00541 {
00542 currentContig->SetName( nameFStr );
00543 }
00544 if( (readState >= 2) && (readState < 5) )
00545 {
00546 currentContig->SetSectEnd( gnContigHeader, streamPos + bufReadLen );
00547 }
00548 else if( readState == 5 )
00549 {
00550 currentContig->AddToSeqLength( seqLength );
00551 currentContig->SetSectEnd( gnContigSequence, streamPos + bufReadLen );
00552 }
00553 currentContig->SetFileEnd( streamPos + bufReadLen );
00554 m_contigList.push_back(currentContig);
00555 }
00556 m_ifstream.clear();
00557 return true;
00558 }