src/gnSequence.cpp

Go to the documentation of this file.
00001 
00002 // File:            gnSequence.h
00003 // Purpose:         Source Seq, where sequence is stored in source file
00004 // Description:     implements the baseSeq interface for dna source files.
00005 // Changes:        
00006 // Version:         libGenome 0.5.1 
00007 // Author:          Aaron Darling 
00008 // Modified by:     
00009 // Copyright:       (c) Aaron Darling 
00010 // Licenses:        See COPYING file for details
00012 #include "gn/gnSequence.h"
00013 #include "gn/gnSourceFactory.h"
00014 #include "gn/gnBaseSource.h"
00015 #include "gn/gnGenomeSpec.h"
00016 #include "gn/gnFragmentSpec.h"
00017 #include "gn/gnSourceSpec.h"
00018 #include "gn/gnStringSpec.h"
00019 #include "gn/gnStringHeader.h"
00020 
00021 // Constructor
00022 gnSequence::gnSequence( )
00023 {
00024         spec = new gnGenomeSpec();
00025         comparator = gnCompare::DNASeqCompare();
00026 }
00027 gnSequence::gnSequence( const gnSeqC* seq){
00028         spec = new gnGenomeSpec();
00029         if (seq[0] != 0){
00030                 gnFragmentSpec *fragSpec = new gnFragmentSpec();
00031                 spec->AddSpec(fragSpec);
00032                 fragSpec->AddSpec(new gnStringSpec(seq));
00033         }
00034         comparator = gnCompare::DNASeqCompare();
00035 }
00036 gnSequence::gnSequence( const string& str){
00037         spec = new gnGenomeSpec();
00038         if (str.length() != 0){
00039                 gnFragmentSpec *fragSpec = new gnFragmentSpec();
00040                 spec->AddSpec(fragSpec);
00041                 fragSpec->AddSpec(new gnStringSpec(str));
00042         }
00043         comparator = gnCompare::DNASeqCompare();
00044 }
00045 gnSequence::gnSequence( const gnGenomeSpec& gngs){
00046         spec = gngs.Clone();
00047         comparator = gnCompare::DNASeqCompare();
00048 }
00049 gnSequence::gnSequence( const gnFragmentSpec& gnfs){
00050         spec = new gnGenomeSpec();
00051         spec->AddSpec(gnfs.Clone());
00052         comparator = gnCompare::DNASeqCompare();
00053 }
00054 gnSequence::gnSequence( const gnContigSpec& gcs){
00055         spec = new gnGenomeSpec();
00056         gnFragmentSpec *fragSpec = new gnFragmentSpec();
00057         fragSpec->AddSpec(gcs.Clone());
00058         comparator = gnCompare::DNASeqCompare();
00059 }
00060 gnSequence::gnSequence( gnSeqC *bases, gnSeqI length){
00061         spec = new gnGenomeSpec();
00062         if (length > 0){
00063                 gnFragmentSpec *fragSpec = new gnFragmentSpec();
00064                 spec->AddSpec(fragSpec);
00065                 fragSpec->AddSpec(new gnStringSpec(string(bases, length)));
00066         }
00067         comparator = gnCompare::DNASeqCompare();
00068 }       
00069 gnSequence::gnSequence( const gnSequence& seq )
00070 {
00071         *this = seq;
00072 }
00073 
00074 // Destructor
00075 gnSequence::~gnSequence()
00076 {
00077         if (spec != NULL) 
00078                 delete spec;
00079 }
00080 
00081 gnSequence& gnSequence::operator=( const gnSequence& seq){
00082         spec = seq.spec->Clone();
00083         filter_list = seq.filter_list;
00084         comparator = seq.comparator;
00085         return *this;
00086 }
00087 
00088 // Clone
00089 gnSequence* gnSequence::Clone() const
00090 {
00091         return new gnSequence( *this );
00092 }
00093 //IMPLEMENT ME!! Consider ambiguities!
00094 int gnSequence::compare(const string& str) const{
00095         STACK_TRACE_START
00096                 gnSeqI len = length();
00097                 gnSeqI seq_len = str.length();
00098                 gnSeqI comparelen = len < seq_len ? len : seq_len;
00099                 gnSeqI compared = 0;
00100                 while(comparelen > 0){
00101                         gnSeqI curlen = comparelen < BUFFER_SIZE ? comparelen : BUFFER_SIZE;
00102                         string bases = ToString(compared, curlen);
00103                         string seq_bases = str.substr(compared, curlen);
00104                         if(comparator->LessThan(bases, seq_bases))
00105                                 return -1;
00106                         else if(comparator->LessThan(seq_bases, bases))
00107                                 return 1;
00108 
00109                         compared += curlen;
00110                         comparelen -= curlen;
00111                 }
00112                 if(len < seq_len)
00113                         return -1;
00114                 else if(len > seq_len)
00115                         return 1;
00116                 return 0;
00117         STACK_TRACE_END
00118 }
00119 
00120 int gnSequence::compare(const gnSequence& seq) const{
00121         STACK_TRACE_START
00122                 gnSeqI len = length();
00123                 gnSeqI seq_len = seq.length();
00124                 gnSeqI comparelen = len < seq_len ? len : seq_len;
00125                 gnSeqI compared = 0;
00126                 while(comparelen > 0){
00127                         gnSeqI curlen = comparelen < BUFFER_SIZE ? comparelen : BUFFER_SIZE;
00128                         string bases = ToString(curlen, compared+1);
00129                         string seq_bases = seq.ToString(curlen, compared+1);
00130                         if(comparator->LessThan(bases, seq_bases))
00131                                 return -1;
00132                         else if(comparator->LessThan(seq_bases, bases))
00133                                 return 1;
00134 
00135                         compared+= curlen;
00136                         comparelen -= curlen;
00137                 }
00138                 if(len < seq_len)
00139                         return -1;
00140                 else if(len > seq_len)
00141                         return 1;
00142                 return 0;
00143         STACK_TRACE_END
00144 }
00145 
00146 void gnSequence::insert( const gnSeqI offset, const gnSeqC *bases, const gnSeqI len){
00147         STACK_TRACE_START
00148                 string str(bases, len);
00149                 gnStringSpec gpbs(str);
00150                 insert(offset, gpbs);
00151         STACK_TRACE_END
00152 }
00153 //Offset is the gene index, not a computer index, to insert before.
00154 //This is the default insert.  It inserts entire contigs
00155 void gnSequence::insert( const gnSeqI offset, const gnGenomeSpec& gpbs){
00156         STACK_TRACE_START
00157                 if(offset == 0)
00158                         Throw_gnEx(SeqIndexOutOfBounds());
00159                 if(offset == GNSEQI_END || spec->GetLength() < offset){ //simple append.
00160                         for(uint32 i=0; i < gpbs.GetSpecListLength(); i++)
00161                                 spec->AddSpec(gpbs.GetSpec(i)->Clone());
00162                         return;
00163                 }
00164                 //clone this sequence
00165                 gnGenomeSpec* tmpSpec = spec->Clone();
00166                 //crop off the end of this sequence past the insert point
00167                 //crop off the beginning of the cloned sequence up to the insert
00168                 gnSeqI real_offset = offset - 1;
00169                 gnSeqI endCrop = spec->GetLength() - real_offset;
00170                 spec->CropEnd(endCrop);
00171                 tmpSpec->CropStart(real_offset);
00172                 //append the inserted sequence and the end of this sequence.
00173                 insert(GNSEQI_END, gpbs);
00174                 insert(GNSEQI_END, *tmpSpec);
00175                 delete tmpSpec;
00176         STACK_TRACE_END
00177 }
00178 // Concatenation operators
00179 gnSequence const gnSequence::operator+(const gnSequence& seq) const{
00180         STACK_TRACE_START
00181                 gnSequence rval(*this);
00182                 rval.append(seq);
00183                 return rval;
00184         STACK_TRACE_END
00185 }
00186 // Subsequences and base deletion
00187 gnSequence gnSequence::subseq(const gnSeqI offset, const gnSeqI length) const{
00188         STACK_TRACE_START
00189                 if(length == 0)
00190                         return gnSequence();
00191                 if(offset == 0)
00192                         Throw_gnEx(SeqIndexOutOfBounds());
00193                 
00194                 gnSequence tmpSeq;
00195                 delete tmpSeq.spec;
00196                 tmpSeq.spec = spec->CloneRange(offset - 1, length);
00197                 return tmpSeq;
00198         STACK_TRACE_END
00199 }
00200 void gnSequence::erase( const gnSeqI offset, const gnSeqI len ){
00201         STACK_TRACE_START
00202                 if(offset == 0)
00203                         Throw_gnEx(SeqIndexOutOfBounds());
00204                 //range checking, etc.
00205                 gnSeqI current_length = length();
00206                 if(offset > current_length)
00207                         Throw_gnEx(SeqIndexOutOfBounds());
00208                 gnSeqI endBase = offset + len - 1 < current_length ? offset + len - 1 : current_length;
00209                 gnSeqI startBase = offset - 1;
00210 
00211                 gnGenomeSpec* tmpSpec = spec->CloneRange(endBase, GNSEQI_END);
00212                 uint32 lennard = tmpSpec->GetLength();
00213                 spec->CropEnd(current_length - startBase);
00214                 lennard = length();
00215                 insert(GNSEQI_END, *tmpSpec);
00216                 delete tmpSpec;
00217         STACK_TRACE_END
00218 }
00219 void gnSequence::splitContig(const gnSeqI splitI, const uint32 contigI){
00220         STACK_TRACE_START
00221                 gnSeqI splitBase = splitI;
00222                 gnSeqI current_length = length();
00223                 if(splitI == 0)
00224                         Throw_gnEx(SeqIndexOutOfBounds());
00225                 if(contigI == ALL_CONTIGS && splitI > current_length)
00226                                 Throw_gnEx(SeqIndexOutOfBounds());
00227                 else
00228                         localToGlobal(contigI, splitBase);
00229                 gnGenomeSpec* tmpSpec = spec->Clone();
00230                 tmpSpec->CropStart(splitBase);
00231                 spec->CropEnd(current_length - splitBase);
00232                 
00233                 insert(GNSEQI_END, *tmpSpec);
00234                 delete tmpSpec;
00235         STACK_TRACE_END
00236 }
00237 
00238 // IO operators
00239 istream& operator>>(istream& is, gnSequence& gps){      //read from source.
00240         string bases;
00241         is >> bases;
00242         gps.append(bases);
00243         return is;
00244 }
00245 ostream& operator<<(ostream& os, const gnSequence& gps){ //write to source.
00246         os << gps.ToString();
00247         return os;
00248 }
00249 string gnSequence::ToString( const gnSeqI len, const gnSeqI offset ) const
00250 {
00251         STACK_TRACE_START
00252                 string str;
00253                 ToString(str, len, offset);
00254                 return str;
00255         STACK_TRACE_END
00256 }
00257 boolean gnSequence::ToString( string& str, const gnSeqI len, const gnSeqI offset ) const
00258 {
00259         STACK_TRACE_START
00260                 gnSeqI real_offset = offset - 1;
00261                 gnSeqI m_length = length();
00262                 gnSeqI readSize = len > m_length - real_offset ? m_length - real_offset : len;
00263                 Array<char> array_buf( readSize+1 );
00264                 char *buf = array_buf.data;
00265                 boolean success = spec->SeqRead( real_offset, buf, readSize, ALL_CONTIGS);
00266                 buf[readSize] = '\0';
00267                 str = buf;
00268 
00269                 //now filter the string
00270                 list<const gnBaseFilter*>::const_iterator iter = filter_list.begin();
00271                 for(; iter != filter_list.end(); iter++)
00272                         (*iter)->Filter(str);
00273 
00274                 if( success )
00275                         return true;
00276                 return false;
00277         STACK_TRACE_END
00278 }
00279 boolean gnSequence::ToArray( gnSeqC* pSeqC, gnSeqI length, const gnSeqI offset ) const
00280 {
00281         STACK_TRACE_START
00282                 gnSeqI real_offset = offset - 1;
00283                 if(offset == GNSEQI_END)
00284                         return false;
00285                 gnSeqC* tmp = new gnSeqC[length];
00286                 boolean success = spec->SeqRead( real_offset, tmp, length, ALL_CONTIGS);
00287                 
00288                 //now filter the array
00289                 list<const gnBaseFilter*>::const_iterator iter = filter_list.begin();
00290                 gnSeqC** tomp = &tmp;
00291                 for(; iter != filter_list.end(); iter++)
00292                         (*iter)->Filter(tomp, length);
00293                 memcpy(pSeqC, *tomp, length);
00294                 delete[] *tomp;
00295                 
00296                 if( success )
00297                         return true;
00298 
00299                 return false;
00300         STACK_TRACE_END
00301 }
00302 gnSeqC gnSequence::GetSeqC( const gnSeqI offset ) const
00303 {
00304         STACK_TRACE_START
00305                 char block;
00306                 gnSeqI readLen = 1;
00307                 gnSeqI real_offset = offset - 1;
00308                 boolean success = spec->SeqRead( real_offset, &block, readLen, ALL_CONTIGS);
00309 
00310                 //now filter the char
00311                 list<const gnBaseFilter*>::const_iterator iter = filter_list.begin();
00312                 for(; iter != filter_list.end(); iter++)
00313                         block = (*iter)->Filter(block);
00314 
00315                 if( success )
00316                         return block;
00317 
00318                 return GNSEQC_NULL;
00319         STACK_TRACE_END
00320 }
00321 gnSeqC gnSequence::operator[]( const gnSeqI offset ) const
00322 {
00323         STACK_TRACE_START
00324                 return GetSeqC( offset );
00325         STACK_TRACE_END
00326 }
00327 
00328 gnSeqI gnSequence::contigListSize() const{
00329         STACK_TRACE_START
00330                 return spec->GetSpecListLength();
00331         STACK_TRACE_END
00332 }
00333 gnSeqI gnSequence::contigListLength() const{
00334         STACK_TRACE_START
00335                 return spec->GetSpecListLength();
00336         STACK_TRACE_END
00337 }
00338 
00339 //find the contig associated with base
00340 uint32 gnSequence::contigIndexByBase( const gnSeqI baseI) const{
00341         STACK_TRACE_START
00342                 return spec->GetSpecIndexByBase(baseI-1);
00343         STACK_TRACE_END
00344 }
00345 gnSequence gnSequence::contig( const uint32 contigI) const{
00346         STACK_TRACE_START
00347                 if(contigI == ALL_CONTIGS)
00348                         return *this;
00349                 return gnSequence(*spec->GetSpec(contigI));
00350         STACK_TRACE_END
00351 }
00352 //returns a gnSequence pointer containing the specified contig.
00353 gnSequence gnSequence::contigByBase( const gnSeqI baseI) const{
00354         STACK_TRACE_START
00355                 return gnSequence(*spec->GetSpecByBase(baseI-1));
00356         STACK_TRACE_END
00357 }
00358 uint32 gnSequence::contigIndexByName( string& contigName) const{
00359         STACK_TRACE_START
00360                 return spec->GetSpecIndexByName(contigName);
00361         STACK_TRACE_END
00362 }
00363 
00364 gnSeqI gnSequence::contigStart( const uint32 contigI) const{
00365         STACK_TRACE_START
00366                 int64 leafI = contigI;
00367                 // add one for the stupid geneticists whose indices start at 1
00368                 return spec->GetSpecStartBase( leafI ) + 1;
00369         STACK_TRACE_END
00370 }
00371 
00372 gnSeqI gnSequence::contigLength( const uint32 contigI) const{
00373         STACK_TRACE_START
00374                 return spec->GetSpec( contigI )->GetLength();
00375         STACK_TRACE_END
00376 }
00377 
00378 string gnSequence::contigName( const uint32 contigI) const{
00379         STACK_TRACE_START
00380                 return spec->GetSpec(contigI)->GetName();
00381         STACK_TRACE_END
00382 }
00383 
00384 void gnSequence::globalToLocal(uint32& contigI, gnSeqI& baseI) const{
00385         STACK_TRACE_START
00386                 contigI = contigIndexByBase(baseI);
00387                 baseI -= (contigStart(contigI) - 1);
00388         STACK_TRACE_END
00389 }
00390 
00391 void gnSequence::localToGlobal(const uint32 contigI, gnSeqI& baseI) const{
00392         STACK_TRACE_START
00393                 if(baseI > contigLength(contigI))
00394                         Throw_gnEx(SeqIndexOutOfBounds());
00395                 baseI += contigStart(contigI) - 1;
00396         STACK_TRACE_END
00397 }
00398 
00399 void gnSequence::globalToSource(uint32& contigI, gnSeqI& baseI) const{
00400         STACK_TRACE_START
00401                 baseI--;        //convert from geneticist coordinates
00402                 gnSeqI seq_contig = spec->GetSpecIndexByBase(baseI);
00403                 gnSeqI new_base = baseI - spec->GetSpecStartBase(seq_contig);
00404                 gnFragmentSpec *fragment_spec = spec->GetSpec(seq_contig);
00405                 seq_contig = fragment_spec->GetSpecIndexByBase(new_base);
00406                 new_base = new_base - fragment_spec->GetSpecStartBase(seq_contig);
00407                 gnContigSpec* contig_spec = fragment_spec->GetSpec(seq_contig);
00408                 contigI = contig_spec->GetSourceContigIndex();
00409                 gnSeqI contig_start = contig_spec->GetStart();
00410                 if(contig_spec->IsReverseComplement()){
00411                         gnSeqI source_len = contig_spec->GetSourceLength();
00412                         //it may seem counter intuitive, but we can add the source_len and mod by source_len
00413                         //to deal with circularity
00414         //              contig_start = (contig_start - contig_spec->GetLength() + source_len) % source_len;
00415                         //if we are reverse complement then the contig_start will be the ending base pair
00416                         //we want to subtract new_base from it. and we can add/mod by source_len to deal
00417                         //with circles.
00418                         baseI = (contig_start - new_base - 1 + source_len) % source_len;
00419                 }else
00420                         baseI =  contig_start + new_base + 1;
00421         STACK_TRACE_END
00422 }
00423 
00424 void gnSequence::localToSource(uint32& contigI, gnSeqI& baseI) const{
00425         STACK_TRACE_START
00426                 localToGlobal(contigI, baseI);
00427                 globalToSource(contigI, baseI);
00428         STACK_TRACE_END
00429 }
00430 
00431 gnSequence gnSequence::contigByName( string& contigName) const{
00432         STACK_TRACE_START
00433                 uint32 contigIndex = spec->GetSpecIndexByName(contigName);
00434                 return gnSequence(*spec->GetSpec(contigIndex));
00435         STACK_TRACE_END
00436 }
00437 
00438 void gnSequence::setContigName( const uint32 contigI, const string& contig_name) {
00439         STACK_TRACE_START
00440                 if(contigI == ALL_CONTIGS)
00441                         spec->SetName(contig_name);
00442                 else
00443                         spec->GetSpec(contigI)->SetName(contig_name);
00444         STACK_TRACE_END
00445 }
00446 
00447 void gnSequence::setReverseComplement( const boolean revComp, const uint32 contigI){
00448         STACK_TRACE_START
00449                 if(contigI == ALL_CONTIGS)
00450                         spec->SetReverseComplement(revComp);
00451                 else{
00452                         gnFragmentSpec* contig = spec->GetSpec(contigI);
00453                         contig->SetReverseComplement(revComp);
00454                 }
00455         STACK_TRACE_END
00456 }
00457 
00458 boolean gnSequence::isReverseComplement( const uint32 contigI ){
00459         STACK_TRACE_START
00460                 if(contigI == ALL_CONTIGS)
00461                         return spec->IsReverseComplement();
00462                 gnFragmentSpec* contig = spec->GetSpec(contigI);
00463                 return contig->IsReverseComplement();
00464         STACK_TRACE_END
00465 }
00466 
00467 uint32 gnSequence::getHeaderListLength(const uint32 contigI) const{
00468         STACK_TRACE_START
00469                 if(contigI == ALL_CONTIGS)
00470                         return spec->GetHeaderListLength();
00471                 else{
00472                         return spec->GetSpec(contigI)->GetHeaderListLength();
00473                 }
00474         STACK_TRACE_END
00475 }
00476 
00477 gnBaseHeader* gnSequence::getHeader(const uint32 contigI, const uint32 headerI) const{
00478         STACK_TRACE_START
00479                 if(contigI == ALL_CONTIGS)
00480                         return spec->GetHeader(headerI);
00481                 else{
00482                         return spec->GetSpec(contigI)->GetHeader(headerI);
00483                 }
00484         STACK_TRACE_END
00485 }
00486 
00487 void gnSequence::addHeader(const uint32 contigI, gnBaseHeader* header, const uint32 headerI){
00488         STACK_TRACE_START
00489                 if(contigI == ALL_CONTIGS)
00490                         spec->AddHeader(header, headerI);
00491                 else{
00492                         spec->GetSpec(contigI)->AddHeader(header, headerI);
00493                 }
00494         STACK_TRACE_END
00495 }
00496 
00497 void gnSequence::removeHeader(const uint32 contigI, const uint32 headerI){
00498         STACK_TRACE_START
00499                 if(contigI == ALL_CONTIGS)
00500                         spec->RemoveHeader(headerI);
00501                 else
00502                         spec->GetSpec(contigI)->RemoveHeader(headerI);
00503         STACK_TRACE_END
00504 }
00505 
00506 void gnSequence::merge(const gnSeqI startBase, const gnSeqI endBase){
00507         STACK_TRACE_START
00508         STACK_TRACE_END
00509 }
00510 
00511 void gnSequence::mergeContigs(const uint32 startC, const uint32 endC){
00512         STACK_TRACE_START
00513                 spec->MergeFragments(startC, endC);
00514         STACK_TRACE_END
00515 }
00516 
00517 bool gnSequence::LoadSource(const string sourcename){
00518         STACK_TRACE_START
00519                 gnSourceFactory* m_sourceFactory = gnSourceFactory::GetSourceFactory();
00520                 gnBaseSource *m_pSource = m_sourceFactory->AddSource(sourcename, true);
00521                 if (m_pSource!=NULL)
00522                 {
00523                         if(spec != NULL)
00524                                 delete spec;
00525                         spec = m_pSource->GetSpec();
00526                         return true;
00527                 }
00528                 return false;
00529         STACK_TRACE_END
00530 }
00531 
00532 gnSeqI gnSequence::find(const gnSequence& search, const gnSeqI offset)const{
00533         STACK_TRACE_START
00534                 //this is really adHoc... should probably be switched
00535                 string searchIn=ToString();
00536                 string find= search.ToString();
00537                 string::size_type pos = searchIn.find(find, offset);
00538                 if (pos == string::npos)
00539                         return GNSEQI_ERROR;
00540                 else{
00541                         return pos++; 
00542                 }
00543         STACK_TRACE_END
00544 }

Generated on Mon Mar 28 06:00:21 2005 for libGenome by doxygen 1.3.6