libMems/CompactGappedAlignment.h

Go to the documentation of this file.
00001 /*******************************************************************************
00002  * $Id: CompactGappedAlignment.h,v 1.12 2004/04/19 23:10:50 darling Exp $
00003  * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
00004  * This file is licensed under the GPL.
00005  * Please see the file called COPYING for licensing details.
00006  * **************
00007  ******************************************************************************/
00008 
00009 #ifndef __CompactGappedAlignment_h__
00010 #define __CompactGappedAlignment_h__
00011 
00012 #ifdef HAVE_CONFIG_H
00013 #include "config.h"
00014 #endif
00015 
00016 #include "libGenome/gnDebug.h"
00017 #include "libGenome/gnFilter.h"
00018 #include "libGenome/gnSequence.h"
00019 #include "libMems/SparseAbstractMatch.h"
00020 #include "libMems/HybridAbstractMatch.h"
00021 #include "libMems/AbstractGappedAlignment.h"
00022 #include "libMems/UngappedLocalAlignment.h"
00023 
00024 #include <algorithm>
00025 
00026 #ifdef WIN32
00027 #include "windows.h"
00028 #endif
00029 
00030 namespace mems {
00031 
00037 template< class BaseType = AbstractGappedAlignment< HybridAbstractMatch<> > >
00038 class CompactGappedAlignment : public BaseType
00039 {
00040 public:
00041         CompactGappedAlignment() : BaseType(){};
00042         CompactGappedAlignment( uint seq_count, gnSeqI align_length );
00043         CompactGappedAlignment( std::vector< bitset_t >& aln_mat, gnSeqI alignment_length );
00044         
00045         template< class MatchType >
00046         CompactGappedAlignment( MatchType& m ) : 
00047                 BaseType( m.SeqCount(), m.AlignmentLength() ),
00048                 bcount( std::vector< std::vector< size_t > >( m.SeqCount() ) )
00049         {
00050                 m.GetAlignment(align_matrix);
00051 
00052                 for( uint seqI = 0; seqI < this->SeqCount(); seqI++ )
00053                 {
00054                         this->SetStart(seqI, m.Start(seqI));
00055                         if( m.Start(seqI) != NO_MATCH )
00056                                 this->SetLength(m.Length(seqI), seqI);
00057                         else
00058                                 this->SetLength(0, seqI);
00059                 }
00060 
00061                 this->create_bitcount();
00062 
00063                 if( !this->validate() )
00064                         std::cerr << "kahnstruct error\n";
00065         }
00066 
00067         CompactGappedAlignment* Clone() const { return new CompactGappedAlignment( *this ); }
00068         CompactGappedAlignment* Copy() const;
00069         virtual void Free();
00070         
00071         void SetAlignment( const std::vector< std::string >& seq_align );
00072 
00073         void SetAlignment( std::vector< bitset_t >& seq_align );
00074 
00075         // Inherited methods from AbstractMatch:
00076         virtual void Invert();
00077         virtual void CropStart(gnSeqI crop_amount);
00078         virtual void CropEnd(gnSeqI crop_amount);
00079 
00080         virtual void CropLeft(gnSeqI crop_amount, uint seqI);
00081         virtual void CropRight(gnSeqI crop_amount, uint seqI);
00082 
00083         void GetAlignment( std::vector< bitset_t >& align_matrix ) const;
00084 
00086         const std::vector< bitset_t >& GetAlignment() const{ return align_matrix; }
00087 
00088 //      friend void GetAlignment( const CompactGappedAlignment& ga, const std::vector< genome::gnSequence* >& seq_table, std::vector<std::string>& alignment );
00089         
00090         void GetColumn( gnSeqI col, std::vector<gnSeqI>& pos, std::vector<bool>& column ) const;
00091 
00093         virtual bool IsGap( uint seq, gnSeqI col ) const;
00095         void translate( CompactGappedAlignment& cga, uint cga_seq, uint my_seq, bool add_bits = true );
00096 
00097         bool validate() const;
00098         bool validate_bitcount() const;
00099 
00100         void copyRange( CompactGappedAlignment& dest, gnSeqI left_column, gnSeqI length );
00101         gnSeqI SeqPosToColumn( uint seq, int64 pos);
00102 
00104         void CondenseGapColumns();
00105 
00106         void swap( CompactGappedAlignment& other ){ swap(&other); }
00107 
00108 protected:
00109         // for use by derived classes in order to swap contents
00110         void swap( CompactGappedAlignment* other ){
00111                 std::swap( align_matrix, other->align_matrix );
00112                 std::swap( bcount, other->bcount );
00113                 BaseType::swap( other );
00114         }
00115 
00116         std::vector< bitset_t > align_matrix;           
00117         std::vector< std::vector< size_t > > bcount;
00118 
00119         void create_bitcount();
00120         gnSeqI SeqPosToColumn( gnSeqI pos, const bitset_t& bvec, const std::vector< size_t >& index ) const;
00121 
00122 };
00123 
00124 static bool debug_cga = false;
00125 
00126 template< class BaseType >
00127 CompactGappedAlignment<BaseType>* CompactGappedAlignment<BaseType>::Copy() const
00128 {
00129         return m_allocateAndCopy( *this );
00130 }
00131 
00132 template< class BaseType >
00133 void CompactGappedAlignment<BaseType>::Free()
00134 {
00135         m_free(this);
00136 }
00137 
00138 template< class BaseType >
00139 bool CompactGappedAlignment<BaseType>::validate() const
00140 {
00141         if( !debug_cga )
00142                 return true;
00143         bool good = true;
00144         for( uint seqI = 0; seqI < this->SeqCount(); seqI++ )
00145         {
00146                 if( this->AlignmentLength() != align_matrix[seqI].size() )
00147                 {
00148                         good = false;
00149                         std::cerr << "vanishing pig trick\n";
00150                         genome::breakHere();
00151                 }
00152                 gnSeqI count = align_matrix[seqI].count();
00153                 if( count > 0 && this->LeftEnd(seqI) == 0 )
00154                 {
00155                         good = false;
00156                         std::cerr << "boner_McHoserknob\n";
00157                         genome::breakHere();
00158                 }
00159                 if( (count == 0 || this->Length(seqI) == 0) && this->LeftEnd(seqI) != 0 )
00160                 {
00161                         good = false;
00162                         std::cerr << "Length(" << seqI << "): " << this->Length(seqI) << std::endl;
00163                         std::cerr << "LeftEnd(seqI): " << this->LeftEnd(seqI) << std::endl;
00164                         std::cerr << "spumante explosion\n";
00165                         genome::breakHere();
00166                 }
00167                 if( count != this->Length(seqI) )
00168                 {
00169                         std::cerr << "seqI: " << seqI << " count: " << count << "  Length(seqI): " << this->Length(seqI) << std::endl;
00170                         std::cerr << "LeftEnd(seqI): " << this->LeftEnd(seqI) << std::endl;
00171                         std::cerr << "lendo mismatcho\n";
00172                         genome::breakHere();
00173                         return false;
00174                 }
00175 //              std::vector< std::vector< size_t > > tmp_bcount = bcount;
00176 //              create_bitcount();
00177 //              if( !tmp_bcount == bcount )
00178 //              {
00179 //                      good = false;
00180 //                      std::cerr << "bcount mismatch!!!\n";
00181 //              }
00182 //              bcount = tmp_bcount;
00183 
00184         }
00185         if( good )      // check for all gap cols
00186         {
00187 /*      allow gap cols...
00188                 for( size_t colI = 0; colI < this->AlignmentLength(); ++colI )
00189                 {
00190                         bool aa = false;
00191                         for( uint seqI = 0; seqI < this->SeqCount(); seqI++ )
00192                                 aa = aa || align_matrix[seqI].test(colI);
00193                         if( aa == false )
00194                         {
00195                                 std::cerr << "gap col at " << colI << std::endl;
00196                                 genome::breakHere();
00197                         }
00198                 }
00199                 */
00200         }
00201         return  validate_bitcount() && good;
00202 }
00203 
00204 
00205 template< class BaseType >
00206 CompactGappedAlignment<BaseType>::CompactGappedAlignment( std::vector< bitset_t >& aln_mat, gnSeqI alignment_length ) :
00207 BaseType( aln_mat.size(), alignment_length ),
00208 align_matrix( aln_mat ),
00209 bcount( std::vector< std::vector< size_t > >( aln_mat.size() ) )
00210 {
00211         this->create_bitcount();
00212         this->validate_bitcount();
00213 }
00214 
00215 template< class BaseType >
00216 CompactGappedAlignment<BaseType>::CompactGappedAlignment( uint seq_count, gnSeqI align_length ) : 
00217 BaseType( seq_count, align_length )
00218 {}
00219 
00220 
00221 template< class BaseType >
00222 void CompactGappedAlignment<BaseType>::SetAlignment( const std::vector< std::string >& seq_align ){
00223         if( seq_align.size() == 0 )
00224         {
00225                 this->SetAlignmentLength(0);
00226                 return;
00227         }
00228         this->SetAlignmentLength(seq_align[0].size());
00229         align_matrix = std::vector< bitset_t >( seq_align.size(), bitset_t( seq_align[0].size(), false ) );
00230         bcount = std::vector< std::vector<size_t> >( seq_align.size() );
00231         for( size_t seqI = 0; seqI < seq_align.size(); seqI++ )
00232         {
00233                 bool nonzero = false;
00234                 for( size_t charI = 0; charI < seq_align[seqI].size(); charI++ )
00235                         if( seq_align[seqI][charI] != '-' )
00236                         {
00237                                 align_matrix[seqI].set(charI);
00238                                 nonzero = true;
00239                         }
00240         }
00241         this->create_bitcount();
00242 }
00243 
00244 template< class BaseType >
00245 void CompactGappedAlignment<BaseType>::SetAlignment( std::vector< bitset_t >& seq_align )
00246 {
00247         std::swap( align_matrix, seq_align );
00248         seq_align.clear();
00249         if( align_matrix.size() > 0 )
00250                 this->SetAlignmentLength( align_matrix[0].size() );
00251         else
00252                 this->SetAlignmentLength(0);
00253         bcount = std::vector< std::vector<size_t> >(align_matrix.size());
00254         this->create_bitcount();
00255         this->validate_bitcount();
00256 }
00257 
00258 template< class BaseType >
00259 void CompactGappedAlignment<BaseType>::GetAlignment( std::vector< bitset_t >& align_matrix ) const
00260 {
00261         align_matrix = this->align_matrix;
00262 }
00263 
00264 template< class BaseType >
00265 bool CompactGappedAlignment<BaseType>::IsGap( uint seq, gnSeqI col ) const
00266 {
00267         return !align_matrix[seq][col];
00268 }
00269 
00270 static const unsigned INDEX_INTERVAL = 512;
00271 
00272 template< class BaseType >
00273 void CompactGappedAlignment<BaseType>::create_bitcount()
00274 {
00275         for( uint seqI = 0; seqI < this->SeqCount(); seqI++ )
00276         {
00277 //              if( this->LeftEnd(seqI) == NO_MATCH )
00278 //                      continue;
00279                 bitset_t& bvec = align_matrix[seqI];
00280                 bcount[seqI].clear();
00281                 bcount[seqI].push_back(0);
00282                 for( size_t indie = 0; indie + INDEX_INTERVAL <= bvec.size(); indie += INDEX_INTERVAL )
00283                 {
00284                         size_t end = indie + INDEX_INTERVAL;
00285                         size_t ct = 0;
00286                         for( size_t i = indie; i < end; ++i )
00287                                 ct += bvec.test(i);
00288                         bcount[seqI].push_back( ct + bcount[seqI].back() );
00289                 }
00290         }
00291 }
00292 
00293 template< class BaseType >
00294 bool CompactGappedAlignment<BaseType>::validate_bitcount() const
00295 {
00296         if( !debug_cga )
00297                 return true;
00298         bool valid = true;      // innocent until proven guilty
00299         for( uint seqI = 0; seqI < this->SeqCount(); seqI++ )
00300         {
00301                 gnSeqI count = align_matrix[seqI].count();
00302                 size_t bc_len = align_matrix[seqI].size() / INDEX_INTERVAL;
00303                 if( count < INDEX_INTERVAL && bcount[seqI].size() == 0 )
00304                         continue;       // a-ok here
00305                 if( bc_len + 1 != bcount[seqI].size() && (bcount[seqI].back() % INDEX_INTERVAL != 0) )
00306                 {
00307                         std::cerr << "bitcount problem, bc_len + 1: " << bc_len + 1 << " and bcount[seqI].size(): " << bcount[seqI].size() << std::endl;
00308                         std::cerr << "count: " << count << " and bcount[seqI].back(): " << bcount[seqI].back() << std::endl;
00309                         valid = false;
00310                 }
00311                 if( count - bcount[seqI].back() > INDEX_INTERVAL )
00312                 {
00313                         std::cerr << "bitcount problem, count: " << count << " and bcount[seqI].back(): " << bcount[seqI].back() << std::endl;
00314                         valid = false;
00315                 }
00316         }
00317         return valid;
00318 }
00319 
00320 template< class BaseType > 
00321 gnSeqI CompactGappedAlignment<BaseType>::SeqPosToColumn( uint seq, int64 pos )
00322 {
00323         if( this->Orientation(seq) == AbstractMatch::forward )
00324                 pos = genome::absolut(pos) - this->LeftEnd(seq) + 1;
00325         else
00326                 pos = this->RightEnd(seq)-genome::absolut(pos) + 1;     // is this right?
00327         return SeqPosToColumn(pos, align_matrix[seq], bcount[seq]);
00328 }
00329 
00330 template< class BaseType > 
00331 gnSeqI CompactGappedAlignment<BaseType>::SeqPosToColumn( gnSeqI pos, const bitset_t& bvec, const std::vector< size_t >& index ) const
00332 {
00333         std::vector<size_t>::const_iterator iter = std::lower_bound(index.begin(), index.end(), pos);
00334         --iter;
00335         size_t cur_pos = *iter;
00336         size_t col = iter - index.begin();
00337         col *= INDEX_INTERVAL;
00338         if( col == 0 )
00339                 col = bvec.find_first();
00340         else
00341                 col = bvec.find_next(col-1);
00342         for( ++cur_pos; cur_pos < pos; ++cur_pos )
00343                 col = bvec.find_next(col);
00344         return col;
00345 }
00346 
00347 template< class BaseType >
00348 void CompactGappedAlignment<BaseType>::translate( CompactGappedAlignment& cga, uint cga_seq, uint my_seq, bool add_bits ) // const
00349 {
00350         AbstractMatch::orientation my_orient = this->Orientation(my_seq);
00351 
00352         if( cga.Length(cga_seq)  > this->Length(my_seq) )
00353         {
00354                 std::cerr << "Oh scheisskopf.  What are you trying to do to me??\n";
00355                 std::cerr << "cga.Length(" << cga_seq << "): " << cga.Length(cga_seq) << std::endl;
00356                 std::cerr << "Length(" << my_seq << "): " << this->Length(my_seq) << std::endl;
00357                 genome::breakHere();
00358         }
00359 
00360         gnSeqI prev_lend = cga.LeftEnd(cga_seq);
00361         gnSeqI prev_len = cga.Length(cga_seq);
00362         gnSeqI my_lend = this->LeftEnd(my_seq);
00363         gnSeqI my_len = this->Length(my_seq);
00364         gnSeqI my_count = 0;
00365         uint seqI = 0;
00366 
00367         // what assumptions should be made about cga?
00368         // does it already have the correct left-end relative to this?
00369         // no, it needs to have a left-end relative to the first aligned char in this
00370         size_t cur_bit = 0;
00371 
00372         // determine left_bit
00373         size_t left_bit = this->SeqPosToColumn(cga.LeftEnd(cga_seq), align_matrix[my_seq], bcount[my_seq]);
00374         // determine right_bit
00375         size_t right_bit = this->SeqPosToColumn(cga.RightEnd(cga_seq), align_matrix[my_seq], bcount[my_seq]);
00376         if( right_bit > 4000000000u )
00377         {
00378                 std::cerr << "cga doesn't fit\n";
00379                 std::cerr << "cga.RightEnd(cga_seq) " << cga.RightEnd(cga_seq) << std::endl;
00380                 std::cerr << "RightEnd(my_seq): " << this->RightEnd(my_seq) << std::endl;
00381                 std::cerr << "cga.LeftEnd(cga_seq) " << cga.LeftEnd(cga_seq) << std::endl;
00382                 std::cerr << "LeftEnd(my_seq): " << this->LeftEnd(my_seq) << std::endl;
00383                 std::cerr << "cga.AlignmentLength(): " << cga.AlignmentLength() << std::endl;
00384                 std::cerr << "AlignmentLength(): " << this->AlignmentLength() << std::endl;
00385                 genome::breakHere();
00386         }
00387         right_bit++;
00388         if( right_bit == 0 )
00389                 right_bit = this->AlignmentLength();
00390 
00391         cga.SetLeftEnd(cga_seq,left_bit+1);
00392 
00393         // add on length of unaligned left and right sides
00394         size_t cga_left = cga.align_matrix[cga_seq].find_first();
00395 
00396         size_t somesize = (right_bit - left_bit) - cga.Length(cga_seq) + cga.AlignmentLength();
00397 
00398         size_t cga_bit = cga_left;
00399         size_t my_bit = left_bit;
00400         size_t xlat_bit = cga_left;
00401         size_t added_bits = 0;
00402         // copy in everything up to cga_left
00403         std::vector< bitset_t > xrated( cga.SeqCount(), bitset_t( somesize, false ) );
00404         for( size_t seqI = 0; seqI < xrated.size(); ++seqI )
00405                 for( size_t asdf = cga.align_matrix[seqI].find_first(); asdf < cga_left; asdf = cga.align_matrix[seqI].find_next(asdf) )
00406                         xrated[seqI].set(asdf);
00407         
00408         while(xlat_bit < somesize)
00409         {
00410                 // assume that align_matrix[my_seq][my_bit] is set
00411                 if( !align_matrix[my_seq].test(my_bit) )
00412                 {
00413                         std::cerr << "ohhhhhhzheiss!\n";
00414                         genome::breakHere();
00415                 }
00416                 // copy the column in cga
00417                 for( size_t seqI = 0; seqI < xrated.size(); ++seqI )
00418                         xrated[seqI].set( xlat_bit, cga.align_matrix[seqI].test(cga_bit) );
00419 
00420                 ++cga_bit;
00421                 ++xlat_bit;
00422 
00423                 if( xlat_bit >= somesize )
00424                         break;
00425 
00426                 // TODO: should this condition be replaced by cropping xlat_bit + diff - 1 down to < somesize?
00427                 if( cga.align_matrix[cga_seq].test(cga_bit) )
00428                 {
00429                         size_t next_bit = align_matrix[my_seq].find_next(my_bit);
00430                         if( next_bit > 4000000000u )
00431                                 genome::breakHere();
00432                         size_t diff = next_bit - my_bit;
00433                         if( diff > 1 && add_bits )
00434                         {
00435                                 if( xlat_bit + diff - 1 >= somesize )
00436                                 {
00437                                         std::cerr << "ERRRORRR porker!!\n";
00438                                         genome::breakHere();
00439                                 }
00440                                 for( size_t i = xlat_bit; i < xlat_bit + diff - 1; ++i )
00441                                         xrated[cga_seq].set(i);
00442                                 added_bits += diff-1;
00443                         }
00444                         my_bit = next_bit;
00445                         xlat_bit += diff - 1;
00446                 }
00447         }
00448 
00449         cga.align_matrix = xrated;
00450         cga.create_bitcount();
00451         cga.SetLength(cga.Length(cga_seq)+added_bits,cga_seq);
00452         cga.SetAlignmentLength(somesize);
00453         if( !cga.validate() )
00454         {
00455                 std::cerr << "prev_lend: " << prev_lend << std::endl;
00456                 std::cerr << "prev_len: " << prev_len << std::endl;
00457                 std::cerr << "translate error\n";
00458                 genome::breakHere();
00459         }
00460 }
00461 
00462 
00463 template< class BaseType >
00464 void CompactGappedAlignment<BaseType>::Invert(){
00465         for(uint seqI = 0; seqI < this->SeqCount(); seqI++)
00466         {
00467                 if( this->LeftEnd(seqI) == NO_MATCH )
00468                         continue;
00469                 bitset_t& fwd = align_matrix[seqI];
00470                 bitset_t rev(this->AlignmentLength());
00471                 size_t r = this->AlignmentLength();
00472                 for( size_t i = 0; i < fwd.size(); ++i )
00473                         rev.set( --r, fwd.test(i) );
00474                 fwd.swap(rev);
00475         }
00476         this->create_bitcount();
00477         BaseType::Invert();
00478         if( !this->validate() )
00479         {
00480                 std::cerr << "invert error\n";
00481         }
00482 }
00483 
00484 template< class BaseType >
00485 void CompactGappedAlignment<BaseType>::CropStart(gnSeqI crop_amount){
00486         if( crop_amount > this->AlignmentLength() )
00487                 Throw_gnEx( genome::SeqIndexOutOfBounds() );
00488         if( crop_amount == 0 )
00489                 return;
00490 
00491         gnSeqI pre_alignlen = this->AlignmentLength();
00492         gnSeqI pre_lend0 = this->LeftEnd(0);
00493 
00494         std::vector<gnSeqI> pos;
00495         std::vector<bool> column;
00496         GetColumn( crop_amount-1, pos, column );
00497 
00498         for( uint i=0; i < this->SeqCount(); i++ ){
00499                 if( this->LeftEnd(i) == NO_MATCH )
00500                 {
00501                         align_matrix[i].resize(this->AlignmentLength()-crop_amount);
00502                         align_matrix[i] = align_matrix[i];      // force reallocation on "optimized" windows builds
00503                         continue;
00504                 }
00505 
00506                 align_matrix[i] >>= crop_amount;        // why not shift left?  is this a bug in boost::dynamic_bitset?
00507                 align_matrix[i].resize(this->AlignmentLength()-crop_amount);
00508                 align_matrix[i] = align_matrix[i];      // force reallocation on "optimized" windows builds
00509                 size_t char_count = this->Orientation(i) == AbstractMatch::forward ? pos[i] - this->LeftEnd(i) + 1 : this->RightEnd(i) - pos[i] + 1;
00510 
00511                 if( pos[i] > 0 && char_count > 0 )
00512                 {
00513                         this->SetLength(this->Length(i)-char_count, i);
00514                         if( this->Length(i) == 0 )
00515                                 this->SetStart(i, NO_MATCH);
00516                         if( this->Orientation(i) == AbstractMatch::forward )
00517                                 this->SetStart(i, this->Start(i) + char_count);
00518                 }else if( pos[i] == 0 && this->Orientation(i) == AbstractMatch::reverse )
00519                 {
00520                         // this sequence was completely obliterated by the crop
00521                         this->SetLength(0, i);
00522                         this->SetStart(i, NO_MATCH);
00523                 }
00524         }
00525 
00526         this->SetAlignmentLength( this->AlignmentLength() - crop_amount );
00527         this->create_bitcount();
00528         if( !this->validate() )
00529         {
00530                 std::cerr << "pre_lend0: " << pre_lend0 << std::endl;
00531                 std::cerr << "pre_alignlen: " << pre_alignlen << std::endl;
00532                 std::cerr << "CropStart error\n";
00533         }
00534 }
00535 
00536 template< class BaseType >
00537 void CompactGappedAlignment<BaseType>::CropEnd(gnSeqI crop_amount){
00538         if( crop_amount > this->AlignmentLength() )
00539                 Throw_gnEx( genome::SeqIndexOutOfBounds() );
00540         if( crop_amount == 0 )
00541                 return;
00542 
00543         std::vector<gnSeqI> pos;
00544         std::vector<bool> column;
00545         this->GetColumn( this->AlignmentLength()-crop_amount, pos, column );
00546 
00547         for( uint i=0; i < this->SeqCount(); i++ ){
00548                 align_matrix[i].resize( this->AlignmentLength() - crop_amount );
00549                 align_matrix[i] = align_matrix[i];      // force reallocation on "optimized" windows builds
00550                 if( this->LeftEnd(i) == NO_MATCH )
00551                         continue;
00552                 AbstractMatch::orientation orient = this->Orientation(i);
00553                 if( pos[i] > 0 )
00554                 {
00555                         gnSeqI char_count = pos[i] - (orient == AbstractMatch::forward ? (column[i] ? 1 : 0 ) : (column[i] ? 0 : 1 ) );
00556                         char_count = orient == AbstractMatch::forward ? char_count - this->LeftEnd(i) + 1 : this->RightEnd(i) - char_count;
00557                         if( char_count == 0 && align_matrix[i].count() > 0)
00558                         {
00559                                 std::cerr << "orienatation: " << (orient == AbstractMatch::forward ? "forward\n" : (orient == AbstractMatch::reverse ? "reverse\n" : "undef\n"));
00560                                 std::cerr << "lend: " << this->LeftEnd(i) << std::endl;
00561                                 std::cerr << "length: " << this->Length(i) << std::endl;
00562                                 std::cerr << "count: " << align_matrix[i].count() << std::endl;
00563                         }
00564                         gnSeqI deleted = this->Length(i) - char_count;
00565                         this->SetLength(char_count, i);
00566                         if( this->Length(i) == 0 )
00567                                 this->SetStart(i, 0);
00568                         if( this->Start(i) < 0 )
00569                                 this->SetStart(i, this->Start(i)-deleted);
00570                 }else if( orient == AbstractMatch::forward ){
00571                         this->SetLength(0, i);
00572                         this->SetStart(i, 0);
00573                 }
00574         }
00575         SetAlignmentLength( this->AlignmentLength() - crop_amount );
00576         this->create_bitcount();
00577         if( !this->validate() )
00578                 std::cerr << "CropEnd error\n";
00579 }
00580 
00581 template< class BaseType >
00582 void CompactGappedAlignment<BaseType>::CropLeft(gnSeqI crop_amount, uint seqI)
00583 {
00584         if( crop_amount == 0 )
00585                 return;
00586 
00587         gnSeqI pre_len = this->Length(seqI);
00588         // count "crop_amount" characters into seqI and crop there
00589         if( this->Orientation(seqI) == AbstractMatch::forward )
00590         {
00591                 size_t left_col = this->SeqPosToColumn(crop_amount, align_matrix[seqI], bcount[seqI]) + 1;
00592                 this->CropStart(left_col);
00593         }else{
00594                 size_t left_col = this->SeqPosToColumn(this->Length(seqI) - crop_amount + 1, align_matrix[seqI], bcount[seqI]);
00595                 if( left_col > 4000000000u )
00596                 {
00597                         std::cerr << this->LeftEnd(seqI) << std::endl;
00598                         std::cerr << this->LeftEnd(0) << std::endl;
00599                         std::cerr << "bogus cropper cga\n";
00600                 }
00601                 this->CropEnd(this->AlignmentLength()-left_col);
00602         }
00603         if( this->Length(seqI) != pre_len - crop_amount )
00604         {
00605                 std::cerr << this->LeftEnd(seqI) << std::endl;
00606                 std::cerr << this->LeftEnd(0) << std::endl;
00607                 std::cerr << "bad cropperLeftie\n";
00608         }
00609         if( !this->validate() )
00610                 std::cerr << "CropLeft error\n";
00611 }
00612 
00613 template< class BaseType >
00614 void CompactGappedAlignment<BaseType>::CropRight(gnSeqI crop_amount, uint seqI)
00615 {
00616         if( crop_amount == 0 )
00617                 return;
00618 
00619         gnSeqI pre_len = this->Length(seqI);
00620         gnSeqI pre_lend = this->LeftEnd(seqI);
00621         gnSeqI pre_lend0 = this->LeftEnd(0);
00622         if( this->Orientation(seqI) == AbstractMatch::forward )
00623         {
00624                 // count "crop_amount" characters into seqI and crop there
00625                 size_t right_col = this->SeqPosToColumn(this->Length(seqI) - crop_amount + 1, align_matrix[seqI], bcount[seqI]);
00626                 this->CropEnd( this->AlignmentLength()-right_col );
00627         }else
00628         {
00629                 size_t right_col = this->SeqPosToColumn(crop_amount, align_matrix[seqI], bcount[seqI]) + 1;
00630                 if( right_col > 4000000000u )
00631                 {
00632                         std::cerr << this->LeftEnd(seqI) << std::endl;
00633                         std::cerr << this->LeftEnd(0) << std::endl;
00634                         std::cerr << "bogus cropper cga\n";
00635                 }
00636                 this->CropStart( right_col );
00637         }
00638         if( this->Length(seqI) != pre_len - crop_amount )
00639         {
00640                 std::cerr << this->LeftEnd(seqI) << std::endl;
00641                 std::cerr << this->LeftEnd(0) << std::endl;
00642                 std::cerr << "bad cropperight\n";
00643         }
00644         if( !this->validate() )
00645                 std::cerr << "CropRight error\n";
00646 }
00647 
00648 template< class BaseType >
00649 void CompactGappedAlignment<BaseType>::GetColumn( gnSeqI col, std::vector<gnSeqI>& pos, std::vector<bool>& column ) const
00650 {
00651         pos = std::vector<gnSeqI>(this->SeqCount(), NO_MATCH);
00652         column = std::vector<bool>(this->SeqCount(), false);
00653         for( uint seqI = 0; seqI < this->SeqCount(); seqI++ )
00654         {
00655                 if( align_matrix[seqI][col] )
00656                         column[seqI] = true;
00657 
00658                 gnSeqI count = 0;
00659                 if( this->LeftEnd(seqI) != NO_MATCH )
00660                 {
00661                         size_t col_index = col / INDEX_INTERVAL;
00662                         for( size_t i = col_index * INDEX_INTERVAL; i <= col; i++ )
00663                                 count += align_matrix[seqI].test(i);
00664                         count += bcount[seqI][col_index];
00665                 }
00666 
00667                 if( count > 0 && this->Orientation(seqI) == AbstractMatch::forward )
00668                         pos[seqI] = this->LeftEnd(seqI) + count - 1;
00669                 else if( this->Orientation(seqI) == AbstractMatch::reverse && !(count == this->Length(seqI) && !column[seqI]) )
00670                         pos[seqI] = this->RightEnd(seqI) - count + 1;
00671         }
00672 }
00673 
00674 template< class BaseType >
00675 void CompactGappedAlignment<BaseType>::copyRange( CompactGappedAlignment& dest, gnSeqI left_column, gnSeqI length )
00676 {
00677         if( left_column + length > this->AlignmentLength() )
00678                 Throw_gnEx( genome::SeqIndexOutOfBounds() );
00679 //      if( length == 0 )
00680 //              return;
00681 
00682         // first copy the coordinates
00683         dest = CompactGappedAlignment(this->SeqCount(), length);
00684         for( uint i=0; i < this->SeqCount(); i++ ){
00685                 dest.SetStart(i, this->Start(i));
00686                 if( this->Orientation(i) != AbstractMatch::undefined )
00687                         dest.SetLength(this->Length(i), i);
00688         }
00689         // then trim the coordinates appropriately
00690 
00691         gnSeqI pre_alignlen = this->AlignmentLength();
00692         gnSeqI pre_lend0 = this->LeftEnd(0);
00693 
00694         std::vector< bitset_t > dest_mat(this->SeqCount(), bitset_t(length));
00695         std::vector<gnSeqI> pos;
00696         std::vector<bool> column;
00697         std::vector<gnSeqI> left_cc(this->SeqCount(), 0);
00698         if( left_column > 0 )
00699         {
00700                 this->GetColumn( left_column-1, pos, column );
00701                 for( uint i=0; i < this->SeqCount(); i++ ){
00702                         if( this->LeftEnd(i) == NO_MATCH )
00703                                 continue;
00704 
00705                         size_t char_count = this->Orientation(i) == AbstractMatch::forward ? pos[i] - this->LeftEnd(i) + 1 : this->RightEnd(i) - pos[i] + 1;
00706                         if( pos[i] > 0 && char_count > 0 )
00707                         {
00708                                 left_cc[i] = char_count;
00709                                 if( dest.Orientation(i) == AbstractMatch::forward )
00710                                         dest.SetStart(i, dest.Start(i) + char_count);
00711                         }else if( pos[i] == 0 && dest.Orientation(i) == AbstractMatch::reverse )
00712                         {
00713                                 // this sequence was completely obliterated by the crop
00714                                 dest.SetStart(i, NO_MATCH);
00715                         }
00716                 }
00717         }
00718 
00719 // now trim up the right side...
00720         gnSeqI right_trim = this->AlignmentLength() - left_column - length;
00721 
00722         if( right_trim > 0 )
00723         {
00724                 this->GetColumn( this->AlignmentLength()-right_trim, pos, column );
00725 
00726                 for( uint i=0; i < this->SeqCount(); i++ ){
00727                         if( this->LeftEnd(i) == NO_MATCH )
00728                                 continue;
00729                         AbstractMatch::orientation orient = this->Orientation(i);
00730                         if( pos[i] > 0 )
00731                         {
00732                                 gnSeqI char_count = pos[i] - (orient == AbstractMatch::forward ? (column[i] ? 1 : 0 ) : (column[i] ? 0 : 1 ) );
00733                                 char_count = orient == AbstractMatch::forward ? char_count - this->LeftEnd(i) + 1 : this->RightEnd(i) - char_count;
00734                                 char_count -= left_cc[i];
00735                                 gnSeqI deleted = this->Length(i) - char_count;
00736                                 if( dest.Start(i) < 0 )
00737                                         dest.SetStart(i, dest.Start(i)-deleted+left_cc[i]);     // fixme: is this off-by-one?
00738                         }else if( orient == AbstractMatch::forward ){
00739                                 dest.SetStart(i, NO_MATCH);
00740                         }
00741                 }
00742         }
00743 
00744         for( size_t i = 0; i < dest_mat.size(); ++i )
00745         {
00746                 size_t count = 0;
00747                 for( size_t j = 0; j < length; ++j )
00748                 {
00749                         if(align_matrix[i].test(j+left_column))
00750                         {
00751                                 dest_mat[i].set(j, true);
00752                                 ++count;
00753                         }
00754                 }
00755                 dest.SetLength(count, i);
00756                 if( count == 0 )
00757                         dest.SetStart(i, NO_MATCH);
00758         }
00759         dest.SetAlignment(dest_mat);
00760 
00761         dest.create_bitcount();
00762         if( !dest.validate() )
00763         {
00764                 std::cerr << "pre_lend0: " << pre_lend0 << std::endl;
00765                 std::cerr << "pre_alignlen: " << pre_alignlen << std::endl;
00766                 std::cerr << "CropStart error\n";
00767         }
00768 
00769 }
00770 
00771 template< class BaseType >
00772 void CompactGappedAlignment<BaseType>::CondenseGapColumns()
00773 {
00774         const size_t len = this->AlignmentLength();
00775         size_t d = 0;   // destination index
00776         for( size_t i = 0; i < len; ++i )
00777         {
00778                 size_t seqI = 0;
00779                 // check whether this is a gap col
00780                 for( ; seqI < align_matrix.size(); ++seqI )
00781                         if( this->LeftEnd(seqI) != 0 && align_matrix[seqI].test(i) )
00782                                 break;
00783 
00784                 // copy if not a gap col (and i != d )
00785                 if( seqI < align_matrix.size() )
00786                 {
00787                         if( i != d )
00788                         {
00789                                 for( seqI = 0; seqI < align_matrix.size(); ++seqI )
00790                                         align_matrix[seqI].set( d, align_matrix[seqI].test(i)  );
00791                         }
00792                         d++;
00793                 }
00794                 else
00795                         std::cout << "";
00796         }
00797         this->SetAlignmentLength(d);
00798         for( size_t seqI = 0; seqI < align_matrix.size(); ++seqI )
00799         {
00800                 align_matrix[seqI].resize(d);
00801                 align_matrix[seqI] = align_matrix[seqI];        // force reallocation on "optimized" windows builds
00802         }
00803         this->create_bitcount();
00804 }
00805 
00806 
00807 }
00808 
00809 namespace std {
00810 template<> inline
00811 void swap( mems::CompactGappedAlignment<>& a, mems::CompactGappedAlignment<>& b )
00812 {
00813         a.swap(b);
00814 }
00815 }
00816 
00817 
00818 #endif // __CompactGappedAlignment_h__
00819 

Generated on Fri Mar 14 06:01:02 2008 for libMems by doxygen 1.3.6