libMems/Islands.cpp

Go to the documentation of this file.
00001 /*******************************************************************************
00002  * $Id: Islands.cpp,v 1.12 2004/04/19 23:11:19 darling Exp $
00003  * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
00004  * Please see the file called COPYING for licensing, copying, and modification
00005  * Please see the file called COPYING for licensing details.
00006  * **************
00007  ******************************************************************************/
00008 
00009 #ifdef HAVE_CONFIG_H
00010 #include "config.h"
00011 #endif
00012 
00013 #include "libMems/Islands.h"
00014 #include "libMems/Aligner.h"
00015 #include "libMems/GappedAlignment.h"
00016 
00017 using namespace std;
00018 using namespace genome;
00019 namespace mems {
00020 
00025 void simpleFindIslands( IntervalList& iv_list, uint island_size, ostream& island_out ){
00026         vector< Island > island_list;
00027         simpleFindIslands( iv_list, island_size, island_list );
00028         for( size_t isleI = 0; isleI < island_list.size(); isleI++ )
00029         {
00030                 Island& i = island_list[isleI];
00031                 island_out << i.seqI << '\t' << i.leftI << '\t' << i.rightI << '\t' 
00032                                 << i.seqJ << '\t' << i.leftJ << '\t' << i.rightJ << endl;
00033         }
00034 }
00035 
00036 
00037 void simpleFindIslands( IntervalList& iv_list, uint island_size, vector< Island >& island_list ){
00038         if( iv_list.size() == 0 )
00039                 return;
00040         for( uint iv_listI = 0; iv_listI < iv_list.size(); iv_listI++ ){
00041                 Interval& iv = iv_list[ iv_listI ];
00042                 gnAlignedSequences gnas;
00043                 iv.GetAlignedSequences( gnas, iv_list.seq_table );
00044                 uint seq_count = iv_list.seq_table.size();
00045                 
00046                 for( uint seqI = 0; seqI < seq_count; seqI++ ){
00047                         uint seqJ;
00048                         for( seqJ = seqI + 1; seqJ < seq_count; seqJ++ ){
00049                                 uint columnI = 0;
00050                                 gnSeqI curI = 0;
00051                                 gnSeqI curJ = 0;
00052                                 gnSeqI lastI = 0;
00053                                 gnSeqI lastJ = 0;
00054                                 for( columnI = 0; columnI < gnas.alignedSeqsSize(); columnI++ ){
00055                                         if( gnas.sequences[ seqI ][ columnI ] != '-' )
00056                                                 curI++;
00057                                         if( gnas.sequences[ seqJ ][ columnI ] != '-' )
00058                                                 curJ++;
00059                                         if( toupper( gnas.sequences[ seqI ][ columnI ] ) == 
00060                                                 toupper( gnas.sequences[ seqJ ][ columnI ] ) &&
00061                                                 gnas.sequences[ seqJ ][ columnI ] != '-' ){
00062                                                 // check for an island that was big enough
00063                                                 if( curI - lastI > island_size ||
00064                                                         curJ - lastJ > island_size ){
00065                                                         int64 leftI = iv.Start( seqI );
00066                                                         int64 rightI = leftI < 0 ? leftI - curI : leftI + curI;
00067                                                         leftI = leftI < 0 ? leftI - lastI : leftI + lastI;
00068                                                         int64 leftJ = iv.Start( seqJ );
00069                                                         int64 rightJ = leftJ < 0 ? leftJ - curJ : leftJ + curJ;
00070                                                         leftJ = leftJ < 0 ? leftJ - lastJ : leftJ + lastJ;
00071                                                         Island isle;
00072                                                         isle.seqI = seqI;
00073                                                         isle.seqJ = seqJ;
00074                                                         isle.leftI = leftI;
00075                                                         isle.leftJ = leftJ;
00076                                                         isle.rightI = rightI;
00077                                                         isle.rightJ = rightJ;
00078                                                         island_list.push_back(isle);
00079                                                 }
00080                                                 
00081                                                 lastI = curI;
00082                                                 lastJ = curJ;
00083                                         }
00084                                 }
00085                         }
00086                 }
00087         }
00088 }
00089 
00090 
00096 void simpleFindBackbone( IntervalList& iv_list, uint backbone_size, uint max_gap_size, vector< GappedAlignment >& backbone_regions ){
00097         if( iv_list.size() == 0 )
00098                 return;
00099         for( uint iv_listI = 0; iv_listI < iv_list.size(); iv_listI++ ){
00100                 Interval& iv = iv_list[ iv_listI ];
00101                 gnAlignedSequences gnas;
00102                 uint seqI;
00103                 uint seq_count = iv_list.seq_table.size();
00104                 vector< int64 > positions( seq_count );
00105                 vector< int64 > starts( seq_count );
00106                 vector< int64 > ends( seq_count );
00107                 vector< uint > gap_size( seq_count, 0 );
00108                 uint seqJ;
00109                 gnSeqI bb_start_col = 0;
00110                 gnSeqI bb_end_col = 0;
00111                 GappedAlignment cur_backbone( seq_count, 0 );
00112                 
00113                 // initialize positions and starts
00114                 for( seqI = 0; seqI < seq_count; seqI++ ){
00115                         positions[ seqI ] = iv_list[ iv_listI ].Start( seqI );
00116                         if( positions[ seqI ] < 0 )
00117                                 positions[ seqI ] -= iv_list[ iv_listI ].Length( seqI ) + 1;
00118                 }
00119                 starts = positions;
00120                 ends = positions;
00121 
00122                 iv.GetAlignedSequences( gnas, iv_list.seq_table );
00123                 bool backbone = true;   // assume we are starting out with a complete alignment column
00124                 uint columnI = 0;
00125                 vector< int64 > prev_positions;
00126                 for( ; columnI < gnas.alignedSeqsSize(); columnI++ ){
00127                         bool no_gaps = true;
00128                         prev_positions = positions;
00129                         for( seqI = 0; seqI < seq_count; seqI++ ){
00130                                 char cur_char = gnas.sequences[ seqI ][ columnI ];
00131                                 if( cur_char != '-' && toupper(cur_char) != 'N' ){
00132                                         if( gap_size[ seqI ] > max_gap_size && backbone ){
00133                                                 // end a stretch of backbone here only
00134                                                 // if the backbone meets size requirements in each
00135                                                 // sequence.
00136                                                 for( seqJ = 0; seqJ < seq_count; seqJ++ ){
00137                                                         if( ends[ seqJ ] - starts[ seqJ ] < backbone_size ){
00138                                                                 break;
00139                                                         }
00140                                                 }
00141                                                 if( seqJ == seq_count ) {
00142                                                         // it's a legitimate stretch of backbone
00143                                                         backbone_regions.push_back( cur_backbone );
00144                                                         uint bbI = backbone_regions.size() - 1;
00145                                                         vector< string > aln_mat( seq_count );
00146                                                         for( seqJ = 0; seqJ < seq_count; seqJ++ ){
00147                                                                 if( starts[ seqJ ] < 0 )
00148                                                                         backbone_regions[ bbI ].SetStart( seqJ, ends[ seqJ ] + 1);
00149                                                                 else
00150                                                                         backbone_regions[ bbI ].SetStart( seqJ, starts[ seqJ ] );
00151                                                                 backbone_regions[ bbI ].SetLength( ends[ seqJ ] - starts[ seqJ ], seqJ );
00152                                                                 aln_mat[ seqJ ] = gnas.sequences[ seqJ ].substr( bb_start_col, bb_end_col - bb_start_col + 1);
00153                                                         }
00154                                                         backbone_regions[ bbI ].SetAlignment(aln_mat);
00155                                                         
00156                                                 }
00157                                                 // we either just finished backbone or a short area that didn't
00158                                                 // qualify as backbone
00159                                                 // look for a new backbone region
00160                                                 backbone = false;
00161                                         }
00162                                         positions[ seqI ]++;
00163                                         gap_size[ seqI ] = 0;
00164                                 }else{
00165                                         gap_size[ seqI ]++;
00166                                         no_gaps = false;
00167                                 }
00168                         }
00169                         if( no_gaps ){
00170                                 bb_end_col = columnI;
00171                                 ends = positions;
00172                                 if( !backbone ){
00173                                         starts = prev_positions;
00174                                         bb_start_col = columnI;
00175                                         backbone = true;
00176                                 }
00177                         }
00178                 }
00179 
00180                 // check for backbone one last time
00181                 for( seqJ = 0; seqJ < seq_count; seqJ++ ){
00182                         if( ends[ seqJ ] - starts[ seqJ ] < backbone_size ){
00183                                 break;
00184                         }
00185                 }
00186                 if( seqJ == seq_count ) {
00187                         // it's a legitimate stretch of backbone
00188                         backbone_regions.push_back( cur_backbone );
00189                         uint bbI = backbone_regions.size() - 1;
00190                         vector< string > aln_mat( seq_count );
00191                         for( seqJ = 0; seqJ < seq_count; seqJ++ ){
00192                                 if( starts[ seqJ ] < 0 )
00193                                         backbone_regions[ bbI ].SetStart( seqJ, ends[ seqJ ] + 1);
00194                                 else
00195                                         backbone_regions[ bbI ].SetStart( seqJ, starts[ seqJ ] );
00196                                 backbone_regions[ bbI ].SetLength( ends[ seqJ ] - starts[ seqJ ], seqJ );
00197                                 aln_mat[ seqJ ] = gnas.sequences[ seqJ ].substr( bb_start_col, bb_end_col - bb_start_col + 1);
00198                         }
00199                         backbone_regions[ bbI ].SetAlignment( aln_mat );
00200                 }
00201         }
00202 }
00203 
00204 
00205 void outputBackbone( const vector< GappedAlignment >& backbone_regions, ostream& backbone_out ){
00206         for( uint bbI = 0; bbI < backbone_regions.size(); bbI++ ){
00207                 for( uint seqJ = 0; seqJ < backbone_regions[ bbI ].SeqCount(); seqJ++ ){
00208                         if( seqJ > 0 )
00209                                 backbone_out << '\t';
00210                         int64 bb_rend = backbone_regions[ bbI ].Start( seqJ );
00211                         if( backbone_regions[ bbI ].Start( seqJ ) < 0 )
00212                                 bb_rend -= (int64)backbone_regions[ bbI ].Length( seqJ );
00213                         else
00214                                 bb_rend += (int64)backbone_regions[ bbI ].Length( seqJ );
00215                         backbone_out << backbone_regions[ bbI ].Start( seqJ ) << '\t' <<  bb_rend;
00216                 }
00217                 backbone_out << endl;
00218         }
00219 }
00220 
00221 
00222 // always return the left end of the one to the left and the right of the one to the right
00223 
00224 void getGapBounds( vector<gnSeqI>& seq_lengths, vector< LCB >& adjacencies, uint seqJ, int leftI, int rightI, int64& left_start, int64& right_start ){
00225         if( rightI != -1 )
00226                 right_start = absolut( adjacencies[ rightI ].left_end[ seqJ ] );
00227         else
00228                 right_start = seq_lengths[seqJ] + 1;
00229         
00230         if( leftI != -1 )
00231                 left_start = absolut( adjacencies[ leftI ].right_end[ seqJ ] );
00232         else
00233                 left_start = 1;
00234 }
00235 
00236 
00237 void addUnalignedIntervals( IntervalList& iv_list, set< uint > seq_set, vector<gnSeqI> seq_lengths ){
00238         vector< LCB > adjacencies;
00239         vector< int64 > weights;
00240         uint lcbI;
00241         uint seqI;
00242         if( seq_lengths.size() == 0 )
00243                 for( seqI = 0; seqI < iv_list.seq_table.size(); seqI++ )
00244                         seq_lengths.push_back(iv_list.seq_table[seqI]->length());
00245 
00246         uint seq_count = seq_lengths.size();
00247 
00248 
00249         if( seq_set.size() == 0 )
00250         {
00251                 // if an empty seq set was passed then assume all seqs
00252                 // should be processed
00253                 for( seqI = 0; seqI < seq_count; seqI++ )
00254                         seq_set.insert( seqI );
00255         }
00256         
00257         weights = vector< int64 >( iv_list.size(), 0 );
00258         computeLCBAdjacencies_v2( iv_list, weights, adjacencies );
00259 
00260         vector< int > rightmost;
00261         for( seqI = 0; seqI < seq_count; seqI++ ){
00262                 rightmost.push_back( -1 );
00263         }
00264         for( lcbI = 0; lcbI <= adjacencies.size(); lcbI++ ){
00265                 set< uint >::iterator seq_set_iterator = seq_set.begin();
00266                 for( ; seq_set_iterator != seq_set.end(); seq_set_iterator++ ){
00267                         seqI = *seq_set_iterator;
00268                         // scan left
00269                         int leftI;
00270                         if( lcbI < adjacencies.size() ){
00271 // left is always to the left!!
00272                                 leftI = adjacencies[ lcbI ].left_adjacency[ seqI ];
00273                         }else
00274                                 leftI = rightmost[ seqI ];
00275 
00276                         int rightI = lcbI < adjacencies.size() ? lcbI : -1;
00277 // right is always to the right!!
00278                         if( lcbI < adjacencies.size() )
00279                                 if( adjacencies[ lcbI ].right_adjacency[ seqI ] == -1 )
00280                                         rightmost[ seqI ] = lcbI;
00281                         
00282                         int64 left_start, right_start;
00283                         getGapBounds( seq_lengths, adjacencies, seqI, leftI, rightI, left_start, right_start );
00284                         int64 gap_len =  absolut( right_start ) - absolut( left_start );
00285                         if( gap_len > 0 ){
00286                                 Match mm( seq_count );
00287                                 Match* m = mm.Copy();
00288                                 for( uint seqJ = 0; seqJ < seq_count; seqJ++ ){
00289                                         m->SetStart( seqJ, 0 );
00290                                 }
00291                                 m->SetStart( seqI, left_start );
00292                                 m->SetLength( gap_len );
00293                                 vector<AbstractMatch*> tmp(1, m);
00294                                 iv_list.push_back( Interval(tmp.begin(), tmp.end()) );
00295                                 m->Free();
00296                         }
00297                 }
00298         }
00299 }
00300 
00301 
00302 void findIslandsBetweenLCBs( IntervalList& iv_list, uint island_size, ostream& island_out ){
00303         IntervalList iv_list_tmp = iv_list;
00304         addUnalignedIntervals( iv_list_tmp );
00305         uint seq_count = iv_list.seq_table.size();
00306         
00307         for( int ivI = iv_list.size(); ivI < iv_list_tmp.size(); ivI++ ){
00308                 for( uint seqI = 0; seqI < seq_count; seqI++ ){
00309                         if( iv_list_tmp[ ivI ].Length( seqI ) < island_size )
00310                                 continue;
00311 
00312                         // this is an island, write the LCB island out
00313                         gnSeqI left_end = absolut( iv_list_tmp[ ivI ].Start( seqI ) );
00314                         gnSeqI right_end = left_end + iv_list_tmp[ ivI ].Length( seqI ) - 1;
00315                         island_out << "LCB island:\t" << seqI << '\t' << left_end << '\t' << right_end << endl;
00316                 }
00317         }
00318 }
00319 
00320 }

Generated on Fri Mar 14 06:01:03 2008 for libMems by doxygen 1.3.6