src/bbBreakOnGenes.cpp

Go to the documentation of this file.
00001 #include "libMems/Backbone.h"
00002 #include "libMems/ProgressiveAligner.h"
00003 #include <sstream>
00004 using namespace mems;
00005 using namespace std;
00006 using namespace genome;
00007 
00008 
00009 template< typename MatchVector >
00010 void getBpList( MatchVector& mvect, uint seq, vector< gnSeqI >& bp_list )
00011 {
00012         bp_list.clear();
00013         for( size_t ivI = 0; ivI < mvect.size(); ivI++ )
00014         {
00015                 if( mvect[ivI]->LeftEnd(seq) == NO_MATCH )
00016                         continue;
00017                 bp_list.push_back( mvect[ivI]->LeftEnd(seq) );
00018                 bp_list.push_back( mvect[ivI]->RightEnd(seq)+1 );
00019         }
00020         std::sort( bp_list.begin(), bp_list.end() );
00021 }
00022 
00023 template< typename MatchVector >
00024 void createMap( const MatchVector& mv_from, const MatchVector& mv_to, vector< size_t >& map )
00025 {
00026         typedef typename MatchVector::value_type MatchPtr;
00027         vector< pair< MatchPtr, size_t > > m1(mv_from.size());
00028         vector< pair< MatchPtr, size_t > > m2(mv_to.size());
00029         for( size_t i = 0; i < mv_from.size(); ++i )
00030                 m1[i] = make_pair( mv_from[i], i );
00031         for( size_t i = 0; i < mv_to.size(); ++i )
00032                 m2[i] = make_pair( mv_to[i], i );
00033         std::sort( m1.begin(), m1.end() );
00034         std::sort( m2.begin(), m2.end() );
00035         map.resize( m1.size() );
00036         for( size_t i = 0; i < m1.size(); ++i )
00037                 map[m1[i].second] = m2[i].second;
00038 }
00039 
00040 
00041 void makeAllPairwiseGenomeHSSBreakOnGenes( IntervalList& iv_list, vector< CompactGappedAlignment<>* >& iv_ptrs, vector< CompactGappedAlignment<>* >& iv_orig_ptrs, pairwise_genome_hss_t& hss_cols, const HssDetector* detector, vector< vector< gnSeqI > >& gene_bounds )
00042 {
00043         uint seq_count = iv_list.seq_table.size();
00044         // make pairwise projections of intervals and find LCBs...
00045         for( size_t seqI = 0; seqI < seq_count; ++seqI )
00046         {
00047                 for( size_t seqJ = seqI+1; seqJ < seq_count; ++seqJ )
00048                 {
00049                         vector< uint > projection;
00050                         projection.push_back( seqI );
00051                         projection.push_back( seqJ );
00052                         vector< vector< MatchProjectionAdapter* > > LCB_list;
00053                         vector< LCB > projected_adjs;
00054                         projectIntervalList( iv_list, projection, LCB_list, projected_adjs );
00055                         // make intervals
00056                         IntervalList pair_ivs;
00057                         pair_ivs.seq_table.push_back( iv_list.seq_table[seqI] );
00058                         pair_ivs.seq_table.push_back( iv_list.seq_table[seqJ] );
00059                         pair_ivs.resize( LCB_list.size() );
00060                         for( size_t lcbI = 0; lcbI < LCB_list.size(); ++lcbI )
00061                                 pair_ivs[lcbI].SetMatches( LCB_list[lcbI] );
00062                         LCB_list.clear();
00063 
00064                         vector< CompactGappedAlignment<>* > pair_cgas( pair_ivs.size() );
00065                         for( size_t lcbI = 0; lcbI < pair_ivs.size(); ++lcbI )
00066                         {
00067                                 CompactGappedAlignment<> tmp_cga;
00068                                 pair_cgas[lcbI] = tmp_cga.Copy();
00069                                 new (pair_cgas[lcbI])CompactGappedAlignment<>( pair_ivs[lcbI] );
00070                         }
00071 
00072                         vector< CompactGappedAlignment<>* > hss_list;
00073                         // now find islands
00074                         hss_array_t hss_array;
00075                         (*detector)( pair_cgas, pair_ivs.seq_table, hss_array );
00076                         HssArrayToCga(pair_cgas, pair_ivs.seq_table, hss_array, hss_list);
00077 
00078                         for( size_t cgaI = 0; cgaI < pair_cgas.size(); ++cgaI )
00079                                 pair_cgas[cgaI]->Free();
00080                         pair_cgas.clear();
00081 
00082                         // now split up on iv boundaries
00083                         vector< gnSeqI > bp_list;
00084                         getBpList( iv_ptrs, seqI, bp_list );
00085                         GenericMatchSeqManipulator< CompactGappedAlignment<> > gmsm(0);
00086                         SingleStartComparator< CompactGappedAlignment<> > ssc(0);
00087                         std::sort(hss_list.begin(), hss_list.end(), ssc );
00088                         applyBreakpoints( bp_list, hss_list, gmsm );
00089                         // break on gene bounds in seqI
00090                         std::sort(hss_list.begin(), hss_list.end(), ssc );
00091 //                      if( !(seqI == 1 && seqJ == 15 ) )
00092                         applyBreakpoints( gene_bounds[seqI], hss_list, gmsm );
00093                         // and again on seqJ
00094                         getBpList( iv_ptrs, seqJ, bp_list );
00095                         GenericMatchSeqManipulator< CompactGappedAlignment<> > gmsm1(1);
00096                         SingleStartComparator< CompactGappedAlignment<> > ssc1(1);
00097                         std::sort(hss_list.begin(), hss_list.end(), ssc1 );
00098                         applyBreakpoints( bp_list, hss_list, gmsm1 );
00099                         // break on gene bounds in seqJ
00100                         std::sort(hss_list.begin(), hss_list.end(), ssc1 );
00101 //                      if( !(seqI == 1 && seqJ == 15 ) )
00102                         applyBreakpoints( gene_bounds[seqJ], hss_list, gmsm1 );
00103 
00104                         // now transform into interval-specific columns
00105                         std::sort(hss_list.begin(), hss_list.end(), ssc );
00106 
00107                         SingleStartComparator< CompactGappedAlignment<> > ivcomp(seqI);
00108                         std::sort( iv_ptrs.begin(), iv_ptrs.end(), ivcomp );
00109                         vector< size_t > iv_map;
00110                         createMap( iv_ptrs, iv_orig_ptrs, iv_map );
00111                         size_t ivI = 0;
00112                         while( ivI < iv_ptrs.size() && iv_ptrs[ivI]->LeftEnd(0) == NO_MATCH )
00113                                 ++ivI;
00114                         for( size_t hssI = 0; hssI < hss_list.size(); ++hssI )
00115                         {
00116                                 if( hss_list[hssI]->LeftEnd(0) == NO_MATCH || hss_list[hssI]->Length(0) == 0 )
00117                                         continue;
00118                                 if( ivI == iv_ptrs.size() )
00119                                 {
00120                                         cerr << "huh?\n";
00121                                         cerr << hss_list[hssI]->LeftEnd(0) << endl;
00122                                         cerr << hss_list[hssI]->RightEnd(0) << endl;
00123                                         cerr << iv_ptrs.back()->LeftEnd(seqI) << endl;
00124                                         cerr << iv_ptrs.back()->RightEnd(seqI) << endl;
00125                                 }
00126                                 while( ivI < iv_ptrs.size() && 
00127                                         (iv_ptrs[ivI]->LeftEnd(seqI) == NO_MATCH ||
00128                                         hss_list[hssI]->LeftEnd(0) > iv_ptrs[ivI]->RightEnd(seqI) ) )
00129                                         ++ivI;
00130                                 if( ivI == iv_ptrs.size() )
00131                                 {
00132                                         cerr << "hssI fit!!\n";
00133                                         genome::breakHere();
00134                                 }
00135                                 // check for containment in seqJ
00136                                 if( iv_ptrs[ivI]->LeftEnd(seqJ) == NO_MATCH ||
00137                                         iv_ptrs[ivI]->RightEnd(seqJ) < hss_list[hssI]->LeftEnd(1) ||
00138                                         hss_list[hssI]->RightEnd(1) < iv_ptrs[ivI]->LeftEnd(seqJ) )
00139                                         continue;       // this hss falls to an invalid range in seqJ
00140 
00141                                 if( hss_list[hssI]->RightEnd(0) < iv_ptrs[ivI]->LeftEnd(seqI) )
00142                                 {
00143                                         cerr << "huh 2?\n";
00144                                         cerr << hss_list[hssI]->LeftEnd(0) << endl;
00145                                         cerr << hss_list[hssI]->RightEnd(0) << endl;
00146                                         cerr << iv_ptrs[ivI]->LeftEnd(seqI) << endl;
00147                                         cerr << iv_ptrs[ivI]->RightEnd(seqI) << endl;
00148                                         hssI++;
00149                                         continue;
00150                                 }
00151 
00152                                 vector< pair< size_t, size_t > >& cur_hss_cols = hss_cols[seqI][seqJ][iv_map[ivI]];
00153 
00154                                 gnSeqI left_col = iv_ptrs[ivI]->SeqPosToColumn( seqI, hss_list[hssI]->LeftEnd(0) );
00155                                 gnSeqI right_col = iv_ptrs[ivI]->SeqPosToColumn( seqI, hss_list[hssI]->RightEnd(0) );
00156                                 if(left_col > right_col && iv_ptrs[ivI]->Orientation(seqI) == AbstractMatch::reverse )
00157                                 {
00158                                         swap(left_col, right_col);      // must have been a revcomp seq
00159                                 }
00160                                 else if(left_col > right_col)
00161                                 {
00162                                         cerr << "bad cols\n";
00163                                         cerr << hss_list[hssI]->LeftEnd(0) << endl;
00164                                         cerr << hss_list[hssI]->RightEnd(0) << endl;
00165                                         cerr << iv_ptrs[ivI]->LeftEnd(seqI) << endl;
00166                                         cerr << iv_ptrs[ivI]->RightEnd(seqI) << endl;
00167                                         genome::breakHere();
00168                                 }
00169 
00170                                 if( left_col > 2000000000 || right_col > 2000000000 )
00171                                 {
00172                                         cerr << "huh 2?\n";
00173                                         cerr << hss_list[hssI]->LeftEnd(0) << endl;
00174                                         cerr << hss_list[hssI]->RightEnd(0) << endl;
00175                                         cerr << iv_ptrs[ivI]->LeftEnd(seqI) << endl;
00176                                         cerr << iv_ptrs[ivI]->RightEnd(seqI) << endl;
00177                                         genome::breakHere();
00178                                 }
00179                                 cur_hss_cols.push_back( make_pair( left_col, right_col ) );
00180                         }
00181                         for( size_t hssI = 0; hssI < hss_list.size(); ++hssI )
00182                                 hss_list[hssI]->Free();
00183                 }
00184         }
00185 }
00186 
00187 
00188 class IntervalSeqManipulator
00189 {
00190 public:
00191         IntervalSeqManipulator( uint seq ) : m_seq(seq) {}
00192         gnSeqI LeftEnd(Interval& m) const{ return m.LeftEnd(m_seq); }
00193         gnSeqI Length(Interval& m) const{ return m.Length(m_seq); }
00194         void CropLeft(Interval& m, gnSeqI amount ) const{ m.CropLeft(amount, m_seq); }
00195         void CropRight(Interval& m, gnSeqI amount ) const{ m.CropRight(amount, m_seq); }
00196         template< typename ContainerType >
00197         void AddCopy(ContainerType& c, Interval& m) const{ c.push_back( m ); }
00198 private:
00199         uint m_seq;
00200 };
00201 
00202 
00203 void detectBackboneBreakOnGenes( IntervalList& iv_list, backbone_list_t& bb_list, const HssDetector* detector, vector< CompactGappedAlignment<>* >& iv_orig_ptrs, vector< vector< gnSeqI > >& gene_bounds )
00204 {
00205         uint seq_count = iv_list.seq_table.size();
00206 
00207         // indexed by seqI, seqJ, ivI, hssI (left col, right col)
00208         pairwise_genome_hss_t hss_cols(boost::extents[seq_count][seq_count][iv_list.size()]);
00209 
00210         // ugg.  need CompactGappedAlignment for its SeqPosToColumn
00211         vector< CompactGappedAlignment<>* > iv_ptrs(iv_list.size());
00212         for( size_t i = 0; i < iv_list.size(); ++i )
00213         {
00214                 CompactGappedAlignment<> tmp_cga;
00215                 iv_ptrs[i] = tmp_cga.Copy();
00216                 new (iv_ptrs[i])CompactGappedAlignment<>( iv_list[i] );
00217         }
00218 
00219         iv_orig_ptrs = iv_ptrs;
00220         makeAllPairwiseGenomeHSSBreakOnGenes( iv_list, iv_ptrs, iv_orig_ptrs, hss_cols, detector, gene_bounds );
00221 
00222         // merge overlapping pairwise homology predictions into n-way predictions
00223         mergePairwiseHomologyPredictions( iv_orig_ptrs, hss_cols, bb_list );
00224 }
00225 
00226 int main( int argc, char* argv[] )
00227 {
00228 #if     WIN32
00229         SetPriorityClass(GetCurrentProcess(), BELOW_NORMAL_PRIORITY_CLASS);
00230 #endif
00231 
00232         if( argc < 4 )
00233         {
00234                 cerr << "bbBreakOnGenes <xmfa file> <min bb gap size> <bb output>\n";
00235                 return -1;
00236         }
00237         string xmfa_fname( argv[1] );
00238         int min_bb_gap = atoi( argv[2] );
00239         string output_fname( argv[3] );
00240 
00241         ifstream xmfa_input( xmfa_fname.c_str() );
00242         if( !xmfa_input.is_open() ){
00243                 cerr << "Error opening \"" << xmfa_fname << "\"" << endl;
00244                 return -4;
00245         }
00246         ofstream bb_output( output_fname.c_str() );
00247         if( !bb_output.is_open() ){
00248                 cerr << "Error opening \"" << output_fname << "\" for writing" << endl;
00249                 return -6;
00250         }
00251 
00252 
00253         // read the alignment
00254         IntervalList iv_list;
00255         iv_list.ReadStandardAlignment( xmfa_input );
00256         LoadSequences(iv_list, &cout);
00257         vector< vector< gnSeqI > > gene_bounds( iv_list.seq_table.size() );
00258 
00259         if( argc - 4 == iv_list.seq_filename.size() )
00260         {
00261                 cerr << "Reading gene coordinates from .ptt files\n";
00262                 // read ptt files instead
00263                 for( size_t aI = 0; aI < iv_list.seq_filename.size(); aI++ )
00264                 {
00265                         ifstream ptt_in( argv[aI+4] );
00266                         string bub;
00267                         getline( ptt_in, bub );
00268                         getline( ptt_in, bub );
00269                         getline( ptt_in, bub );
00270                         while( getline( ptt_in, bub ) )
00271                         {
00272                                 stringstream line_str(bub);
00273                                 string buf;
00274                                 getline( line_str, buf, '.' );
00275                                 int64 lend = atoi(buf.c_str());
00276                                 getline( line_str, buf, '.' );
00277                                 getline( line_str, buf );
00278                                 int64 rend = atoi(buf.c_str());
00279                                 gene_bounds[aI].push_back( lend -1);
00280                                 gene_bounds[aI].push_back( lend );
00281                                 gene_bounds[aI].push_back( rend );
00282                                 gene_bounds[aI].push_back( rend+1 );
00283         
00284                         }
00285                 }
00286         }else{
00287 
00288                 // get gene boundary coordinates, break bb segs on genes...
00289                 for( size_t genomeI = 0; genomeI < iv_list.seq_table.size(); genomeI++ )
00290                 {
00291                         for( size_t featureI = 0; featureI < iv_list.seq_table[genomeI]->getFeatureListLength(); ++featureI )
00292                         {
00293                                 gnBaseFeature* feat = iv_list.seq_table[genomeI]->getFeature( featureI );
00294                                 string feat_name = feat->GetName();
00295                                 if( feat_name != "CDS" )
00296                                         continue;       // don't deal with other feature types (source, misc_RNA, etc)
00297                                 gnLocation loc = feat->GetLocation(0);
00298                                 if( loc.GetFirst() > loc.GetLast() || loc.GetFirst() == 0 || loc.GetLast() == 0 )
00299                                         continue;       // a problem parsing annotation?
00300                                 gene_bounds[genomeI].push_back( loc.GetFirst() );
00301                                 gene_bounds[genomeI].push_back( loc.GetLast() +1 );
00302                         }
00303 //                      IntervalSeqManipulator ism(genomeI);
00304                         std::sort( gene_bounds[genomeI].begin(), gene_bounds[genomeI].end() );
00305                         cerr << "Found " << gene_bounds[genomeI].size() / 2 << " genes for " << iv_list.seq_filename[genomeI] << endl;
00306                 }
00307         }
00308 
00309         // detect big gaps
00310         backbone_list_t bb_list;
00311         vector< CompactGappedAlignment<>* > iv_orig_ptrs;
00312         BigGapsDetector bgd( min_bb_gap );
00313         detectBackboneBreakOnGenes( iv_list, bb_list, &bgd, iv_orig_ptrs, gene_bounds );
00314 
00315         writeBackboneSeqCoordinates( bb_list, iv_list, bb_output );
00316         std::vector< bb_seqentry_t > bb_seq_list;
00317         bb_output.close();
00318         std::ifstream bbseq_input( output_fname.c_str() );
00319         readBackboneSeqFile( bbseq_input, bb_seq_list );
00320 
00321         // testing:  check whether any gene boundaries are violated
00322         gene_bounds[0].push_back(31337);        // test the test:
00323         gene_bounds[0].push_back(31333);        // insert some bogus gene bounds to make sure
00324         gene_bounds[0].push_back(31341);        // they get found and reported
00325         gene_bounds[0].push_back(31345);
00326         for( uint seqI = 0; seqI < iv_list.seq_table.size(); seqI++ )
00327         {
00328                 cerr << "Checking seq " << seqI << " for errors\n";
00329                 std::sort( gene_bounds[seqI].begin(), gene_bounds[seqI].end() );
00330                 BbSeqEntrySorter bs(seqI);
00331                 std::sort( bb_seq_list.begin(), bb_seq_list.end(), bs );
00332                 size_t gI = 0;
00333                 size_t bI = 0;
00334                 cerr << gene_bounds[seqI].size() << " gene boundaries and " << bb_seq_list.size() << " bb segs\n";
00335                 for( ; gI < gene_bounds[seqI].size() && bI < bb_seq_list.size(); gI++ )
00336                 {
00337                         cout << "checking " << bb_seq_list[bI][seqI].first << ", " <<bb_seq_list[bI][seqI].second << endl;  
00338                         while( bI < bb_seq_list.size() && gene_bounds[seqI][gI] > abs(bb_seq_list[bI][seqI].second) )
00339                                 bI++;
00340                         if( bI == bb_seq_list.size() )
00341                                 break;
00342                         if(abs(bb_seq_list[bI][seqI].first) + 1 < gene_bounds[seqI][gI] && gene_bounds[seqI][gI] < abs(bb_seq_list[bI][seqI].second) - 1)
00343                         {
00344                                 cerr << "segment " <<bb_seq_list[bI][seqI].first << ", " <<bb_seq_list[bI][seqI].second << " violates gene boundary " << gene_bounds[seqI][gI] << " in seq " << seqI << endl;  
00345                         }else
00346                                 cout << "segment " <<bb_seq_list[bI][seqI].first << ", " <<bb_seq_list[bI][seqI].second << " is okay for " << gene_bounds[seqI][gI] << " in seq " << seqI << endl;  
00347                 }
00348         }
00349 
00350 //      mergeAdjacentSegments( bb_seq_list );
00351 //      addUniqueSegments( bb_seq_list );
00352         bbseq_input.close();
00353         bb_output.open(output_fname.c_str());
00354         writeBackboneSeqFile( bb_output, bb_seq_list );
00355 
00356         return 0;
00357 }
00358 

Generated on Mon Aug 19 06:00:40 2013 for mauveAligner by doxygen 1.3.6