00001 #include "libMems/Backbone.h"
00002 #include "libMems/ProgressiveAligner.h"
00003 #include <sstream>
00004 using namespace mems;
00005 using namespace std;
00006 using namespace genome;
00007
00008
00009 template< typename MatchVector >
00010 void getBpList( MatchVector& mvect, uint seq, vector< gnSeqI >& bp_list )
00011 {
00012 bp_list.clear();
00013 for( size_t ivI = 0; ivI < mvect.size(); ivI++ )
00014 {
00015 if( mvect[ivI]->LeftEnd(seq) == NO_MATCH )
00016 continue;
00017 bp_list.push_back( mvect[ivI]->LeftEnd(seq) );
00018 bp_list.push_back( mvect[ivI]->RightEnd(seq)+1 );
00019 }
00020 std::sort( bp_list.begin(), bp_list.end() );
00021 }
00022
00023 template< typename MatchVector >
00024 void createMap( const MatchVector& mv_from, const MatchVector& mv_to, vector< size_t >& map )
00025 {
00026 typedef typename MatchVector::value_type MatchPtr;
00027 vector< pair< MatchPtr, size_t > > m1(mv_from.size());
00028 vector< pair< MatchPtr, size_t > > m2(mv_to.size());
00029 for( size_t i = 0; i < mv_from.size(); ++i )
00030 m1[i] = make_pair( mv_from[i], i );
00031 for( size_t i = 0; i < mv_to.size(); ++i )
00032 m2[i] = make_pair( mv_to[i], i );
00033 std::sort( m1.begin(), m1.end() );
00034 std::sort( m2.begin(), m2.end() );
00035 map.resize( m1.size() );
00036 for( size_t i = 0; i < m1.size(); ++i )
00037 map[m1[i].second] = m2[i].second;
00038 }
00039
00040
00041 void makeAllPairwiseGenomeHSSBreakOnGenes( IntervalList& iv_list, vector< CompactGappedAlignment<>* >& iv_ptrs, vector< CompactGappedAlignment<>* >& iv_orig_ptrs, pairwise_genome_hss_t& hss_cols, const HssDetector* detector, vector< vector< gnSeqI > >& gene_bounds )
00042 {
00043 uint seq_count = iv_list.seq_table.size();
00044
00045 for( size_t seqI = 0; seqI < seq_count; ++seqI )
00046 {
00047 for( size_t seqJ = seqI+1; seqJ < seq_count; ++seqJ )
00048 {
00049 vector< uint > projection;
00050 projection.push_back( seqI );
00051 projection.push_back( seqJ );
00052 vector< vector< MatchProjectionAdapter* > > LCB_list;
00053 vector< LCB > projected_adjs;
00054 projectIntervalList( iv_list, projection, LCB_list, projected_adjs );
00055
00056 IntervalList pair_ivs;
00057 pair_ivs.seq_table.push_back( iv_list.seq_table[seqI] );
00058 pair_ivs.seq_table.push_back( iv_list.seq_table[seqJ] );
00059 pair_ivs.resize( LCB_list.size() );
00060 for( size_t lcbI = 0; lcbI < LCB_list.size(); ++lcbI )
00061 pair_ivs[lcbI].SetMatches( LCB_list[lcbI] );
00062 LCB_list.clear();
00063
00064 vector< CompactGappedAlignment<>* > pair_cgas( pair_ivs.size() );
00065 for( size_t lcbI = 0; lcbI < pair_ivs.size(); ++lcbI )
00066 {
00067 CompactGappedAlignment<> tmp_cga;
00068 pair_cgas[lcbI] = tmp_cga.Copy();
00069 new (pair_cgas[lcbI])CompactGappedAlignment<>( pair_ivs[lcbI] );
00070 }
00071
00072 vector< CompactGappedAlignment<>* > hss_list;
00073
00074 hss_array_t hss_array;
00075 (*detector)( pair_cgas, pair_ivs.seq_table, hss_array );
00076 HssArrayToCga(pair_cgas, pair_ivs.seq_table, hss_array, hss_list);
00077
00078 for( size_t cgaI = 0; cgaI < pair_cgas.size(); ++cgaI )
00079 pair_cgas[cgaI]->Free();
00080 pair_cgas.clear();
00081
00082
00083 vector< gnSeqI > bp_list;
00084 getBpList( iv_ptrs, seqI, bp_list );
00085 GenericMatchSeqManipulator< CompactGappedAlignment<> > gmsm(0);
00086 SingleStartComparator< CompactGappedAlignment<> > ssc(0);
00087 std::sort(hss_list.begin(), hss_list.end(), ssc );
00088 applyBreakpoints( bp_list, hss_list, gmsm );
00089
00090 std::sort(hss_list.begin(), hss_list.end(), ssc );
00091
00092 applyBreakpoints( gene_bounds[seqI], hss_list, gmsm );
00093
00094 getBpList( iv_ptrs, seqJ, bp_list );
00095 GenericMatchSeqManipulator< CompactGappedAlignment<> > gmsm1(1);
00096 SingleStartComparator< CompactGappedAlignment<> > ssc1(1);
00097 std::sort(hss_list.begin(), hss_list.end(), ssc1 );
00098 applyBreakpoints( bp_list, hss_list, gmsm1 );
00099
00100 std::sort(hss_list.begin(), hss_list.end(), ssc1 );
00101
00102 applyBreakpoints( gene_bounds[seqJ], hss_list, gmsm1 );
00103
00104
00105 std::sort(hss_list.begin(), hss_list.end(), ssc );
00106
00107 SingleStartComparator< CompactGappedAlignment<> > ivcomp(seqI);
00108 std::sort( iv_ptrs.begin(), iv_ptrs.end(), ivcomp );
00109 vector< size_t > iv_map;
00110 createMap( iv_ptrs, iv_orig_ptrs, iv_map );
00111 size_t ivI = 0;
00112 while( ivI < iv_ptrs.size() && iv_ptrs[ivI]->LeftEnd(0) == NO_MATCH )
00113 ++ivI;
00114 for( size_t hssI = 0; hssI < hss_list.size(); ++hssI )
00115 {
00116 if( hss_list[hssI]->LeftEnd(0) == NO_MATCH || hss_list[hssI]->Length(0) == 0 )
00117 continue;
00118 if( ivI == iv_ptrs.size() )
00119 {
00120 cerr << "huh?\n";
00121 cerr << hss_list[hssI]->LeftEnd(0) << endl;
00122 cerr << hss_list[hssI]->RightEnd(0) << endl;
00123 cerr << iv_ptrs.back()->LeftEnd(seqI) << endl;
00124 cerr << iv_ptrs.back()->RightEnd(seqI) << endl;
00125 }
00126 while( ivI < iv_ptrs.size() &&
00127 (iv_ptrs[ivI]->LeftEnd(seqI) == NO_MATCH ||
00128 hss_list[hssI]->LeftEnd(0) > iv_ptrs[ivI]->RightEnd(seqI) ) )
00129 ++ivI;
00130 if( ivI == iv_ptrs.size() )
00131 {
00132 cerr << "hssI fit!!\n";
00133 genome::breakHere();
00134 }
00135
00136 if( iv_ptrs[ivI]->LeftEnd(seqJ) == NO_MATCH ||
00137 iv_ptrs[ivI]->RightEnd(seqJ) < hss_list[hssI]->LeftEnd(1) ||
00138 hss_list[hssI]->RightEnd(1) < iv_ptrs[ivI]->LeftEnd(seqJ) )
00139 continue;
00140
00141 if( hss_list[hssI]->RightEnd(0) < iv_ptrs[ivI]->LeftEnd(seqI) )
00142 {
00143 cerr << "huh 2?\n";
00144 cerr << hss_list[hssI]->LeftEnd(0) << endl;
00145 cerr << hss_list[hssI]->RightEnd(0) << endl;
00146 cerr << iv_ptrs[ivI]->LeftEnd(seqI) << endl;
00147 cerr << iv_ptrs[ivI]->RightEnd(seqI) << endl;
00148 hssI++;
00149 continue;
00150 }
00151
00152 vector< pair< size_t, size_t > >& cur_hss_cols = hss_cols[seqI][seqJ][iv_map[ivI]];
00153
00154 gnSeqI left_col = iv_ptrs[ivI]->SeqPosToColumn( seqI, hss_list[hssI]->LeftEnd(0) );
00155 gnSeqI right_col = iv_ptrs[ivI]->SeqPosToColumn( seqI, hss_list[hssI]->RightEnd(0) );
00156 if(left_col > right_col && iv_ptrs[ivI]->Orientation(seqI) == AbstractMatch::reverse )
00157 {
00158 swap(left_col, right_col);
00159 }
00160 else if(left_col > right_col)
00161 {
00162 cerr << "bad cols\n";
00163 cerr << hss_list[hssI]->LeftEnd(0) << endl;
00164 cerr << hss_list[hssI]->RightEnd(0) << endl;
00165 cerr << iv_ptrs[ivI]->LeftEnd(seqI) << endl;
00166 cerr << iv_ptrs[ivI]->RightEnd(seqI) << endl;
00167 genome::breakHere();
00168 }
00169
00170 if( left_col > 2000000000 || right_col > 2000000000 )
00171 {
00172 cerr << "huh 2?\n";
00173 cerr << hss_list[hssI]->LeftEnd(0) << endl;
00174 cerr << hss_list[hssI]->RightEnd(0) << endl;
00175 cerr << iv_ptrs[ivI]->LeftEnd(seqI) << endl;
00176 cerr << iv_ptrs[ivI]->RightEnd(seqI) << endl;
00177 genome::breakHere();
00178 }
00179 cur_hss_cols.push_back( make_pair( left_col, right_col ) );
00180 }
00181 for( size_t hssI = 0; hssI < hss_list.size(); ++hssI )
00182 hss_list[hssI]->Free();
00183 }
00184 }
00185 }
00186
00187
00188 class IntervalSeqManipulator
00189 {
00190 public:
00191 IntervalSeqManipulator( uint seq ) : m_seq(seq) {}
00192 gnSeqI LeftEnd(Interval& m) const{ return m.LeftEnd(m_seq); }
00193 gnSeqI Length(Interval& m) const{ return m.Length(m_seq); }
00194 void CropLeft(Interval& m, gnSeqI amount ) const{ m.CropLeft(amount, m_seq); }
00195 void CropRight(Interval& m, gnSeqI amount ) const{ m.CropRight(amount, m_seq); }
00196 template< typename ContainerType >
00197 void AddCopy(ContainerType& c, Interval& m) const{ c.push_back( m ); }
00198 private:
00199 uint m_seq;
00200 };
00201
00202
00203 void detectBackboneBreakOnGenes( IntervalList& iv_list, backbone_list_t& bb_list, const HssDetector* detector, vector< CompactGappedAlignment<>* >& iv_orig_ptrs, vector< vector< gnSeqI > >& gene_bounds )
00204 {
00205 uint seq_count = iv_list.seq_table.size();
00206
00207
00208 pairwise_genome_hss_t hss_cols(boost::extents[seq_count][seq_count][iv_list.size()]);
00209
00210
00211 vector< CompactGappedAlignment<>* > iv_ptrs(iv_list.size());
00212 for( size_t i = 0; i < iv_list.size(); ++i )
00213 {
00214 CompactGappedAlignment<> tmp_cga;
00215 iv_ptrs[i] = tmp_cga.Copy();
00216 new (iv_ptrs[i])CompactGappedAlignment<>( iv_list[i] );
00217 }
00218
00219 iv_orig_ptrs = iv_ptrs;
00220 makeAllPairwiseGenomeHSSBreakOnGenes( iv_list, iv_ptrs, iv_orig_ptrs, hss_cols, detector, gene_bounds );
00221
00222
00223 mergePairwiseHomologyPredictions( iv_orig_ptrs, hss_cols, bb_list );
00224 }
00225
00226 int main( int argc, char* argv[] )
00227 {
00228 #if WIN32
00229 SetPriorityClass(GetCurrentProcess(), BELOW_NORMAL_PRIORITY_CLASS);
00230 #endif
00231
00232 if( argc < 4 )
00233 {
00234 cerr << "bbBreakOnGenes <xmfa file> <min bb gap size> <bb output>\n";
00235 return -1;
00236 }
00237 string xmfa_fname( argv[1] );
00238 int min_bb_gap = atoi( argv[2] );
00239 string output_fname( argv[3] );
00240
00241 ifstream xmfa_input( xmfa_fname.c_str() );
00242 if( !xmfa_input.is_open() ){
00243 cerr << "Error opening \"" << xmfa_fname << "\"" << endl;
00244 return -4;
00245 }
00246 ofstream bb_output( output_fname.c_str() );
00247 if( !bb_output.is_open() ){
00248 cerr << "Error opening \"" << output_fname << "\" for writing" << endl;
00249 return -6;
00250 }
00251
00252
00253
00254 IntervalList iv_list;
00255 iv_list.ReadStandardAlignment( xmfa_input );
00256 LoadSequences(iv_list, &cout);
00257 vector< vector< gnSeqI > > gene_bounds( iv_list.seq_table.size() );
00258
00259 if( argc - 4 == iv_list.seq_filename.size() )
00260 {
00261 cerr << "Reading gene coordinates from .ptt files\n";
00262
00263 for( size_t aI = 0; aI < iv_list.seq_filename.size(); aI++ )
00264 {
00265 ifstream ptt_in( argv[aI+4] );
00266 string bub;
00267 getline( ptt_in, bub );
00268 getline( ptt_in, bub );
00269 getline( ptt_in, bub );
00270 while( getline( ptt_in, bub ) )
00271 {
00272 stringstream line_str(bub);
00273 string buf;
00274 getline( line_str, buf, '.' );
00275 int64 lend = atoi(buf.c_str());
00276 getline( line_str, buf, '.' );
00277 getline( line_str, buf );
00278 int64 rend = atoi(buf.c_str());
00279 gene_bounds[aI].push_back( lend -1);
00280 gene_bounds[aI].push_back( lend );
00281 gene_bounds[aI].push_back( rend );
00282 gene_bounds[aI].push_back( rend+1 );
00283
00284 }
00285 }
00286 }else{
00287
00288
00289 for( size_t genomeI = 0; genomeI < iv_list.seq_table.size(); genomeI++ )
00290 {
00291 for( size_t featureI = 0; featureI < iv_list.seq_table[genomeI]->getFeatureListLength(); ++featureI )
00292 {
00293 gnBaseFeature* feat = iv_list.seq_table[genomeI]->getFeature( featureI );
00294 string feat_name = feat->GetName();
00295 if( feat_name != "CDS" )
00296 continue;
00297 gnLocation loc = feat->GetLocation(0);
00298 if( loc.GetFirst() > loc.GetLast() || loc.GetFirst() == 0 || loc.GetLast() == 0 )
00299 continue;
00300 gene_bounds[genomeI].push_back( loc.GetFirst() );
00301 gene_bounds[genomeI].push_back( loc.GetLast() +1 );
00302 }
00303
00304 std::sort( gene_bounds[genomeI].begin(), gene_bounds[genomeI].end() );
00305 cerr << "Found " << gene_bounds[genomeI].size() / 2 << " genes for " << iv_list.seq_filename[genomeI] << endl;
00306 }
00307 }
00308
00309
00310 backbone_list_t bb_list;
00311 vector< CompactGappedAlignment<>* > iv_orig_ptrs;
00312 BigGapsDetector bgd( min_bb_gap );
00313 detectBackboneBreakOnGenes( iv_list, bb_list, &bgd, iv_orig_ptrs, gene_bounds );
00314
00315 writeBackboneSeqCoordinates( bb_list, iv_list, bb_output );
00316 std::vector< bb_seqentry_t > bb_seq_list;
00317 bb_output.close();
00318 std::ifstream bbseq_input( output_fname.c_str() );
00319 readBackboneSeqFile( bbseq_input, bb_seq_list );
00320
00321
00322 gene_bounds[0].push_back(31337);
00323 gene_bounds[0].push_back(31333);
00324 gene_bounds[0].push_back(31341);
00325 gene_bounds[0].push_back(31345);
00326 for( uint seqI = 0; seqI < iv_list.seq_table.size(); seqI++ )
00327 {
00328 cerr << "Checking seq " << seqI << " for errors\n";
00329 std::sort( gene_bounds[seqI].begin(), gene_bounds[seqI].end() );
00330 BbSeqEntrySorter bs(seqI);
00331 std::sort( bb_seq_list.begin(), bb_seq_list.end(), bs );
00332 size_t gI = 0;
00333 size_t bI = 0;
00334 cerr << gene_bounds[seqI].size() << " gene boundaries and " << bb_seq_list.size() << " bb segs\n";
00335 for( ; gI < gene_bounds[seqI].size() && bI < bb_seq_list.size(); gI++ )
00336 {
00337 cout << "checking " << bb_seq_list[bI][seqI].first << ", " <<bb_seq_list[bI][seqI].second << endl;
00338 while( bI < bb_seq_list.size() && gene_bounds[seqI][gI] > abs(bb_seq_list[bI][seqI].second) )
00339 bI++;
00340 if( bI == bb_seq_list.size() )
00341 break;
00342 if(abs(bb_seq_list[bI][seqI].first) + 1 < gene_bounds[seqI][gI] && gene_bounds[seqI][gI] < abs(bb_seq_list[bI][seqI].second) - 1)
00343 {
00344 cerr << "segment " <<bb_seq_list[bI][seqI].first << ", " <<bb_seq_list[bI][seqI].second << " violates gene boundary " << gene_bounds[seqI][gI] << " in seq " << seqI << endl;
00345 }else
00346 cout << "segment " <<bb_seq_list[bI][seqI].first << ", " <<bb_seq_list[bI][seqI].second << " is okay for " << gene_bounds[seqI][gI] << " in seq " << seqI << endl;
00347 }
00348 }
00349
00350
00351
00352 bbseq_input.close();
00353 bb_output.open(output_fname.c_str());
00354 writeBackboneSeqFile( bb_output, bb_seq_list );
00355
00356 return 0;
00357 }
00358