src/scoreALU.cpp

Go to the documentation of this file.
00001 /*******************************************************************************
00002  * $Id: scoreAlignment.cpp,v 1.14 2004/02/28 00:01:31 darling Exp $
00003  * This file is copyright 2002-2004 Aaron Darling.  All rights reserved.
00004  * Please see the file called COPYING for licensing, copying, and modification
00005  * rights.  Redistribution of this file, in whole or in part is prohibited
00006  * without express permission.
00007  ******************************************************************************/
00008 
00009 #ifdef HAVE_CONFIG_H
00010 #include "config.h"
00011 #endif
00012 
00013 #include "libMems/MatchList.h"
00014 #include <string>
00015 #include <fstream>
00016 #include <sstream>
00017 #include <map>
00018 #include "libMems/IntervalList.h"
00019 #include "libGenome/gnFilter.h"
00020 #include <boost/program_options/cmdline.hpp>
00021 #include <boost/program_options.hpp>
00022 namespace po = boost::program_options;
00023 
00024 using namespace std;
00025 using namespace genome;
00026 using namespace mems;
00027 
00028 class AluRecord
00029 {
00030 
00031 public:
00032 
00033         AluRecord();
00034         //Smith-Waterman score of the match
00035         int score;
00036         //% substitutions in matching region compared to the consensus
00037         float divergent;
00038         //% deleted bp
00039         float deleted;
00040         //%     inserted bp
00041         float inserted;
00042         //name of query sequence
00043         string queryId;
00044         //starting position of match in query sequence
00045         gnSeqI start;
00046         //ending position of match in query sequence
00047         gnSeqI end;
00048         //no. of bases in query sequence past the ending position of match
00049         gnSeqI remaining;
00050         //match is with the Complement of the consensus sequence in the database
00051         string strand;
00052         //name of the matching interspersed repeat
00053         string repeatId;
00054         //class of the matching repeat
00055         string repeatClass;
00056         //no. of bases in (complement of) the repeat consensus sequence 
00057     //prior to beginning of the match (so 0 means that the match extended 
00058     //all the way to the end of the repeat consensus sequence)
00059         gnSeqI prior;
00060         //starting position of match in database sequence
00061         gnSeqI startDB;
00062         //ending position of match in database sequence
00063         gnSeqI endDB;
00064         gnSeqI length(void);
00065 };
00066 
00067 gnSeqI AluRecord::length(void)
00068 {
00069         gnSeqI len = 0;
00070         len = absolut((int64)end)-absolut((int64)start);
00071         return len;
00072 }
00073 AluRecord::AluRecord()
00074 {
00075         score = 0;
00076         divergent = 0.0;
00077         deleted = 0.0;
00078         inserted = 0.0;
00079         queryId = "none";
00080         start = 0;
00081         end = 0;
00082         remaining = 0;
00083         strand = "+";
00084         repeatId = "none";
00085         repeatClass = "none";
00086         prior = 0;
00087         startDB = 0;
00088         endDB = 0;
00089 }
00090 void ReadAluFile( istream& in_stream, vector<AluRecord*>& alu_list, gnSeqI& lr ) 
00091 {
00092         uint seq_count = 0;
00093         gnSeqI max_len = 0;
00094         string cur_line;
00095         //3 lines of header info
00096         getline( in_stream, cur_line );
00097         getline( in_stream, cur_line);
00098         getline( in_stream, cur_line);
00099         uint seqI = 0;
00100         vector< gnSeqI > lengths;
00101         //vector< AluRecord* > alu_list;
00102         
00103         string empty_line;
00104         vector< string > aln_mat;
00105         uint line_count = 1;
00106 
00107         
00108         while( getline( in_stream, cur_line) )
00109         {
00110                 
00111                 AluRecord* alu = new AluRecord();
00112                 // read and parse first AluRecord line
00113                 //stringstream line_str( cur_line );
00114                 
00115                 //first line of data
00116                 
00117                 // take off leading whitespace
00118                 string::size_type loc = cur_line.find("(");
00119                 if (loc != string::npos )
00120                         cur_line.replace(loc,1," ");
00121                 
00122                 loc = cur_line.find(")");
00123                 if (loc != string::npos )
00124                         cur_line.replace(loc,1," ");
00125                 stringstream parse_str( cur_line );
00126                 
00127                 parse_str >> alu->score;        
00128                 parse_str >> alu->divergent;
00129                 parse_str >> alu->deleted;
00130                 parse_str >> alu->inserted;
00131                 parse_str >> alu->queryId;
00132                 parse_str >> alu->start;
00133                 parse_str >> alu->end;
00134                 parse_str >> alu->remaining;
00135                 parse_str >> alu->strand;
00136                 parse_str >> alu->repeatId;
00137                 parse_str >> alu->repeatClass;
00138                 //punt: rest of info not needed
00139                 //parse_str >> alu->prior;
00140                 //parse_str >> alu->startDB;
00141                 //parse_str >> alu->endDB;
00142                 
00143                 //end of line
00144                 alu_list.push_back(alu);
00145                 lr+= alu->length();
00146 
00147         }
00148         cout << "number of ALU records in file: " << alu_list.size() << endl;
00149 }
00150 
00156 int main( int argc, char* argv[] ){
00157         
00158         string alignment_fname;
00159         string alu_fname;
00160         
00161         
00162         if( argc < 2 ){
00163                 cout << "scoreALU <procrastAligner alignment> <repeatmasker ALUs>\n";
00164                 return -1;
00165         }
00166         // Declare the supported options.
00167         
00168         po::variables_map vm;
00169         try {
00170 
00171         po::options_description desc("Allowed options");
00172         desc.add_options()
00173             ("help", "get help message")
00174             ("alignment", po::value<string>(&alignment_fname), "procrastAligner alignment")
00175                         ("alus", po::value<string>(&alu_fname), "repeatmasker ALUs")
00176         ;
00177 
00178                 
00179         po::store(po::parse_command_line(argc, argv, desc), vm);
00180         po::notify(vm);    
00181 
00182         if (vm.count("help")) {
00183             cout << desc << "\n";
00184             return 1;
00185         }
00186 
00187         
00188     }
00189     catch(exception& e) {
00190         cerr << "error: " << e.what() << "\n";
00191         return 1;
00192     }
00193     catch(...) {
00194         cerr << "Exception of unknown type!\n";
00195     }
00196         
00197         
00198 
00199         ifstream align_in;
00200         align_in.open( alignment_fname.c_str() );
00201         if( !align_in.is_open() ){
00202                 cerr << "Error opening " << alignment_fname << endl;
00203                 return -1;
00204         }
00205         ifstream alu_in;
00206         alu_in.open( alu_fname.c_str() );
00207         if( !alu_in.is_open() ){
00208                 cerr << "Error opening " << alu_fname << endl;
00209                 return -1;
00210         }
00211 try{
00212         cout << "Calclutating specificity and sensitivity of procrastAligner on dataset..." << endl;
00213         IntervalList alignment;
00214         vector<AluRecord*> alus;
00215         
00216         //total length of all aligned repeats found by procrastAligner
00217         gnSeqI lt = 0;
00218         //total length of all alignments found by procrastAligner
00219         gnSeqI ld = 0;
00220         //total length of repeats found by repeatmasker
00221         gnSeqI lr=0;
00222         gnSeqI ln=0;
00223         gnSeqI lp=0;
00224         //total length of all regions found only in procrastAligner
00225         gnSeqI lo=0;
00226         //total length of repeats masked by both programs
00227         gnSeqI lc = 0;
00228         ReadAluFile( alu_in, alus, lr );
00229         alu_in.close();
00230         string cur_line;
00231         uint seqI = 0;
00232     //this will suffice for now, but should plan on using
00233         //IntervalList::ReadStandardAlignment or equivalent
00234         //to read in XMFA formatted output from procrastAligner
00235         pair<int64,int64> pos;
00236         vector< vector< pair<int64,int64> > > align_list;
00237         vector< pair<int64,int64> > pos_list;
00238         map<int64,bool> alncoverage;
00239     map<int64,bool> coverage;
00240         //list of maps, one for each alignment
00241         vector< map<int64,bool> > totcoverage;
00242         int64 ccount = 0;
00243         while( getline( align_in, cur_line) )
00244         {
00245                 vector< int64 > start_list;
00246                 getline( align_in, cur_line);
00247                 stringstream parse_str( cur_line );
00248                 int64 start = 0;
00249                 int64 end = 0;
00250                 int64 length = 0;
00251                 string aln_len_str;
00252                 parse_str >> aln_len_str;
00253                 while( parse_str >> start )
00254                 {
00255                         start_list.push_back(start);
00256                 }
00257                 getline( align_in, cur_line);
00258                 stringstream parse_string(cur_line);
00259                 //parse_str.( cur_line );
00260                 string lens;
00261                 parse_string >> lens;
00262                 uint region_count = 0;  
00263                 while( parse_string >> length )
00264                 {
00265                         //cout << length << endl;
00266                         if ( region_count >= start_list.size() )
00267                         {
00268                                 //something's wrong
00269                                 cout << "alu data failed!" << endl;
00270                                 break;
00271                         }
00272                         pos.first = start_list.at(region_count);
00273                         if (start_list.at(region_count) < 0 )
00274                         {
00275                                 pos.second = start_list.at(region_count)-length;
00276                                 //simply add up the alignment coverage in the map
00277                                 for(int i = 0; i < length; i++)
00278                                 {
00279                                         alncoverage[pos.first-i] = true;
00280                                         coverage[pos.first-i] = true;           
00281                                         ccount++;
00282                                 }
00283                         }
00284                         else
00285                         {
00286                                 pos.second = start_list.at(region_count)+length;
00287                                 //for both strands
00288                                 for(int i = 0; i < length; i++)
00289                                 {       
00290                                         alncoverage[pos.first+i] = true;
00291                                         coverage[pos.first+i] = true;
00292                                         ccount++;
00293                                 }
00294                         }
00295                         pos_list.push_back(pos);
00296                         region_count++;
00297                 }
00298                 totcoverage.push_back(alncoverage);
00299                 alncoverage.clear();
00300                 align_list.push_back(pos_list);
00301                 pos_list.clear();
00302                 
00303         }//end of read procrastAligner output hack
00304         //alignment.ReadStandardAlignment( align_in );
00305         align_in.close();
00306         cout << "alu data processed!" << endl;
00307         int aluhits = 0;
00308         int matches = 0;
00309         //a first attempt at generating the sensitivity & specificity of our method
00310         //for comparison with zhang&waterman's eulerian path method...
00311         //hopefully we pull out these ~290bp repeats in a nice chain in each case
00312         //FIXME: is this ok?
00313 
00314         map<int,bool> ignoreAlignment;
00315         map<int64,bool> mergedCoverage;
00316         map<int64,bool> aluCoverage;
00317 
00318         //Total length of unaligned repeats(false positives?) found by procrastAligner
00319         map<int64,bool> lpt;
00320         map<int64,bool> lpn;
00321 
00322         //Total length of regions found only in procrastAligner
00323         map<int64,bool> lpo;
00324 
00325         
00326         map<int,bool> hitlist;
00327 
00328         map<int64,bool> specificity;
00329 
00330         map< uint,pair<int,int> > best_borders;
00331         map< uint,pair<int,int> > worst_borders;
00332         int64 matchhits = 0;
00333     int64 matchhitmult = 0;
00334         cout << "checking which alus are aligned" << endl;
00335         for ( int j = 0; j < align_list.size(); j++)
00336         {
00337                 //if alufound in any component of curr alignment, consider 'aligned'
00338                 //if not, throw out to help our sr. specificity
00339                 
00340                 //then, for each ALU, see if it is 'covered' by our procrastAlignments.
00341                 //if so, increase lc2 accordingly
00342 
00343                 //for each alignment returned by procrastAligner, highest multiplicity first
00344                 bool alufound = false;
00345 
00346                 //cout << "checking alignment #" << j << " for ALUs..." << endl;
00347                 for ( int i = 0; i < alus.size(); i++)
00348                 {
00349                         
00350                         //lpt = 0;
00351                         if (alus.at(i)->strand == "+" )
00352                         {
00353                                 for ( int a = 0; a < alus.at(i)->length(); a++)
00354                                 {
00355                                         
00356 
00357                                         //column in alignment coincides with an alu
00358                                         if(totcoverage.at(j).find((alus.at(i)->start)+a) != totcoverage.at(j).end())
00359                                         {
00360                                                 
00361                                                 alufound = true;
00362                                                 //this column in sequence not accounted for
00363                                                 if(aluCoverage.find((alus.at(i)->start)+a) == aluCoverage.end())
00364                                                 {       
00365                                                         lc+=1;
00366                                                         
00367                                                         //now it is
00368                                                         
00369                                                 }
00370                                                 hitlist[i] = true;
00371                                                 aluCoverage[(alus.at(i)->start)+a] = true;
00372 
00373                                         }
00374                                         
00375                                 }
00376                         }
00377                         else
00378                         {
00379                                 for ( int a = 0; a < alus.at(i)->length(); a++)
00380                                 {
00381                                         if(totcoverage.at(j).find(-1*((alus.at(i)->start)+a)) != totcoverage.at(j).end())
00382                                         {
00383                                                 if(aluCoverage.find(-1*((alus.at(i)->start)+a)) == aluCoverage.end())
00384                                                 {       
00385                                                         lc+=1;
00386                                                         
00387                                                         
00388                                                 }
00389                                                 hitlist[i] = true;
00390                                                 aluCoverage[-1*((alus.at(i)->start)+a)] = true;
00391                                                 //lc+=1;
00392                                                 alufound =true;
00393                                         }
00394                                         
00395                                 }
00396                         }
00397                 }
00398                 if(!alufound)
00399                 {
00400                         ignoreAlignment[j] = true;
00401                         cout << "ignoring alignment " << j << endl;
00402                         
00403                         //calculate regions only appearing in procrastAligner alignments
00404                         for(int k = 0; k < align_list.at(j).size();k++)
00405                         {                       
00406                                 gnSeqI len = absolut((int64)align_list.at(j).at(k).second)-absolut((int64)align_list.at(j).at(k).first);
00407                                 for(int n = 0; n<len;n++)
00408                                 {
00409                                         if(align_list.at(j).at(k).first<0)
00410                                                 lpo[align_list.at(j).at(k).first-n] = true;
00411                                         else
00412                                                 lpo[align_list.at(j).at(k).first+n] = true;
00413                                 }
00414                         }
00415                 }
00416                 else
00417                 {
00418                         //cout << "ALU was aligned!" << endl;
00419                         bool hit = false;
00420                         bool debug_pos = false;
00421                         bool inall = true;
00422                         uint rnum = 0;
00423                         for(int k = 0; k < align_list.at(j).size();k++)
00424                         {                       
00425                                 gnSeqI len = absolut((int64)align_list.at(j).at(k).second)-absolut((int64)align_list.at(j).at(k).first);
00426                                 for(int n = 0; n<len;n++)
00427                                 {
00428                                         if(align_list.at(j).at(k).first<0)
00429                                         {
00430                                                 // j = lma #
00431                                                 // k = component #
00432                                                 // first,second = start,end pos
00433                                                 if(aluCoverage.find(align_list.at(j).at(k).first-n)!= aluCoverage.end())
00434                                                 {
00435                                                         //find which alu is hit
00436                                                         for ( int i = 0; i < alus.size(); i++)
00437                                                         {       
00438                                                                 //is this ok for reverse strand?
00439                                                                 if( (abs((int)align_list.at(j).at(k).first) >= alus.at(i)->start) && (abs((int)align_list.at(j).at(k).first) < alus.at(i)->end ) 
00440                                                                 ||  (abs((int)align_list.at(j).at(k).second) > alus.at(i)->start) && (abs((int)align_list.at(j).at(k).second) < alus.at(i)->end ) )
00441                                                                 {
00442                                                                         //the repeat #
00443                                                                         if (rnum != i+1 && rnum != 0)
00444                                                                                 inall = false;
00445                                                                         rnum = i+1;                             
00446                                                                         break;
00447                                                                 }
00448                                                         }
00449                                                         //current component of alignment pertains to alu
00450                                                         //spec.at(j).push_back(k)
00451                                                         lpn[align_list.at(j).at(k).first-n] = true;
00452                                                         hit = true;
00453                                                 }
00454                                                 //motif missed by procrastAligner
00455                                                 else
00456                                                 {
00457                                                         lpt[align_list.at(j).at(k).first-n] = true;
00458                                                         rnum = -1;
00459                                                 }
00460                                                 mergedCoverage[align_list.at(j).at(k).first-n] = true;
00461                                         }
00462                                         else
00463                                         {
00464                                                 if(aluCoverage.find(align_list.at(j).at(k).first+n)!= aluCoverage.end())
00465                                                 {
00466                                                         //find out which alu is hit
00467                                                         for ( int i = 0; i < alus.size(); i++)
00468                                                         {
00469                                                                 
00470                                                                 if( (abs((int)align_list.at(j).at(k).first) >= alus.at(i)->start) && (abs((int)align_list.at(j).at(k).first) < alus.at(i)->end )
00471                                                                 ||  (abs((int)align_list.at(j).at(k).second) > alus.at(i)->start) && (abs((int)align_list.at(j).at(k).second) <= alus.at(i)->end ) )
00472                                                                 {
00473                                                                         //the repeat #
00474                                                                         //cout << rnum << " " << i+1 <<  endl;
00475                                                                         if (rnum != i+1 && rnum != 0)
00476                                                                                 inall = false;
00477                                                                         rnum = i+1;
00478                                                                         break;
00479                                                                 }
00480                                                         }       
00481                                                         //current component of alignment pertains to alu
00482                                                         lpn[align_list.at(j).at(k).first+n] = true;
00483                                                         hit = true;
00484                                                 }
00485                                                 //motif missed by procrastAligner
00486                                                 else
00487                                                 {
00488                                                         lpt[align_list.at(j).at(k).first+n] = true;
00489                                                         rnum = -1;
00490                                                 }
00491                                                 mergedCoverage[align_list.at(j).at(k).first+n] = true;
00492                                         }
00493                                 }
00494                                 if (rnum <= 0)
00495                                         inall = false;
00496 
00497                                 if(hit)
00498                                 {
00499                                         matchhits+=1;
00500                                         
00501                                 }
00502                         }
00503                         //punt: DONT need to first check if it hits all components!!
00504                         if (inall || 1)
00505                         {
00506                                 for(int k = 0; k < align_list.at(j).size();k++)
00507                                 {                       
00508                                         gnSeqI len = absolut((int64)align_list.at(j).at(k).second)-absolut((int64)align_list.at(j).at(k).first);
00509                                         uint rnum = 0;
00510                                         
00511                                         if(align_list.at(j).at(k).first<0)
00512                                         {
00513                                                 // j = lma #
00514                                                 // k = component #
00515                                                 // first,second = start,end pos
00516                                                 
00517                                                 //find which alu is hit
00518                                                 for ( int i = 0; i < alus.size(); i++)
00519                                                 {
00520                                                         //is this ok for reverse strand?
00521                                                         if( (abs((int)align_list.at(j).at(k).first) >= alus.at(i)->start) && (abs((int)align_list.at(j).at(k).first) < alus.at(i)->end ) 
00522                                                         ||  (abs((int)align_list.at(j).at(k).second) > alus.at(i)->start) && (abs((int)align_list.at(j).at(k).second) <= alus.at(i)->end ) )
00523                                                         {
00524                                                                 //the repeat #
00525                                                                 rnum = i+1;
00526                                                                 //find overlap
00527                                                                 int leftend = 0;
00528                                                                 int rightend = 0;
00529                                                                 leftend = abs((int)alus.at(i)->start)-abs((int)align_list.at(j).at(k).first);
00530                                                                 rightend =   abs((int)alus.at(i)->end)-abs((int)align_list.at(j).at(k).second);
00531                                                                 if (debug_pos && (abs(leftend)>500 || abs(rightend)>500))
00532                                                                 {
00533                                                                         cout << "alu\talignment" << endl;
00534                                                                         cout << alus.at(i)->start << "\t" << align_list.at(j).at(k).first << endl;
00535                                                                         cout << alus.at(i)->end << "\t" << align_list.at(j).at(k).second << endl;
00536 
00537                                                                 }
00538                                                                 
00539                                                                 if ( worst_borders.find( rnum ) != worst_borders.end() )
00540                                                                 {
00541                                                                         // if component has worse boundaries for this alu, record them
00542                                                                         if ( abs((int)worst_borders[rnum].first) < abs((int)leftend) )
00543                                                                                 worst_borders[rnum].first = leftend;
00544                                                                         if ( abs((int)worst_borders[rnum].second) < abs((int)rightend) )
00545                                                                                 worst_borders[rnum].second = rightend;
00546                                                                         if ( abs((int)best_borders[rnum].first) > abs((int)leftend) )
00547                                                                                 best_borders[rnum].first = leftend;
00548                                                                         if ( abs((int)best_borders[rnum].second) > abs((int)rightend) )
00549                                                                                 best_borders[rnum].second = rightend;
00550                                                                 }
00551                                                                 else
00552                                                                 {
00553                                                                         worst_borders[rnum] = make_pair(leftend,rightend);
00554                                                                         best_borders[rnum] = make_pair(leftend,rightend);
00555                                                                 }
00556                                                                 
00557                                                                 break;
00558                                                         }
00559                                                 } 
00560                                         }
00561                                         else
00562                                         {
00563                                                 //find out which alu is hit
00564                                                 for ( int i = 0; i < alus.size(); i++)
00565                                                 {
00566                                                         //if( (abs((int)align_list.at(j).at(k).first) <= alus.at(i)->start) && (abs((int)align_list.at(j).at(k).second) >= alus.at(i)->end ) )
00567                                                         //if( (abs((int)align_list.at(j).at(k).first) >= alus.at(i)->start) && (abs((int)align_list.at(j).at(k).second) <= alus.at(i)->end ) )
00568                                                         //if( ((abs((int)align_list.at(j).at(k).first) >= alus.at(i)->start) && (abs((int)align_list.at(j).at(k).first) < alus.at(i)->end ) && (abs((int)align_list.at(j).at(k).second) > alus.at(i)->end) )
00569                                                         //||  ((abs((int)align_list.at(j).at(k).second) > alus.at(i)->start) && (abs((int)align_list.at(j).at(k).second) <= alus.at(i)->end ) && (abs((int)align_list.at(j).at(k).first) < alus.at(i)->start) ) )
00570                                                         if( (abs((int)align_list.at(j).at(k).first) >= alus.at(i)->start) && (abs((int)align_list.at(j).at(k).first) < alus.at(i)->end ) 
00571                                                         ||  (abs((int)align_list.at(j).at(k).second) > alus.at(i)->start) && (abs((int)align_list.at(j).at(k).second) <= alus.at(i)->end ) )
00572                                                         {
00573                                                                 //the repeat #
00574                                                                 rnum = i+1;
00575                                                                 //find overlap
00576                                                                 int leftend = 0;
00577                                                                 int rightend = 0;
00578                                                                 
00579                                                                 leftend = abs((int)alus.at(i)->start) -abs((int)align_list.at(j).at(k).first);
00580                                                                 rightend =   abs((int)alus.at(i)->end)-abs((int)align_list.at(j).at(k).second);
00581 
00582                                                                 if (debug_pos && (abs(leftend)>500 || abs(rightend)>500))
00583                                                                 {
00584                                                                         cout << "alu\talignment" << endl;
00585                                                                         cout << alus.at(i)->start << "\t" << align_list.at(j).at(k).first << endl;
00586                                                                         cout << alus.at(i)->end << "\t" << align_list.at(j).at(k).second << endl;
00587 
00588                                                                 }
00589                                                                 
00590                                                                 if ( worst_borders.find( rnum ) != worst_borders.end() )
00591                                                                 {
00592                                                                         // if component has worse boundaries for this alu, record them          
00593                                                                         if ( abs((int)worst_borders[rnum].first) < abs((int)leftend) )
00594                                                                                 worst_borders[rnum].first = leftend;
00595                                                                         if ( abs((int)worst_borders[rnum].second) < abs((int)rightend) )
00596                                                                                 worst_borders[rnum].second = rightend;
00597 
00598                                                                         // if component has better boundaries for this alu, record them
00599                                                                         if ( abs((int)best_borders[rnum].first) > abs((int)leftend) )
00600                                                                                 best_borders[rnum].first = leftend;
00601                                                                         if ( abs((int)best_borders[rnum].second) > abs((int)rightend) )
00602                                                                                 best_borders[rnum].second = rightend;
00603                                                                         
00604                                                                 }
00605                                                                 else
00606                                                                 {
00607                                                                         worst_borders[rnum] = make_pair(leftend,rightend);
00608                                                                         best_borders[rnum] = make_pair(leftend,rightend);
00609                                                                 }
00610                                                                 
00611                                                                 break;
00612                                                         }
00613                                                 }
00614                                         
00615                                         }
00616 
00617                                 }
00618                         }
00619                     
00620                 }
00621                 alufound = false;
00622         }
00623         gnSequence empty_seq;
00624         //this is the length of the repeats found by procrastAligner, 
00625         //with overlaps removed
00626         //remember the alignments to ignore!
00627         ofstream boundary_file;
00628         alignment_fname.append(".boundary");
00629         boundary_file.open(alignment_fname.c_str());
00630         map< uint,pair<int,int> >::iterator iter;
00631         uint avg_worst_left = 0;
00632         uint avg_worst_right = 0;
00633         uint avg_best_left = 0;
00634         uint avg_best_right = 0;
00635         for( iter = worst_borders.begin(); iter != worst_borders.end(); iter++ ) 
00636         {
00637                 avg_worst_left += abs(iter->second.first);
00638                 avg_worst_right += abs(iter->second.second);
00639                 boundary_file << "worst boundaries for repeat copy #" << iter->first << "\t left: " << iter->second.first << "\t right: " << iter->second.second << endl;
00640         }
00641         for( iter = best_borders.begin(); iter != best_borders.end(); iter++ ) 
00642         {
00643                 avg_best_left += abs( iter->second.first);
00644                 avg_best_right += abs(iter->second.second);
00645                 boundary_file << "best boundaries for repeat copy #" << iter->first << "\t left: " << iter->second.first << "\t right: " << iter->second.second << endl;
00646         }
00647 
00648         if (worst_borders.size() > 0 )
00649         {
00650                 avg_worst_left /= worst_borders.size();
00651                 avg_worst_right /= worst_borders.size();
00652         }
00653         else
00654         {
00655                 avg_worst_left = -1;
00656                 avg_worst_right = -1;
00657 
00658         }
00659         if ( best_borders.size() > 0)
00660         {
00661                 avg_best_left /= best_borders.size();
00662                 avg_best_right /= best_borders.size();
00663         }
00664         else
00665         {
00666                 avg_best_left = -1;
00667                 avg_best_right = -1;
00668 
00669         }
00670         boundary_file << "left best: \t" << avg_best_left << endl;
00671         boundary_file << "right best: \t" << avg_best_right << endl;
00672         boundary_file << "left worst: \t" << avg_worst_left << endl;
00673         boundary_file << "right worst: \t" << avg_worst_right << endl;
00674         boundary_file << "#" << endl;
00675         boundary_file.close();
00676 
00677         lt = mergedCoverage.size();
00678         //lt2 = coverage.size();
00679         ld = coverage.size();
00680         lp = lpt.size();
00681         ln = lpn.size();
00682         lo = lpo.size();
00683 
00684         //length of only ALUs hit by procrastAligner
00685         gnSeqI hitlength =0;
00686         for(int i =0; i< hitlist.size(); i++)
00687                 hitlength+= alus.at(i)->length();
00688 
00689         cout << "\nprocrastAlignments processed: " << align_list.size() << endl;
00690         cout << "matches processed: " << matches << endl;
00691         cout << "Total ALUs found by repeatmasker: " << alus.size() << endl;
00692         cout << "Total ALUs hit by procrastAligner: " << hitlist.size() << endl;
00693         cout << "ALU hit percentage: " << (float)hitlist.size()/(float)alus.size() << endl;
00694 
00695         //cout << aluCoverage.size() << endl;
00696     cout << "\nTotal length of all repeats found by procrastAligner: " << ld << endl;
00697          cout << "Total length of all regions found only in procrastAligner: " << lo << endl;
00698         cout << "Total length of all (partially) aligned repeats found by procrastAligner: lt = " << lt << endl;
00699         cout << "Total length of unaligned repeats(false positives?) found by procrastAligner: lp = " << lp << endl;
00700         //cout << "Total length of ???: ln = " << ln << endl;
00701         cout << "Total length of all repeats(ALU) found by repeatmasker: lr = " << lr << endl;
00702         cout << "Total length of repeats(ALU) found by repeatmasker hit by procrastAligner: lh =" << hitlength << endl;
00703         cout << "Total length of ALU repeats found by both methods: lc = " << lc << endl;
00704         
00705         //cout << "Sensitivity: lc / lr = " << (double)(lc) / (double)(lr) << endl;
00706         //cout << "Specificity: lc / lt = " << (double)(lc) / (double)(lt) << endl;
00707         
00708         //score changes per Sunday email, focus on filtration
00709         cout << "\nSensitivity-old: lc / lh = " << (double)(lc) / (double)(hitlength) << endl;
00710         cout << "Specificity-old: lc / lt = " << (double)(lc) / (double)(lt) << endl;
00711 
00712         
00713         cout << "\nSensitivity= " << (double)hitlist.size()/(double)alus.size() << endl;
00714         cout << "Specificity= " << (double)matchhits/(double)matchhitmult <<  endl;
00715 
00716 
00717         //TN = ltn
00718         //TP = lc
00719         //FN = lfn
00720         //FP = lp
00721 }catch( gnException& gne ){
00722         cerr << gne << endl;
00723 }catch( exception& e ){
00724         cerr << e.what() << endl;
00725 }
00726 
00727 }
00728 
00729 

Generated on Mon Aug 19 06:00:40 2013 for mauveAligner by doxygen 1.3.6