00001
00002
00003
00004
00005
00006
00007
00008
00009 #ifdef HAVE_CONFIG_H
00010 #include "config.h"
00011 #endif
00012
00013 #include "libMems/MatchList.h"
00014 #include <string>
00015 #include <fstream>
00016 #include <sstream>
00017 #include <map>
00018 #include "libMems/IntervalList.h"
00019 #include "libGenome/gnFilter.h"
00020 #include <boost/program_options/cmdline.hpp>
00021 #include <boost/program_options.hpp>
00022 namespace po = boost::program_options;
00023
00024 using namespace std;
00025 using namespace genome;
00026 using namespace mems;
00027
00028 class AluRecord
00029 {
00030
00031 public:
00032
00033 AluRecord();
00034
00035 int score;
00036
00037 float divergent;
00038
00039 float deleted;
00040
00041 float inserted;
00042
00043 string queryId;
00044
00045 gnSeqI start;
00046
00047 gnSeqI end;
00048
00049 gnSeqI remaining;
00050
00051 string strand;
00052
00053 string repeatId;
00054
00055 string repeatClass;
00056
00057
00058
00059 gnSeqI prior;
00060
00061 gnSeqI startDB;
00062
00063 gnSeqI endDB;
00064 gnSeqI length(void);
00065 };
00066
00067 gnSeqI AluRecord::length(void)
00068 {
00069 gnSeqI len = 0;
00070 len = absolut((int64)end)-absolut((int64)start);
00071 return len;
00072 }
00073 AluRecord::AluRecord()
00074 {
00075 score = 0;
00076 divergent = 0.0;
00077 deleted = 0.0;
00078 inserted = 0.0;
00079 queryId = "none";
00080 start = 0;
00081 end = 0;
00082 remaining = 0;
00083 strand = "+";
00084 repeatId = "none";
00085 repeatClass = "none";
00086 prior = 0;
00087 startDB = 0;
00088 endDB = 0;
00089 }
00090 void ReadAluFile( istream& in_stream, vector<AluRecord*>& alu_list, gnSeqI& lr )
00091 {
00092 uint seq_count = 0;
00093 gnSeqI max_len = 0;
00094 string cur_line;
00095
00096 getline( in_stream, cur_line );
00097 getline( in_stream, cur_line);
00098 getline( in_stream, cur_line);
00099 uint seqI = 0;
00100 vector< gnSeqI > lengths;
00101
00102
00103 string empty_line;
00104 vector< string > aln_mat;
00105 uint line_count = 1;
00106
00107
00108 while( getline( in_stream, cur_line) )
00109 {
00110
00111 AluRecord* alu = new AluRecord();
00112
00113
00114
00115
00116
00117
00118 string::size_type loc = cur_line.find("(");
00119 if (loc != string::npos )
00120 cur_line.replace(loc,1," ");
00121
00122 loc = cur_line.find(")");
00123 if (loc != string::npos )
00124 cur_line.replace(loc,1," ");
00125 stringstream parse_str( cur_line );
00126
00127 parse_str >> alu->score;
00128 parse_str >> alu->divergent;
00129 parse_str >> alu->deleted;
00130 parse_str >> alu->inserted;
00131 parse_str >> alu->queryId;
00132 parse_str >> alu->start;
00133 parse_str >> alu->end;
00134 parse_str >> alu->remaining;
00135 parse_str >> alu->strand;
00136 parse_str >> alu->repeatId;
00137 parse_str >> alu->repeatClass;
00138
00139
00140
00141
00142
00143
00144 alu_list.push_back(alu);
00145 lr+= alu->length();
00146
00147 }
00148 cout << "number of ALU records in file: " << alu_list.size() << endl;
00149 }
00150
00156 int main( int argc, char* argv[] ){
00157
00158 string alignment_fname;
00159 string alu_fname;
00160
00161
00162 if( argc < 2 ){
00163 cout << "scoreALU <procrastAligner alignment> <repeatmasker ALUs>\n";
00164 return -1;
00165 }
00166
00167
00168 po::variables_map vm;
00169 try {
00170
00171 po::options_description desc("Allowed options");
00172 desc.add_options()
00173 ("help", "get help message")
00174 ("alignment", po::value<string>(&alignment_fname), "procrastAligner alignment")
00175 ("alus", po::value<string>(&alu_fname), "repeatmasker ALUs")
00176 ;
00177
00178
00179 po::store(po::parse_command_line(argc, argv, desc), vm);
00180 po::notify(vm);
00181
00182 if (vm.count("help")) {
00183 cout << desc << "\n";
00184 return 1;
00185 }
00186
00187
00188 }
00189 catch(exception& e) {
00190 cerr << "error: " << e.what() << "\n";
00191 return 1;
00192 }
00193 catch(...) {
00194 cerr << "Exception of unknown type!\n";
00195 }
00196
00197
00198
00199 ifstream align_in;
00200 align_in.open( alignment_fname.c_str() );
00201 if( !align_in.is_open() ){
00202 cerr << "Error opening " << alignment_fname << endl;
00203 return -1;
00204 }
00205 ifstream alu_in;
00206 alu_in.open( alu_fname.c_str() );
00207 if( !alu_in.is_open() ){
00208 cerr << "Error opening " << alu_fname << endl;
00209 return -1;
00210 }
00211 try{
00212 cout << "Calclutating specificity and sensitivity of procrastAligner on dataset..." << endl;
00213 IntervalList alignment;
00214 vector<AluRecord*> alus;
00215
00216
00217 gnSeqI lt = 0;
00218
00219 gnSeqI ld = 0;
00220
00221 gnSeqI lr=0;
00222 gnSeqI ln=0;
00223 gnSeqI lp=0;
00224
00225 gnSeqI lo=0;
00226
00227 gnSeqI lc = 0;
00228 ReadAluFile( alu_in, alus, lr );
00229 alu_in.close();
00230 string cur_line;
00231 uint seqI = 0;
00232
00233
00234
00235 pair<int64,int64> pos;
00236 vector< vector< pair<int64,int64> > > align_list;
00237 vector< pair<int64,int64> > pos_list;
00238 map<int64,bool> alncoverage;
00239 map<int64,bool> coverage;
00240
00241 vector< map<int64,bool> > totcoverage;
00242 int64 ccount = 0;
00243 while( getline( align_in, cur_line) )
00244 {
00245 vector< int64 > start_list;
00246 getline( align_in, cur_line);
00247 stringstream parse_str( cur_line );
00248 int64 start = 0;
00249 int64 end = 0;
00250 int64 length = 0;
00251 string aln_len_str;
00252 parse_str >> aln_len_str;
00253 while( parse_str >> start )
00254 {
00255 start_list.push_back(start);
00256 }
00257 getline( align_in, cur_line);
00258 stringstream parse_string(cur_line);
00259
00260 string lens;
00261 parse_string >> lens;
00262 uint region_count = 0;
00263 while( parse_string >> length )
00264 {
00265
00266 if ( region_count >= start_list.size() )
00267 {
00268
00269 cout << "alu data failed!" << endl;
00270 break;
00271 }
00272 pos.first = start_list.at(region_count);
00273 if (start_list.at(region_count) < 0 )
00274 {
00275 pos.second = start_list.at(region_count)-length;
00276
00277 for(int i = 0; i < length; i++)
00278 {
00279 alncoverage[pos.first-i] = true;
00280 coverage[pos.first-i] = true;
00281 ccount++;
00282 }
00283 }
00284 else
00285 {
00286 pos.second = start_list.at(region_count)+length;
00287
00288 for(int i = 0; i < length; i++)
00289 {
00290 alncoverage[pos.first+i] = true;
00291 coverage[pos.first+i] = true;
00292 ccount++;
00293 }
00294 }
00295 pos_list.push_back(pos);
00296 region_count++;
00297 }
00298 totcoverage.push_back(alncoverage);
00299 alncoverage.clear();
00300 align_list.push_back(pos_list);
00301 pos_list.clear();
00302
00303 }
00304
00305 align_in.close();
00306 cout << "alu data processed!" << endl;
00307 int aluhits = 0;
00308 int matches = 0;
00309
00310
00311
00312
00313
00314 map<int,bool> ignoreAlignment;
00315 map<int64,bool> mergedCoverage;
00316 map<int64,bool> aluCoverage;
00317
00318
00319 map<int64,bool> lpt;
00320 map<int64,bool> lpn;
00321
00322
00323 map<int64,bool> lpo;
00324
00325
00326 map<int,bool> hitlist;
00327
00328 map<int64,bool> specificity;
00329
00330 map< uint,pair<int,int> > best_borders;
00331 map< uint,pair<int,int> > worst_borders;
00332 int64 matchhits = 0;
00333 int64 matchhitmult = 0;
00334 cout << "checking which alus are aligned" << endl;
00335 for ( int j = 0; j < align_list.size(); j++)
00336 {
00337
00338
00339
00340
00341
00342
00343
00344 bool alufound = false;
00345
00346
00347 for ( int i = 0; i < alus.size(); i++)
00348 {
00349
00350
00351 if (alus.at(i)->strand == "+" )
00352 {
00353 for ( int a = 0; a < alus.at(i)->length(); a++)
00354 {
00355
00356
00357
00358 if(totcoverage.at(j).find((alus.at(i)->start)+a) != totcoverage.at(j).end())
00359 {
00360
00361 alufound = true;
00362
00363 if(aluCoverage.find((alus.at(i)->start)+a) == aluCoverage.end())
00364 {
00365 lc+=1;
00366
00367
00368
00369 }
00370 hitlist[i] = true;
00371 aluCoverage[(alus.at(i)->start)+a] = true;
00372
00373 }
00374
00375 }
00376 }
00377 else
00378 {
00379 for ( int a = 0; a < alus.at(i)->length(); a++)
00380 {
00381 if(totcoverage.at(j).find(-1*((alus.at(i)->start)+a)) != totcoverage.at(j).end())
00382 {
00383 if(aluCoverage.find(-1*((alus.at(i)->start)+a)) == aluCoverage.end())
00384 {
00385 lc+=1;
00386
00387
00388 }
00389 hitlist[i] = true;
00390 aluCoverage[-1*((alus.at(i)->start)+a)] = true;
00391
00392 alufound =true;
00393 }
00394
00395 }
00396 }
00397 }
00398 if(!alufound)
00399 {
00400 ignoreAlignment[j] = true;
00401 cout << "ignoring alignment " << j << endl;
00402
00403
00404 for(int k = 0; k < align_list.at(j).size();k++)
00405 {
00406 gnSeqI len = absolut((int64)align_list.at(j).at(k).second)-absolut((int64)align_list.at(j).at(k).first);
00407 for(int n = 0; n<len;n++)
00408 {
00409 if(align_list.at(j).at(k).first<0)
00410 lpo[align_list.at(j).at(k).first-n] = true;
00411 else
00412 lpo[align_list.at(j).at(k).first+n] = true;
00413 }
00414 }
00415 }
00416 else
00417 {
00418
00419 bool hit = false;
00420 bool debug_pos = false;
00421 bool inall = true;
00422 uint rnum = 0;
00423 for(int k = 0; k < align_list.at(j).size();k++)
00424 {
00425 gnSeqI len = absolut((int64)align_list.at(j).at(k).second)-absolut((int64)align_list.at(j).at(k).first);
00426 for(int n = 0; n<len;n++)
00427 {
00428 if(align_list.at(j).at(k).first<0)
00429 {
00430
00431
00432
00433 if(aluCoverage.find(align_list.at(j).at(k).first-n)!= aluCoverage.end())
00434 {
00435
00436 for ( int i = 0; i < alus.size(); i++)
00437 {
00438
00439 if( (abs((int)align_list.at(j).at(k).first) >= alus.at(i)->start) && (abs((int)align_list.at(j).at(k).first) < alus.at(i)->end )
00440 || (abs((int)align_list.at(j).at(k).second) > alus.at(i)->start) && (abs((int)align_list.at(j).at(k).second) < alus.at(i)->end ) )
00441 {
00442
00443 if (rnum != i+1 && rnum != 0)
00444 inall = false;
00445 rnum = i+1;
00446 break;
00447 }
00448 }
00449
00450
00451 lpn[align_list.at(j).at(k).first-n] = true;
00452 hit = true;
00453 }
00454
00455 else
00456 {
00457 lpt[align_list.at(j).at(k).first-n] = true;
00458 rnum = -1;
00459 }
00460 mergedCoverage[align_list.at(j).at(k).first-n] = true;
00461 }
00462 else
00463 {
00464 if(aluCoverage.find(align_list.at(j).at(k).first+n)!= aluCoverage.end())
00465 {
00466
00467 for ( int i = 0; i < alus.size(); i++)
00468 {
00469
00470 if( (abs((int)align_list.at(j).at(k).first) >= alus.at(i)->start) && (abs((int)align_list.at(j).at(k).first) < alus.at(i)->end )
00471 || (abs((int)align_list.at(j).at(k).second) > alus.at(i)->start) && (abs((int)align_list.at(j).at(k).second) <= alus.at(i)->end ) )
00472 {
00473
00474
00475 if (rnum != i+1 && rnum != 0)
00476 inall = false;
00477 rnum = i+1;
00478 break;
00479 }
00480 }
00481
00482 lpn[align_list.at(j).at(k).first+n] = true;
00483 hit = true;
00484 }
00485
00486 else
00487 {
00488 lpt[align_list.at(j).at(k).first+n] = true;
00489 rnum = -1;
00490 }
00491 mergedCoverage[align_list.at(j).at(k).first+n] = true;
00492 }
00493 }
00494 if (rnum <= 0)
00495 inall = false;
00496
00497 if(hit)
00498 {
00499 matchhits+=1;
00500
00501 }
00502 }
00503
00504 if (inall || 1)
00505 {
00506 for(int k = 0; k < align_list.at(j).size();k++)
00507 {
00508 gnSeqI len = absolut((int64)align_list.at(j).at(k).second)-absolut((int64)align_list.at(j).at(k).first);
00509 uint rnum = 0;
00510
00511 if(align_list.at(j).at(k).first<0)
00512 {
00513
00514
00515
00516
00517
00518 for ( int i = 0; i < alus.size(); i++)
00519 {
00520
00521 if( (abs((int)align_list.at(j).at(k).first) >= alus.at(i)->start) && (abs((int)align_list.at(j).at(k).first) < alus.at(i)->end )
00522 || (abs((int)align_list.at(j).at(k).second) > alus.at(i)->start) && (abs((int)align_list.at(j).at(k).second) <= alus.at(i)->end ) )
00523 {
00524
00525 rnum = i+1;
00526
00527 int leftend = 0;
00528 int rightend = 0;
00529 leftend = abs((int)alus.at(i)->start)-abs((int)align_list.at(j).at(k).first);
00530 rightend = abs((int)alus.at(i)->end)-abs((int)align_list.at(j).at(k).second);
00531 if (debug_pos && (abs(leftend)>500 || abs(rightend)>500))
00532 {
00533 cout << "alu\talignment" << endl;
00534 cout << alus.at(i)->start << "\t" << align_list.at(j).at(k).first << endl;
00535 cout << alus.at(i)->end << "\t" << align_list.at(j).at(k).second << endl;
00536
00537 }
00538
00539 if ( worst_borders.find( rnum ) != worst_borders.end() )
00540 {
00541
00542 if ( abs((int)worst_borders[rnum].first) < abs((int)leftend) )
00543 worst_borders[rnum].first = leftend;
00544 if ( abs((int)worst_borders[rnum].second) < abs((int)rightend) )
00545 worst_borders[rnum].second = rightend;
00546 if ( abs((int)best_borders[rnum].first) > abs((int)leftend) )
00547 best_borders[rnum].first = leftend;
00548 if ( abs((int)best_borders[rnum].second) > abs((int)rightend) )
00549 best_borders[rnum].second = rightend;
00550 }
00551 else
00552 {
00553 worst_borders[rnum] = make_pair(leftend,rightend);
00554 best_borders[rnum] = make_pair(leftend,rightend);
00555 }
00556
00557 break;
00558 }
00559 }
00560 }
00561 else
00562 {
00563
00564 for ( int i = 0; i < alus.size(); i++)
00565 {
00566
00567
00568
00569
00570 if( (abs((int)align_list.at(j).at(k).first) >= alus.at(i)->start) && (abs((int)align_list.at(j).at(k).first) < alus.at(i)->end )
00571 || (abs((int)align_list.at(j).at(k).second) > alus.at(i)->start) && (abs((int)align_list.at(j).at(k).second) <= alus.at(i)->end ) )
00572 {
00573
00574 rnum = i+1;
00575
00576 int leftend = 0;
00577 int rightend = 0;
00578
00579 leftend = abs((int)alus.at(i)->start) -abs((int)align_list.at(j).at(k).first);
00580 rightend = abs((int)alus.at(i)->end)-abs((int)align_list.at(j).at(k).second);
00581
00582 if (debug_pos && (abs(leftend)>500 || abs(rightend)>500))
00583 {
00584 cout << "alu\talignment" << endl;
00585 cout << alus.at(i)->start << "\t" << align_list.at(j).at(k).first << endl;
00586 cout << alus.at(i)->end << "\t" << align_list.at(j).at(k).second << endl;
00587
00588 }
00589
00590 if ( worst_borders.find( rnum ) != worst_borders.end() )
00591 {
00592
00593 if ( abs((int)worst_borders[rnum].first) < abs((int)leftend) )
00594 worst_borders[rnum].first = leftend;
00595 if ( abs((int)worst_borders[rnum].second) < abs((int)rightend) )
00596 worst_borders[rnum].second = rightend;
00597
00598
00599 if ( abs((int)best_borders[rnum].first) > abs((int)leftend) )
00600 best_borders[rnum].first = leftend;
00601 if ( abs((int)best_borders[rnum].second) > abs((int)rightend) )
00602 best_borders[rnum].second = rightend;
00603
00604 }
00605 else
00606 {
00607 worst_borders[rnum] = make_pair(leftend,rightend);
00608 best_borders[rnum] = make_pair(leftend,rightend);
00609 }
00610
00611 break;
00612 }
00613 }
00614
00615 }
00616
00617 }
00618 }
00619
00620 }
00621 alufound = false;
00622 }
00623 gnSequence empty_seq;
00624
00625
00626
00627 ofstream boundary_file;
00628 alignment_fname.append(".boundary");
00629 boundary_file.open(alignment_fname.c_str());
00630 map< uint,pair<int,int> >::iterator iter;
00631 uint avg_worst_left = 0;
00632 uint avg_worst_right = 0;
00633 uint avg_best_left = 0;
00634 uint avg_best_right = 0;
00635 for( iter = worst_borders.begin(); iter != worst_borders.end(); iter++ )
00636 {
00637 avg_worst_left += abs(iter->second.first);
00638 avg_worst_right += abs(iter->second.second);
00639 boundary_file << "worst boundaries for repeat copy #" << iter->first << "\t left: " << iter->second.first << "\t right: " << iter->second.second << endl;
00640 }
00641 for( iter = best_borders.begin(); iter != best_borders.end(); iter++ )
00642 {
00643 avg_best_left += abs( iter->second.first);
00644 avg_best_right += abs(iter->second.second);
00645 boundary_file << "best boundaries for repeat copy #" << iter->first << "\t left: " << iter->second.first << "\t right: " << iter->second.second << endl;
00646 }
00647
00648 if (worst_borders.size() > 0 )
00649 {
00650 avg_worst_left /= worst_borders.size();
00651 avg_worst_right /= worst_borders.size();
00652 }
00653 else
00654 {
00655 avg_worst_left = -1;
00656 avg_worst_right = -1;
00657
00658 }
00659 if ( best_borders.size() > 0)
00660 {
00661 avg_best_left /= best_borders.size();
00662 avg_best_right /= best_borders.size();
00663 }
00664 else
00665 {
00666 avg_best_left = -1;
00667 avg_best_right = -1;
00668
00669 }
00670 boundary_file << "left best: \t" << avg_best_left << endl;
00671 boundary_file << "right best: \t" << avg_best_right << endl;
00672 boundary_file << "left worst: \t" << avg_worst_left << endl;
00673 boundary_file << "right worst: \t" << avg_worst_right << endl;
00674 boundary_file << "#" << endl;
00675 boundary_file.close();
00676
00677 lt = mergedCoverage.size();
00678
00679 ld = coverage.size();
00680 lp = lpt.size();
00681 ln = lpn.size();
00682 lo = lpo.size();
00683
00684
00685 gnSeqI hitlength =0;
00686 for(int i =0; i< hitlist.size(); i++)
00687 hitlength+= alus.at(i)->length();
00688
00689 cout << "\nprocrastAlignments processed: " << align_list.size() << endl;
00690 cout << "matches processed: " << matches << endl;
00691 cout << "Total ALUs found by repeatmasker: " << alus.size() << endl;
00692 cout << "Total ALUs hit by procrastAligner: " << hitlist.size() << endl;
00693 cout << "ALU hit percentage: " << (float)hitlist.size()/(float)alus.size() << endl;
00694
00695
00696 cout << "\nTotal length of all repeats found by procrastAligner: " << ld << endl;
00697 cout << "Total length of all regions found only in procrastAligner: " << lo << endl;
00698 cout << "Total length of all (partially) aligned repeats found by procrastAligner: lt = " << lt << endl;
00699 cout << "Total length of unaligned repeats(false positives?) found by procrastAligner: lp = " << lp << endl;
00700
00701 cout << "Total length of all repeats(ALU) found by repeatmasker: lr = " << lr << endl;
00702 cout << "Total length of repeats(ALU) found by repeatmasker hit by procrastAligner: lh =" << hitlength << endl;
00703 cout << "Total length of ALU repeats found by both methods: lc = " << lc << endl;
00704
00705
00706
00707
00708
00709 cout << "\nSensitivity-old: lc / lh = " << (double)(lc) / (double)(hitlength) << endl;
00710 cout << "Specificity-old: lc / lt = " << (double)(lc) / (double)(lt) << endl;
00711
00712
00713 cout << "\nSensitivity= " << (double)hitlist.size()/(double)alus.size() << endl;
00714 cout << "Specificity= " << (double)matchhits/(double)matchhitmult << endl;
00715
00716
00717
00718
00719
00720
00721 }catch( gnException& gne ){
00722 cerr << gne << endl;
00723 }catch( exception& e ){
00724 cerr << e.what() << endl;
00725 }
00726
00727 }
00728
00729