src/org/gel/mauve/assembly/ScoreAssembly.java

Go to the documentation of this file.
00001 package org.gel.mauve.assembly;
00002 
00003 import java.awt.Dimension;
00004 import java.io.File;
00005 import java.text.NumberFormat;
00006 import java.util.HashMap;
00007 import java.util.Iterator;
00008 import java.util.Map;
00009 
00010 import javax.swing.JOptionPane;
00011 import javax.swing.JTable;
00012 import javax.swing.JTextArea;
00013 import javax.swing.table.DefaultTableModel;
00014 
00015 import org.apache.commons.cli.CommandLine;
00016 import org.apache.commons.cli.HelpFormatter;
00017 import org.apache.commons.cli.Option;
00018 import org.apache.commons.cli.Options;
00019 import org.biojava.bio.symbol.SymbolList;
00020 import org.gel.mauve.BaseViewerModel;
00021 import org.gel.mauve.Chromosome;
00022 import org.gel.mauve.OptionsBuilder;
00023 import org.gel.mauve.XmfaViewerModel;
00024 import org.gel.mauve.analysis.BrokenCDS;
00025 import org.gel.mauve.analysis.Gap;
00026 import org.gel.mauve.analysis.SNP;
00027 import org.gel.mauve.contigs.ContigOrderer;
00028 import org.gel.mauve.gui.AlignFrame;
00029 import org.gel.mauve.gui.AlignWorker;
00030 import org.gel.mauve.gui.AnalysisDisplayWindow;
00031 import org.gel.mauve.gui.MauveFrame;
00032 
00033 public class ScoreAssembly  {
00034         
00035         private static final String[] DEFAULT_ARGS = {"--skip-refinement",
00036                                                                                                         "--weight=200"};
00037         private static String SUM_CMD = "Summary";
00038         private static String SUM_DESC = "Summary of scoring assembly";
00039         private static String SNP_CMD = "SNPs";
00040         private static String SNP_DESC = "SNPs between reference and assembly";
00041         private static String GAP_CMD = "Gaps";
00042         private static String GAP_DESC = "Gaps in reference and assembly";
00043         private static String CDS_CMD = "Broken CDS";
00044         private static String CDS_DESC = "Broken CDS in assembly genome";
00045         private static String INV_CMD = "Inverted Contigs";
00046         private static String INV_DESC = "Incorrectly inverted contigs";
00047         private static String MIS_CMD = "Mis-assembled Contigs";
00048         private static String MIS_DESC = "Mis-assembled contigs with the total number of mis-assemblies in each";
00049         
00050         private static HashMap<String,ScoreAssembly> modelMap = new HashMap<String, ScoreAssembly>();
00051         
00052         private static HashMap<String,ScoreAssembly> running = new HashMap<String, ScoreAssembly>();
00053 
00054         private static final String temp = "Running...";
00055         
00056         private static final String error = "Error computing DCJ distances! Please report bug to atritt@ucdavis.edu";
00057         
00058         private JTextArea sumTA;
00059         
00060         private XmfaViewerModel model;
00061         
00062         private int fWIDTH = 800;
00063 
00064         private int fHEIGHT = 510;
00065         
00066         private AssemblyScorer asmScore;
00067         
00068         private AnalysisDisplayWindow win;
00069                 
00070         private boolean finished;
00071         
00072         private boolean cancel;
00073         
00074         public ScoreAssembly(XmfaViewerModel model){
00075                 //init(model);
00076                 this.model = model;
00077                 this.finished = false;
00078                 initWithJTables(model);
00079         }
00080 
00081         private void initWithJTables(XmfaViewerModel model){
00082                 win = new AnalysisDisplayWindow("Score Assembly - "+model.getSrc().getName(), fWIDTH, fHEIGHT);
00083                 sumTA = win.addContentPanel(SUM_CMD, SUM_DESC, true);
00084                 sumTA.append(temp);
00085                 this.win.showWindow();
00086                 running.put(this.model.getSrc().getAbsolutePath(), this);
00087                 new Thread( new Runnable (){ 
00088                         public void run(){
00089                                 try {
00090                                 asmScore = new AssemblyScorer(ScoreAssembly.this.model);
00091                                 ScoreAssembly.this.finish();
00092                                 } catch (Exception e){
00093                                         ScoreAssembly.this.cancel();
00094                                         System.err.println("\nError\n");
00095                                         e.printStackTrace();
00096                                 }
00097                         }
00098                 }).start();
00099         }
00100         
00101         public void cancel(){
00102                 cancel = true;
00103         }
00104         
00105         private synchronized void finish(){
00106                 if (!cancel) {
00107                         running.remove(this.model.getSrc().getAbsolutePath());
00108                         modelMap.put(this.model.getSrc().getAbsolutePath(), this);
00109                         sumTA.replaceRange("", 0, temp.length());
00110                         sumTA.setText(getSumText(asmScore,true,false));
00111                         //addJTables();
00112                         addTables();
00113                         finished = true;
00114                         win.showWindow();
00115                         sumTA.setCaretPosition(0);
00116                 }
00117         }
00118         
00119         private void addTables(){
00120                 SNP[] snps = asmScore.getSNPs();
00121                 Object[] snpHeader = {"SNP_Pattern","Ref_Contig","Ref_PosInContig",
00122                                                         "Ref_PosGenomeWide","Assembly_Contig",
00123                                                         "Assembly_PosInContig","Assembly_PosGenomeWide"};
00124                 
00125                 DefaultTableModel snpData = win.addContentTable(SNP_CMD, SNP_DESC, false);
00126                 win.showWindow();
00127                 snpData.setColumnIdentifiers(snpHeader);
00128                 for (int snpI = 0; snpI < snps.length; snpI++){
00129                         snpData.addRow(snps[snpI].toString().split("\t"));
00130                 }
00131                 
00132                 Gap[] refGaps = asmScore.getReferenceGaps();
00133                 Gap[] assGaps = asmScore.getAssemblyGaps();
00134                 int nGen = model.numGenomes();
00135                 Object[] gapHeader = new Object[5+(nGen)*3];
00136                 gapHeader[0] = "Sequence";
00137                 gapHeader[1] = "Contig";
00138                 gapHeader[2] = "Position_in_Contig";
00139                 gapHeader[3] = "GenomeWide_Position";
00140                 gapHeader[4] = "Length";
00141                 int idx = 5;
00142                 for (int j = 0; j < nGen; j++){
00143                         gapHeader[idx++] = "sequence_"+j+"_pos"; 
00144                         gapHeader[idx++] = "sequence_"+j+"_ctg";
00145                         gapHeader[idx++] = "sequence_"+j+"_posInCtg";
00146                 }
00147                 DefaultTableModel gapData = win.addContentTable(GAP_CMD, GAP_DESC, false);
00148                 gapData.setColumnIdentifiers(gapHeader);
00149                 for (int gapI = 0; gapI < refGaps.length; gapI++)
00150                         gapData.addRow(refGaps[gapI].toString().split("\t"));
00151                 for (int gapI = 0; gapI < assGaps.length; gapI++)
00152                         gapData.addRow(assGaps[gapI].toString().split("\t"));
00153                 if (asmScore.hasBrokenCDS()){
00154                         Object[] cdsHeader = 
00155                                 {"CDS_ID","Peptide_Length",
00156                                         "Perc_IncorrectBases", "Broken_Frame_Segments",
00157                                         "Gap_Segments","Substituted_Positions","Substitutions",
00158                                         "Stop_Codon_Positions","Original_Residue"};
00159                         DefaultTableModel cdsData = win.addContentTable(CDS_CMD, CDS_DESC, false);
00160                         cdsData.setColumnIdentifiers(cdsHeader);
00161                         BrokenCDS[] cds = asmScore.getBrokenCDS();
00162                         for (BrokenCDS bcds: cds)
00163                                 cdsData.addRow(bcds.toString().split("\t"));
00164                 }
00165         }
00166         
00167         public static String getSumText(AssemblyScorer assScore, boolean header, boolean singleLine){
00168                 NumberFormat nf = NumberFormat.getInstance();
00169                 nf.setMaximumFractionDigits(4);
00170                 StringBuilder sb = new StringBuilder();
00171                 if (singleLine){
00172                         if (header) {
00173                                 sb.append("Name\tNumContigs\tNumRefReplicons\tNumAssemblyBases\tNumReferenceBases\tNumLCBs\t" +
00174                                                 "DCJ_Distance\tNumDCJBlocks\tNumSNPs\tNumMisCalled\tNumUnCalled\tNumGapsRef\tNumGapsAssembly\t" +
00175                                                 "TotalBasesMissed\tPercBasesMissed\tExtraBases\tPercExtraBases" + 
00176                                                 "\tMissingChromosomes\tExtraContigs\tNumSharedBoundaries\tNumInterLcbBoundaries"+
00177                                                 "\tBrokenCDS\tIntactCDS\tContigN50\tContigN90\tMinContigLength\tMaxContigLength"+
00178                                                 "\tAA\tAC\tAG\tAT\tCA\tCC\tCG\tCT\tGA\tGC\tGG\tGT\tTA\tTC\tTG\tTT\n");
00179                         }
00180                         sb.append(      assScore.getModel().getGenomeBySourceIndex(1).getDisplayName() + "\t" +
00181                                                 assScore.numContigs()+"\t"+assScore.numReplicons()+"\t"+
00182                                                 assScore.numBasesAssembly()+"\t"+assScore.numBasesReference()+"\t"+assScore.numLCBs()+"\t"+
00183                                                 assScore.getDCJdist()+"\t"+assScore.numBlocks()+"\t"+assScore.getSNPs().length+"\t"+
00184                                                 assScore.getMiscalled()+'\t'+assScore.getUncalled()+'\t'+
00185                                                 assScore.getReferenceGaps().length+"\t"+assScore.getAssemblyGaps().length+"\t"+
00186                                                 assScore.totalMissedBases()+"\t"+nf.format(assScore.percentMissedBases()*100)+"\t"+
00187                                                 assScore.totalExtraBases()+"\t"+nf.format(assScore.percentExtraBases()*100) +"\t"+
00188                                                 assScore.getMissingChromosomes().length+"\t"+assScore.getExtraContigs().length +"\t"+ 
00189                                                 assScore.getSharedBoundaryCount()+"\t"+assScore.getInterLcbBoundaryCount());
00190                         sb.append( "\t" + assScore.numBrokenCDS()+"\t"+assScore.numCompleteCDS());
00191                         sb.append( "\t" + assScore.getContigN50() + "\t" + assScore.getContigN90() + "\t" + assScore.getMinContigLength() + "\t" + assScore.getMaxContigLength());
00192                         int[][] subs = assScore.getSubs();
00193                         for(int i=0; i<subs.length; i++)
00194                         {
00195                                 for(int j=0; j<subs[i].length; j++)
00196                                 {
00197                                         sb.append('\t');
00198                                         sb.append(subs[i][j]);
00199                                 }
00200                         }
00201                         sb.append("\n");
00202                 } else {
00203                         if (header) {
00204                                 sb.append(
00205                                         "Number of Contigs:\t"+assScore.numContigs()+"\n"+
00206                                         "Number reference replicons:\t" + assScore.numReplicons()+"\n"+
00207                                         "Number of assembly bases:\t" + assScore.numBasesAssembly()+"\n"+
00208                                         "Number of reference bases:\t" + assScore.numBasesReference()+"\n"+
00209                                         "Number of LCBs:\t" + assScore.numLCBs()+"\n"+
00210                                         "Number of Blocks:\t"+assScore.numBlocks()+"\n"+
00211                                         "Breakpoint Distance:\t"+assScore.getBPdist()+"\n"+
00212                                         "DCJ Distance:\t"+assScore.getDCJdist()+"\n"+
00213                                         "SCJ Distance:\t"+assScore.getSCJdist()+"\n"+
00214                                         "Number of Complete Coding Sequences:\t"+assScore.numCompleteCDS()+"\n"+
00215                                         "Number of Broken Coding Sequences:\t"+assScore.numBrokenCDS()+"\n"+                                    
00216                                         "Number of SNPs:\t"+assScore.getSNPs().length+"\n"+
00217                                         "Number of Gaps in Reference:\t"+assScore.getReferenceGaps().length+"\n"+
00218                                         "Number of Gaps in Assembly:\t"+assScore.getAssemblyGaps().length+"\n" +
00219                                         "Total bases missed in reference:\t" + assScore.totalMissedBases() +"\n"+
00220                                         "Percent bases missed:\t" + nf.format(assScore.percentMissedBases()*100)+" %\n"+
00221                                         "Total bases extra in assembly:\t" + assScore.totalExtraBases()+"\n" +
00222                                         "Percent bases extra:\t" + nf.format(assScore.percentExtraBases()*100)+ " %\n"+
00223                                         "Number of missing chromosomes:\t" + assScore.getMissingChromosomes().length +"\n"+
00224                                         "Number of extra contigs:\t"+assScore.getExtraContigs().length +"\n"+
00225                                         "Number of Shared Boundaries:\t"+assScore.getSharedBoundaryCount()+"\n"+
00226                                         "Number of Inter-LCB Boundaries:\t"+assScore.getInterLcbBoundaryCount()+"\n"+
00227                                         "Contig N50:\t"+assScore.getContigN50()+"\n"+
00228                                         "Contig N90:\t"+assScore.getContigN90()+"\n"+
00229                                         "Min contig length:\t"+assScore.getMinContigLength()+"\n"+
00230                                         "Max contig length:\t"+assScore.getMaxContigLength()+"\n"+
00231                                         "Substitutions (Ref on Y, Assembly on X):\n"+AssemblyScorer.subsToString(assScore)
00232                                 );
00233                                 
00234                                 
00235                         } else { 
00236                                 sb.append(
00237                                                 assScore.numContigs()+"\n"+
00238                                                 assScore.numLCBs()+"\n"+
00239                                                 assScore.getDCJdist()+"\n"+
00240                                                  assScore.numBlocks()+"\n"+
00241                                                  assScore.getSNPs().length+"\n"+
00242                                                  assScore.getReferenceGaps().length+"\n"+
00243                                                  assScore.getAssemblyGaps().length+"\n" +
00244                                                  assScore.totalMissedBases() +"\n"+
00245                                                  nf.format(assScore.percentMissedBases()*100)+"\n"+
00246                                                  assScore.totalExtraBases()+"\n" +
00247                                                  nf.format(assScore.percentExtraBases()*100)+ "\n"+
00248                                                  assScore.getMissingChromosomes() +"\n"+
00249                                                 assScore.getExtraContigs()+"\n"+
00250                                                 "Number of Shared Boundaries:\t"+assScore.getSharedBoundaryCount()+"\n"+
00251                                                 "Number of Inter-LCB Boundaries:\t"+assScore.getInterLcbBoundaryCount()+"\n"+
00252                                                  AssemblyScorer.subsToString(assScore));
00253                         }
00254                 }
00255                 return sb.toString();
00256         }
00257 
00258         public static void launchWindow(MauveFrame frame){
00259                 BaseViewerModel model = frame.getModel();
00260                 String key = model.getSrc().getAbsolutePath();
00261                 if (modelMap.containsKey(key)){
00262                 //      modelMap.get(key).win.showWindow();
00263                         startNewSA(frame);
00264                 } else if (model instanceof XmfaViewerModel) {
00265                         startNewSA(frame);                      
00266                 } else {
00267                         System.err.println("Can't score assembly unless I have an" +
00268                                                         " XmfaViewerModel -- Please report this bug!");
00269                 }       
00270         }
00271         
00272         private static void startNewSA(MauveFrame frame){
00273                 interactive(frame);
00274         }
00275 
00276         public static void interactive(MauveFrame frame){
00277                 XmfaViewerModel model = (XmfaViewerModel) frame.getModel();
00278                 String key = model.getSrc().getAbsolutePath();
00279                 if (running.containsKey(key)) {
00280                         int ret_val = JOptionPane.showConfirmDialog(frame, "I'm currently"+
00281                                         "running the scoring pipeline for this assembly. Do you " +
00282                                         "want me to restart it?");
00283                         if (ret_val == JOptionPane.YES_OPTION){
00284                                 running.get(key).cancel();
00285                                 running.remove(key).win.closeWindow();
00286                                 startNewSA(frame);
00287                         } else if (ret_val == JOptionPane.NO_OPTION){
00288                                 running.get(key).win.showWindow();
00289                         }
00290                 } else {
00291                                 running.put(key, 
00292                                                 new ScoreAssembly(model));
00293                 }
00294         }
00295         
00296         public static void main(String[] args){
00297                 CommandLine line = OptionsBuilder.getCmdLine(getOptions(), args);
00298                 if (args.length == 0 || line == null || line.hasOption("help")){
00299                         HelpFormatter formatter = new HelpFormatter();
00300                         formatter.printHelp(80,
00301                                         "java -cp Mauve.jar org.gel.mauve.assembly.ScoreAssembly [options]",
00302                                         "[options]", getOptions(), "report bugs to Aaron Darling <aarondarling@ucdavis.edu>");
00303                         System.exit(-1);
00304                 }
00305         
00306                 File outDir =  null;
00307                 if (line.hasOption("outputDir"))
00308                         outDir = new File(line.getOptionValue("outputDir"));
00309                 else 
00310                         outDir = new File(System.getProperty("user.dir"));
00311                 
00312                 boolean batch = false;
00313                 if (line.hasOption("batch"))
00314                         batch = true;
00315                 String basename = null;
00316                 if (line.hasOption("basename"))
00317                         basename = line.getOptionValue("basename");
00318                 
00319                 File alnmtFile = null;
00320                 AssemblyScorer as = null;
00321                 
00322                 if (line.hasOption("alignment")){ // if we don't need to align
00323                         if ((line.hasOption("reference") || line.hasOption("assembly"))){
00324                                 System.err.println("You gave me an alignment along with a reference and/or assembly genome.\n" +
00325                                 "Do not use use the \"-reference\" and \"-assembly\" flags with the \"-alignment\" flag.");
00326                                 System.exit(-1);
00327                         }
00328                         alnmtFile = new File (line.getOptionValue("alignment"));
00329                         if (basename == null) {
00330                                 basename = alnmtFile.getName();
00331                                 // AED: can't truncate to last . because the file may not have a dot in the name
00332                         }
00333                         as = getAS(alnmtFile);
00334                         AssemblyScorer.printInfo(as,outDir,basename,batch);
00335                 } else { // we need to do some sort of alignment
00336                         
00337                         if (!line.hasOption("reference")){
00338                                 System.err.println("Reference file not given.");
00339                                 System.exit(-1);
00340                         } else if (!line.hasOption("assembly")){
00341                                 System.err.println("Assembly file not given.");
00342                                 System.exit(-1);
00343                         }
00344                         
00345                         File refFile = new File(line.getOptionValue("reference"));
00346                         File assPath = new File(line.getOptionValue("assembly"));
00347                         
00348                         if (line.hasOption("reorder")){ // we need to reorder first
00349                                 String[] reorderParams = new String[8];
00350                                 reorderParams[0] = "-output";
00351                                 reorderParams[1] = outDir.getAbsolutePath();
00352                                 reorderParams[2] = "-ref";
00353                                 reorderParams[3] = refFile.getAbsolutePath();
00354                                 reorderParams[4] = "-draft";
00355                                 reorderParams[5] = assPath.getAbsolutePath();
00356                                 // Add the following two options when doing a reorder for alignment scoring
00357                                 // These tune the aligner toward high sequence identity, which we
00358                                 // expect when scoring an assembly against a reference
00359                                 reorderParams[6] = "--seed-weight=25";
00360                                 reorderParams[7] = "--solid-seeds";
00361                                 ContigOrderer co = new ContigOrderer(reorderParams, null, false);
00362                                 if (basename != null)
00363                                         as = new AssemblyScorer(co, outDir, basename);
00364                                 else 
00365                                         as = new AssemblyScorer(co, outDir);
00366                                 co.addAlignmentProcessListener(as);
00367                         } else {
00368                                 if (basename == null) {
00369                                         basename = assPath.getName();
00370                                 }
00371                                 alnmtFile = new File(outDir, basename+".xmfa");
00372                                 as = new AssemblyScorer(alnmtFile, outDir, basename);
00373                                 String[] cmd = makeAlnCmd(refFile,assPath, alnmtFile);
00374                                 System.out.println("Executing");
00375                                 AlignFrame.printCommand(cmd, System.out);
00376                                 AlignWorker worker = new AlignWorker(as, cmd);
00377                                 worker.start();
00378                         }
00379                         
00380                 }
00381         }
00382 
00383         private static AssemblyScorer getAS(File alnmtFile){
00384                 try {
00385                         //sa = new ScoreAssembly(args,true);
00386                         XmfaViewerModel model = new XmfaViewerModel(alnmtFile,null);
00387                         return new AssemblyScorer(model);
00388                 } catch (Exception e){
00389                         e.printStackTrace();
00390                         System.exit(-1);
00391                 }
00392                 return null;
00393         }
00394 
00395         private static String[] makeAlnCmd(File seq1, File seq2, File output){
00396                 String[] ret = new String[6 + DEFAULT_ARGS.length];
00397                 int j = 0;
00398                 ret[j++] = AlignFrame.getBinaryPath("progressiveMauve");
00399                 for (int i = 0; i < DEFAULT_ARGS.length; i++)
00400                         ret[j++] = DEFAULT_ARGS[i];
00401                 ret[j++] = "--output="+output.getAbsolutePath();
00402                 ret[j++] = "--backbone-output=" + output.getAbsolutePath()+".backbone";
00403                 ret[j++] = "--output-guide-tree=" + output.getAbsolutePath()+".guide_tree";
00404                 ret[j++] = seq1.getAbsolutePath();
00405                 ret[j++] = seq2.getAbsolutePath();
00406                 return ret;
00407         }
00408 
00409         @SuppressWarnings("static-access")
00410         private static Options getOptions(){
00411                 Options ret = new Options();
00412                 OptionsBuilder ob = new OptionsBuilder();
00413                 ob.addBoolean("help", "print this message");
00414                 ob.addBoolean("batch", "run in batch mode i.e. print summary output " +
00415                                                                                         "on one line to standard output");
00416                 ob.addBoolean("reorder", "reorder contigs before scoring the assembly");
00417                 ob.addArgument("string", "basename for output files", "basename",false);
00418                 ob.addArgument("directory", "save output in <directory>. Default " +
00419                                                                         "is current directory.", "outputDir",true);
00420                 ob.addArgument("file", "file containing alignment of assembly to " +
00421                                                                                         "reference genome", "alignment",false);
00422                 ob.addArgument("file", "file containing reference genome", "reference",false);
00423                 ob.addArgument("file", "file containing assembly/draft genome to score",
00424                                                                                                                                         "assembly",false);
00425                 return ob.getOptions();
00426         }
00427 }

Generated on Mon Aug 19 06:03:40 2013 for Mauve by doxygen 1.3.6